File Coverage

Bio/SeqIO/embl.pm
Criterion Covered Total %
statement 544 651 83.5
branch 267 434 61.5
condition 90 128 70.3
subroutine 30 33 90.9
pod 2 2 100.0
total 933 1248 74.7


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::EMBL
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::embl - EMBL sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler system. Go:
22              
23             $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24              
25             while ( (my $seq = $stream->next_seq()) ) {
26             # do something with $seq
27             }
28              
29             =head1 DESCRIPTION
30              
31             This object can transform Bio::Seq objects to and from EMBL flat
32             file databases.
33              
34             There is a lot of flexibility here about how to dump things which
35             should be documented more fully.
36              
37             There should be a common object that this and Genbank share (probably
38             with Swissprot). Too much of the magic is identical.
39              
40             =head2 Optional functions
41              
42             =over 3
43              
44             =item _show_dna()
45              
46             (output only) shows the dna or not
47              
48             =item _post_sort()
49              
50             (output only) provides a sorting func which is applied to the FTHelpers
51             before printing
52              
53             =item _id_generation_func()
54              
55             This is function which is called as
56              
57             print "ID ", $func($annseq), "\n";
58              
59             To generate the ID line. If it is not there, it generates a sensible ID
60             line using a number of tools.
61              
62             If you want to output annotations in EMBL format they need to be
63             stored in a Bio::Annotation::Collection object which is accessible
64             through the Bio::SeqI interface method L.
65              
66             The following are the names of the keys which are polled from a
67             L object.
68              
69             reference - Should contain Bio::Annotation::Reference objects
70             comment - Should contain Bio::Annotation::Comment objects
71             dblink - Should contain Bio::Annotation::DBLink objects
72              
73             =back
74              
75             =head1 FEEDBACK
76              
77             =head2 Mailing Lists
78              
79             User feedback is an integral part of the evolution of this and other
80             Bioperl modules. Send your comments and suggestions preferably to one
81             of the Bioperl mailing lists. Your participation is much appreciated.
82              
83             bioperl-l@bioperl.org - General discussion
84             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85              
86             =head2 Support
87              
88             Please direct usage questions or support issues to the mailing list:
89              
90             I
91              
92             rather than to the module maintainer directly. Many experienced and
93             reponsive experts will be able look at the problem and quickly
94             address it. Please include a thorough description of the problem
95             with code and data examples if at all possible.
96              
97             =head2 Reporting Bugs
98              
99             Report bugs to the Bioperl bug tracking system to help us keep track
100             the bugs and their resolution. Bug reports can be submitted via
101             the web:
102              
103             https://github.com/bioperl/bioperl-live/issues
104              
105             =head1 AUTHOR - Ewan Birney
106              
107             Email birney@ebi.ac.uk
108              
109             =head1 APPENDIX
110              
111             The rest of the documentation details each of the object
112             methods. Internal methods are usually preceded with a _
113              
114             =cut
115              
116              
117             # Let the code begin...
118              
119              
120             package Bio::SeqIO::embl;
121 7     7   599 use vars qw(%FTQUAL_NO_QUOTE);
  7         13  
  7         402  
122 7     7   44 use strict;
  7         16  
  7         162  
123 7     7   1383 use Bio::SeqIO::FTHelper;
  7         81  
  7         204  
124 7     7   44 use Bio::SeqFeature::Generic;
  7         11  
  7         145  
125 7     7   1212 use Bio::Species;
  7         18  
  7         190  
126 7     7   46 use Bio::Seq::SeqFactory;
  7         13  
  7         169  
127 7     7   34 use Bio::Annotation::Collection;
  7         13  
  7         168  
128 7     7   1283 use Bio::Annotation::Comment;
  7         17  
  7         189  
129 7     7   1410 use Bio::Annotation::Reference;
  7         20  
  7         219  
130 7     7   46 use Bio::Annotation::DBLink;
  7         19  
  7         189  
131              
132 7     7   41 use base qw(Bio::SeqIO);
  7         16  
  7         41516  
133              
134             # Note that a qualifier that exceeds one line (i.e. a long label) will
135             # automatically be quoted regardless:
136             %FTQUAL_NO_QUOTE=(
137             'anticodon'=>1,
138             'citation'=>1,
139             'codon'=>1,
140             'codon_start'=>1,
141             'cons_splice'=>1,
142             'direction'=>1,
143             'evidence'=>1,
144             'label'=>1,
145             'mod_base'=> 1,
146             'number'=> 1,
147             'rpt_type'=> 1,
148             'rpt_unit'=> 1,
149             'transl_except'=> 1,
150             'transl_table'=> 1,
151             'usedin'=> 1,
152             );
153              
154             sub _initialize {
155 31     31   99 my($self,@args) = @_;
156              
157 31         160 $self->SUPER::_initialize(@args);
158             # hash for functions for decoding keys.
159 31         115 $self->{'_func_ftunit_hash'} = {};
160             # sets this to one by default. People can change it
161 31         143 $self->_show_dna(1);
162 31 50       160 if ( ! defined $self->sequence_factory ) {
163 31         122 $self->sequence_factory(Bio::Seq::SeqFactory->new
164             (-verbose => $self->verbose(),
165             -type => 'Bio::Seq::RichSeq'));
166             }
167             }
168              
169             =head2 next_seq
170              
171             Title : next_seq
172             Usage : $seq = $stream->next_seq()
173             Function: returns the next sequence in the stream
174             Returns : Bio::Seq object
175             Args :
176              
177             =cut
178              
179             sub next_seq {
180 25     25 1 132 my ($self,@args) = @_;
181 25         70 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182             $date, $comment, @date_arr);
183              
184 25         203 my ($annotation, %params, @features) =
185             Bio::Annotation::Collection->new();
186              
187 25         124 $line = $self->_readline;
188             # This needs to be before the first eof() test
189              
190 25 100       84 if ( !defined $line ) {
191 1         5 return; # no throws - end of file
192             }
193              
194 24 100       161 if ( $line =~ /^\s+$/ ) {
195 1         8 while ( defined ($line = $self->_readline) ) {
196 1 50       7 $line =~/^\S/ && last;
197             }
198             # return without error if the whole next sequence was just a single
199             # blank line and then eof
200 1 50       4 return unless $line;
201             }
202              
203             # no ID as 1st non-blank line, need short circuit and exit routine
204 24 50       126 $self->throw("EMBL stream with no ID. Not embl in my book")
205             unless $line =~ /^ID\s+\S+/;
206              
207             # At this point we are sure that $line contains an ID header line
208 24         52 my $alphabet;
209 24 100       90 if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210              
211             # New style header (EMBL Release >= 87, after June 2006)
212 10         32 my $topology;
213             my $sv;
214              
215             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216             # This regexp comes from the new2old.pl conversion script, from EBI
217 10 100       90 if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218 7         74 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219             }
220 10 100       37 if (defined $sv) {
221 7         27 $params{'-seq_version'} = $sv;
222 7         19 $params{'-version'} = $sv;
223             }
224              
225 10 50 66     46 if (defined $topology && $topology eq 'circular') {
226 0         0 $params{'-is_circular'} = 1;
227             }
228            
229 10 100       35 if (defined $mol ) {
230 7 100       32 if ($mol =~ /DNA/) {
    50          
    0          
231 6         17 $alphabet = 'dna';
232             } elsif ($mol =~ /RNA/) {
233 1         3 $alphabet = 'rna';
234             } elsif ($mol =~ /AA/) {
235 0         0 $alphabet = 'protein';
236             }
237             }
238             } else {
239            
240             # Old style header (EMBL Release < 87, before June 2006)
241 14 100       115 if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
242 12         63 ($name, $mol, $div) = ($1, $2, $3);
243             }
244            
245 14 100       34 if ($mol) {
246 12 50       38 if ( $mol =~ /circular/ ) {
247 0         0 $params{'-is_circular'} = 1;
248 0         0 $mol =~ s|circular ||;
249             }
250 12 50       35 if (defined $mol ) {
251 12 100       49 if ($mol =~ /DNA/) {
    50          
    0          
252 10         25 $alphabet='dna';
253             } elsif ($mol =~ /RNA/) {
254 2         4 $alphabet='rna';
255             } elsif ($mol =~ /AA/) {
256 0         0 $alphabet='protein';
257             }
258             }
259             }
260             }
261              
262 24 100 66     150 unless( defined $name && length($name) ) {
263 5         12 $name = "unknown_id";
264             }
265              
266             # $self->warn("not parsing upper annotation in EMBL file yet!");
267 24         43 my $buffer = $line;
268 24         39 local $_;
269 24         37 BEFORE_FEATURE_TABLE :
270             my $ncbi_taxid;
271 24         88 until ( !defined $buffer ) {
272 499         741 $_ = $buffer;
273             # Exit at start of Feature table
274 499 100       1476 if ( /^(F[HT]|SQ)/ ) {
275 24 100 100     196 $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
276 24         75 last;
277             }
278             # Description line(s)
279 475 100       993 if (/^DE\s+(\S.*\S)/) {
280 22 100       110 $desc .= $desc ? " $1" : $1;
281             }
282              
283             #accession number
284 475 100 100     1538 if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
285 24         164 my @accs = split(/[; ]+/, $1); # allow space in addition
286             $params{'-accession_number'} = shift @accs
287 24 100       107 unless defined $params{'-accession_number'};
288 24         45 push @{$params{'-secondary_accessions'}}, @accs;
  24         86  
289             }
290              
291             #version number
292 475 100       851 if ( /^SV\s+\S+\.(\d+);?/ ) {
293 6         17 my $sv = $1;
294             #$sv =~ s/\;//;
295 6         16 $params{'-seq_version'} = $sv;
296 6         14 $params{'-version'} = $sv;
297             }
298              
299             #date (NOTE: takes last date line)
300 475 100       816 if ( /^DT\s+(.+)$/ ) {
301 26         74 my $line = $1;
302 26         97 my ($date, $version) = split(' ', $line, 2);
303 26         51 $date =~ tr/,//d; # remove comma if new version
304 26 100       62 if ($version) {
305 24 100       137 if ($version =~ /\(Rel\. (\d+), Created\)/ms ) {
    100          
306 11         99 my $release = Bio::Annotation::SimpleValue->new(
307             -tagname => 'creation_release',
308             -value => $1
309             );
310 11         74 $annotation->add_Annotation($release);
311             } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/ms ) {
312 11         55 my $release = Bio::Annotation::SimpleValue->new(
313             -tagname => 'update_release',
314             -value => $1
315             );
316 11         47 $annotation->add_Annotation($release);
317              
318 11         43 my $update = Bio::Annotation::SimpleValue->new(
319             -tagname => 'update_version',
320             -value => $2
321             );
322 11         30 $annotation->add_Annotation($update);
323             }
324             }
325 26         40 push @{$params{'-dates'}}, $date;
  26         85  
326             }
327              
328             #keywords
329 475 100       1953 if ( /^KW (.*)\S*$/ ) {
    100          
    100          
    100          
    100          
    100          
330 29         161 my @kw = split(/\s*\;\s*/,$1);
331 29         55 push @{$params{'-keywords'}}, @kw;
  29         114  
332             }
333              
334             # Organism name and phylogenetic information
335             elsif (/^O[SC]/) {
336             # pass the accession number so we can give an informative throw message if necessary
337 21         124 my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338 21         78 $params{'-species'}= $species;
339             }
340              
341             # NCBI TaxID Xref
342             elsif (/^OX/) {
343 3 50       34 if (/NCBI_TaxID=(\d+)/) {
344 3         19 $ncbi_taxid=$1;
345             }
346              
347 3         17 my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348 3         10 foreach my $dblink ( @links ) {
349 1         7 $annotation->add_Annotation('dblink',$dblink);
350             }
351             }
352              
353             # References
354             elsif (/^R/) {
355 99         311 my @refs = $self->_read_EMBL_References(\$buffer);
356 99         168 foreach my $ref ( @refs ) {
357 99         327 $annotation->add_Annotation('reference',$ref);
358             }
359             }
360              
361             # DB Xrefs
362             elsif (/^DR/) {
363 5         27 my @links = $self->_read_EMBL_DBLink(\$buffer);
364 5         13 foreach my $dblink ( @links ) {
365 25         51 $annotation->add_Annotation('dblink',$dblink);
366             }
367             }
368              
369             # Comments
370             elsif (/^CC\s+(.*)/) {
371 15         63 $comment .= $1;
372 15         35 $comment .= " ";
373 15         47 while (defined ($_ = $self->_readline) ) {
374 257 100       724 if (/^CC\s+(.*)/) {
375 242         470 $comment .= $1;
376 242         397 $comment .= " ";
377             } else {
378 15         34 last;
379             }
380             }
381 15         146 my $commobj = Bio::Annotation::Comment->new();
382 15         71 $commobj->text($comment);
383 15         60 $annotation->add_Annotation('comment',$commobj);
384 15         30 $comment = "";
385             }
386              
387             # Get next line.
388 475         1036 $buffer = $self->_readline;
389             }
390              
391 24         69 while ( defined ($_ = $self->_readline) ) {
392 46 100       168 /^FT\s{3}\w/ && last;
393 24 100       63 /^SQ / && last;
394 22 50       68 /^CO / && last;
395             }
396 24         49 $buffer = $_;
397              
398 24 100 66     186 if (defined($buffer) && $buffer =~ /^FT /) {
399 22         77 until ( !defined ($buffer) ) {
400 256         757 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401              
402             # process ftunit
403 256         750 my $feat =
404             $ftunit->_generic_seqfeature($self->location_factory(), $name);
405              
406             # add taxon_id from source if available
407             # Notice, this will override what is found in the OX line.
408             # this is by design as this seems to be the official way
409             # of specifying a TaxID
410 256 100 100     793 if ($params{'-species'} && ($feat->primary_tag eq 'source')
      100        
      66        
411             && $feat->has_tag('db_xref')
412             && (! $params{'-species'}->ncbi_taxid())) {
413 12         44 foreach my $tagval ($feat->get_tag_values('db_xref')) {
414 12 50       54 if (index($tagval,"taxon:") == 0) {
415 12         60 $params{'-species'}->ncbi_taxid(substr($tagval,6));
416 12         38 last;
417             }
418             }
419             }
420              
421             # add feature to list of features
422 256         437 push(@features, $feat);
423              
424 256 100       1460 if ( $buffer !~ /^FT/ ) {
425 22         88 last;
426             }
427             }
428             }
429             # Set taxid found in OX line
430 24 100 100     175 if ($params{'-species'} && defined $ncbi_taxid
      100        
431             && (! $params{'-species'}->ncbi_taxid())) {
432 2         9 $params{'-species'}->ncbi_taxid($ncbi_taxid);
433             }
434              
435             # skip comments
436 24   66     200 while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437 20         72 $buffer = $self->_readline();
438             }
439            
440 24 100       78 if ( $buffer =~ /^CO/ ) {
441             # bug#2982
442             # special : create contig as annotation
443 4         16 while ( defined ($buffer) ) {
444 4         18 $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445 4 50 66     46 if ( !$buffer || $buffer !~ /^CO/ ) {
446 4         9 last;
447             }
448             }
449 4   100     14 $buffer ||= '';
450             }
451 24 100       94 if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452 20 100       127 if ( $buffer !~ /^SQ/ ) {
453 1         9 while ( defined ($_ = $self->_readline) ) {
454 0 0       0 /^SQ/ && last;
455             }
456             }
457 20         39 $seqc = "";
458 20         61 while ( defined ($_ = $self->_readline) ) {
459 2924 100       5750 m{^//} && last;
460 2906         4154 $_ = uc($_);
461 2906         29191 s/[^A-Za-z]//g;
462 2906         7603 $seqc .= $_;
463             }
464             }
465 24         143 my $seq = $self->sequence_factory->create
466             (-verbose => $self->verbose(),
467             -division => $div,
468             -seq => $seqc,
469             -desc => $desc,
470             -display_id => $name,
471             -annotation => $annotation,
472             -molecule => $mol,
473             -alphabet => $alphabet,
474             -features => \@features,
475             %params);
476 24         385 return $seq;
477             }
478              
479              
480              
481             =head2 _write_ID_line
482              
483             Title : _write_ID_line
484             Usage : $self->_write_ID_line($seq);
485             Function: Writes the EMBL Release 87 format ID line to the stream, unless
486             : there is a user-supplied ID line generation function in which
487             : case that is used instead.
488             : ( See Bio::SeqIO::embl::_id_generation_function(). )
489             Returns : nothing
490             Args : Bio::Seq object
491              
492             =cut
493              
494             sub _write_ID_line {
495              
496 11     11   30 my ($self, $seq) = @_;
497              
498 11         25 my $id_line;
499             # If there is a user-supplied ID generation function, use it.
500 11 50       47 if ( $self->_id_generation_func ) {
501 0         0 $id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
  0         0  
502             }
503             # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504             else {
505              
506             # The sequence name is supposed to be the primary accession number,
507 11         51 my $name = $seq->accession_number();
508 11 100 66     90 if ( not(defined $name) || $name eq 'unknown') {
509             # but if it is not present, use the sequence ID or the empty string
510 5   100     25 $name = $seq->id() || '';
511             }
512            
513 11 50       55 $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514              
515             # Use the sequence version, or default to 1.
516 11   50     53 my $version = $seq->version() || 1;
517              
518 11         52 my $len = $seq->length();
519              
520             # Taxonomic division.
521 11         18 my $div;
522 11 100 100     214 if ( $seq->can('division') && defined($seq->division) &&
      66        
523             $self->_is_valid_division($seq->division) ) {
524 2         7 $div = $seq->division();
525             } else {
526 9   50     66 $div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527             }
528              
529 11         24 my $mol;
530             # If the molecule type is a valid EMBL type, use it.
531 11 50 100     143 if ( $seq->can('molecule')
    100 66        
      100        
532             && defined($seq->molecule)
533             && $self->_is_valid_molecule_type($seq->molecule)
534             ) {
535 0         0 $mol = $seq->molecule();
536             }
537             # Otherwise, choose unassigned DNA or RNA based on the alphabet.
538             elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539 6         16 my $alphabet =$seq->primary_seq->alphabet;
540 6 50       19 if ($alphabet eq 'dna') {
    0          
    0          
541 6         14 $mol ='unassigned DNA';
542             } elsif ($alphabet eq 'rna') {
543 0         0 $mol='unassigned RNA';
544             } elsif ($alphabet eq 'protein') {
545 0         0 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
546 0         0 $mol='AA'; # AA is not a valid EMBL molecule type.
547             }
548             }
549              
550 11         30 my $topology = 'linear';
551 11 50       51 if ($seq->is_circular) {
552 0         0 $topology = 'circular';
553             }
554              
555 11   100     52 $mol ||= ''; # 'unassigned'; ?
556 11         76 $id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557             }
558 11         292 $self->_print($id_line);
559             }
560              
561             =head2 _is_valid_division
562              
563             Title : _is_valid_division
564             Usage : $self->_is_valid_division($div)
565             Function: tests division code for validity
566             Returns : true if $div is a valid EMBL release 87 taxonomic division.
567             Args : taxonomic division code string
568              
569             =cut
570              
571             sub _is_valid_division {
572 2     2   11 my ($self, $division) = @_;
573              
574 2         35 my %EMBL_divisions = (
575             "PHG" => 1, # Bacteriophage
576             "ENV" => 1, # Environmental Sample
577             "FUN" => 1, # Fungal
578             "HUM" => 1, # Human
579             "INV" => 1, # Invertebrate
580             "MAM" => 1, # Other Mammal
581             "VRT" => 1, # Other Vertebrate
582             "MUS" => 1, # Mus musculus
583             "PLN" => 1, # Plant
584             "PRO" => 1, # Prokaryote
585             "ROD" => 1, # Other Rodent
586             "SYN" => 1, # Synthetic
587             "UNC" => 1, # Unclassified
588             "VRL" => 1 # Viral
589             );
590              
591 2         15 return exists($EMBL_divisions{$division});
592             }
593              
594             =head2 _is_valid_molecule_type
595              
596             Title : _is_valid_molecule_type
597             Usage : $self->_is_valid_molecule_type($mol)
598             Function: tests molecule type for validity
599             Returns : true if $mol is a valid EMBL release 87 molecule type.
600             Args : molecule type string
601              
602             =cut
603              
604             sub _is_valid_molecule_type {
605 2     2   8 my ($self, $moltype) = @_;
606              
607 2         23 my %EMBL_molecule_types = (
608             "genomic DNA" => 1,
609             "genomic RNA" => 1,
610             "mRNA" => 1,
611             "tRNA" => 1,
612             "rRNA" => 1,
613             "snoRNA" => 1,
614             "snRNA" => 1,
615             "scRNA" => 1,
616             "pre-RNA" => 1,
617             "other RNA" => 1,
618             "other DNA" => 1,
619             "unassigned DNA" => 1,
620             "unassigned RNA" => 1
621             );
622              
623 2         26 return exists($EMBL_molecule_types{$moltype});
624             }
625              
626             =head2 write_seq
627              
628             Title : write_seq
629             Usage : $stream->write_seq($seq)
630             Function: writes the $seq object (must be seq) to the stream
631             Returns : 1 for success and undef for error
632             Args : array of 1 to n Bio::SeqI objects
633              
634              
635             =cut
636              
637             sub write_seq {
638 13     13 1 114 my ($self,@seqs) = @_;
639              
640 13         40 foreach my $seq ( @seqs ) {
641 13 50       44 $self->throw("Attempting to write with no seq!") unless defined $seq;
642 13 100 100     129 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643 4 50       22 $self->warn("$seq is not a SeqI compliant sequence object!")
644             if $self->verbose >= 0;
645 4 100 66     30 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646 2         34 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647             }
648             }
649 11   100     61 my $str = $seq->seq || '';
650              
651             # Write the ID line.
652 11         53 $self->_write_ID_line($seq);
653              
654              
655             # Write the accession line if present
656 11         21 my( $acc );
657             {
658 11 50 66     25 if ( my $func = $self->_ac_generation_func ) {
  11 100       38  
    50          
659 0         0 $acc = &{$func}($seq);
  0         0  
660             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661             defined($seq->accession_number) ) {
662 4         12 $acc = $seq->accession_number;
663 4         22 $acc = join("; ", $acc, $seq->get_secondary_accessions);
664             } elsif ( $seq->can('accession_number') ) {
665 7         27 $acc = $seq->accession_number;
666             }
667              
668 11 50       44 if (defined $acc) {
669 11 50       59 $self->_print("AC $acc;\n",
670             "XX\n") || return;
671             }
672             }
673              
674             # Date lines
675 11         32 my $switch=0;
676 11 100       73 if ( $seq->can('get_dates') ) {
677 4         18 my @dates = $seq->get_dates();
678 4         8 my $ct = 1;
679 4         8 my $date_flag = 0;
680 4         16 my ($cr) = $seq->annotation->get_Annotations("creation_release");
681 4         16 my ($ur) = $seq->annotation->get_Annotations("update_release");
682 4         13 my ($uv) = $seq->annotation->get_Annotations("update_version");
683              
684 4 0 33     14 unless ($cr && $ur && $ur) {
      33        
685 4         8 $date_flag = 1;
686             }
687              
688 4         10 foreach my $dt (@dates) {
689 0 0       0 if (!$date_flag) {
690 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
691             $dt." (Rel. ".($cr->value()).", Created)",
692             '\s+|$',80) if $ct == 1;
693 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
694             $dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695             '\s+|$',80) if $ct == 2;
696             } else { # other formats?
697 0         0 $self->_write_line_EMBL_regex("DT ","DT ",
698             $dt,'\s+|$',80);
699             }
700 0         0 $switch =1;
701 0         0 $ct++;
702             }
703 4 50       14 if ($switch == 1) {
704 0 0       0 $self->_print("XX\n") || return;
705             }
706             }
707              
708             # Description lines
709 11 50       60 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
710 11 50       38 $self->_print( "XX\n") || return;
711              
712             # if there, write the kw line
713             {
714 11         21 my( $kw );
  11         22  
715 11 50       61 if ( my $func = $self->_kw_generation_func ) {
    100          
716 0         0 $kw = &{$func}($seq);
  0         0  
717             } elsif ( $seq->can('keywords') ) {
718 4         16 $kw = $seq->keywords;
719             }
720 11 100       44 if (defined $kw) {
721 4 50       632 $self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
722 4 50       14 $self->_print( "XX\n") || return;
723             }
724             }
725              
726             # Organism lines
727              
728 11 100 100     112 if ($seq->can('species') && (my $spec = $seq->species)) {
729 6         39 my @class = $spec->classification();
730 6         11 shift @class; # get rid of species name. Some embl files include
731             # the species name in the OC lines, but this seems
732             # more like an error than something we need to
733             # emulate
734 6         34 my $OS = $spec->scientific_name;
735 6 50       24 if ($spec->common_name) {
736 0         0 $OS .= ' ('.$spec->common_name.')';
737             }
738 6 50       290 $self->_print("OS $OS\n") || return;
739 6         41 my $OC = join('; ', reverse(@class)) .'.';
740 6 50       22 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
741 6 50       29 if ($spec->organelle) {
742 0 0       0 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
743             }
744 6         21 my $ncbi_taxid = $spec->ncbi_taxid;
745 6 100       16 if ($ncbi_taxid) {
746 2 50       10 $self->_print("OX NCBI_TaxID=$ncbi_taxid\n") || return;
747             }
748 6 50       21 $self->_print("XX\n") || return;
749             }
750             # Reference lines
751 11         27 my $t = 1;
752 11 100 66     88 if ( $seq->can('annotation') && defined $seq->annotation ) {
753 9         24 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754 6 50       19 $self->_print( "RN [$t]\n") || return;
755              
756             # Having no RP line is legal, but we need both
757             # start and end for a valid location.
758 6 50       21 if ($ref->comment) {
759 0 0       0 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
760             }
761 6         17 my $start = $ref->start;
762 6         18 my $end = $ref->end;
763 6 50 33     20 if ($start and $end) {
    0 0        
764 6 50       16 $self->_print( "RP $start-$end\n") || return;
765             } elsif ($start or $end) {
766 0         0 $self->throw("Both start and end are needed for a valid RP line.".
767             " Got: start='$start' end='$end'");
768             }
769              
770 6 50       17 if (my $med = $ref->medline) {
771 0 0       0 $self->_print( "RX MEDLINE; $med.\n") || return;
772             }
773 6 50       13 if (my $pm = $ref->pubmed) {
774 0 0       0 $self->_print( "RX PUBMED; $pm.\n") || return;
775             }
776 6         15 my $authors = $ref->authors;
777 6         73 $authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778              
779 6 50       22 $self->_write_line_EMBL_regex("RA ", "RA ",
780             $authors . ";",
781             '\s+|$', 80) || return; #'
782              
783             # If there is no title to the reference, it appears
784             # as a single semi-colon. All titles must end in
785             # a semi-colon.
786 6   100     17 my $ref_title = $ref->title || '';
787 6         40 $ref_title =~ s/[\s;]*$/;/;
788 6 50       16 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
789 6 50       16 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
790 6 50       14 $self->_print("XX\n") || return;
791 6         14 $t++;
792             }
793              
794             # DB Xref lines
795 9 100       47 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796 2         4 for my $dr (@db_xref) {
797 43         84 my $db_name = $dr->database;
798 43         74 my $prim = $dr->primary_id;
799              
800 43   50     74 my $opt = $dr->optional_id || '';
801 43 50       102 my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802 43 50       68 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
803             }
804 2 50       9 $self->_print("XX\n") || return;
805             }
806            
807             # Comment lines
808 9         37 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809 4 50       19 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
810 4 50       14 $self->_print("XX\n") || return;
811             }
812             }
813             # "\\s\+\|\$"
814              
815             ## FEATURE TABLE
816              
817 11 50       60 $self->_print("FH Key Location/Qualifiers\n") || return;
818 11 50       34 $self->_print("FH\n") || return;
819              
820 11 100       240 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821 11 100       42 if ($feats[0]) {
822 6 50       25 if ( defined $self->_post_sort ) {
823             # we need to read things into an array.
824             # Process. Sort them. Print 'em
825              
826 0         0 my $post_sort_func = $self->_post_sort();
827 0         0 my @fth;
828              
829 0         0 foreach my $sf ( @feats ) {
830 0         0 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831             }
832              
833 0         0 @fth = sort { &$post_sort_func($a,$b) } @fth;
  0         0  
834              
835 0         0 foreach my $fth ( @fth ) {
836 0 0       0 $self->_print_EMBL_FTHelper($fth) || return;
837             }
838             } else {
839             # not post sorted. And so we can print as we get them.
840             # lower memory load...
841              
842 6         23 foreach my $sf ( @feats ) {
843 38         138 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844 38         73 foreach my $fth ( @fth ) {
845 38 50       99 if ( $fth->key eq 'CONTIG') {
846 0         0 $self->_show_dna(0);
847             }
848 38 50       138 $self->_print_EMBL_FTHelper($fth) || return;
849             }
850             }
851             }
852             }
853              
854 11 50       61 if ( $self->_show_dna() == 0 ) {
855 0 0       0 $self->_print( "//\n") || return;
856 0         0 return;
857             }
858 11 50       42 $self->_print( "XX\n") || return;
859              
860             # finished printing features.
861              
862             # print contig if present : bug#2982
863 11 100 66     127 if ( $seq->can('annotation') && defined $seq->annotation) {
864 9         42 foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865 0 0       0 if ($ctg->value) {
866 0 0       0 $self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867             '[,]|$', 80) || return;
868             }
869             }
870             }
871             # print sequence lines only if sequence is present! bug#2982
872 11 100       46 if (length($str)) {
873 8         48 $str =~ tr/A-Z/a-z/;
874              
875             # Count each nucleotide
876 8         30 my $alen = $str =~ tr/a/a/;
877 8         29 my $clen = $str =~ tr/c/c/;
878 8         27 my $glen = $str =~ tr/g/g/;
879 8         23 my $tlen = $str =~ tr/t/t/;
880            
881 8         32 my $len = $seq->length();
882 8         24 my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883 8 50       32 if ( $olen < 0 ) {
884 0         0 $self->warn("Weird. More atgc than bases. Problem!");
885             }
886            
887 8 50       63 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
888              
889 8         19 my $nuc = 60; # Number of nucleotides per line
890 8         20 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891 8         14 my $out_pat = 'A11' x 6; # Pattern for packing a line
892 8         16 my $length = length($str);
893            
894             # Calculate the number of nucleotides which fit on whole lines
895 8         45 my $whole = int($length / $nuc) * $nuc;
896              
897             # Print the whole lines
898 8         12 my( $i );
899 8         33 for ($i = 0; $i < $whole; $i += $nuc) {
900 162         752 my $blocks = pack $out_pat,
901             unpack $whole_pat,
902             substr($str, $i, $nuc);
903 162 50       718 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
904             }
905              
906             # Print the last line
907 8 50       46 if (my $last = substr($str, $i)) {
908 8         19 my $last_len = length($last);
909 8         46 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910 8         83 my $blocks = pack $out_pat,
911             unpack($last_pat, $last);
912 8 50       67 $self->_print(sprintf(" $blocks%9d\n", $length)) ||
913             return; # Add the length to the end
914             }
915             }
916            
917 11 50       49 $self->_print( "//\n") || return;
918            
919 11 50 33     76 $self->flush if $self->_flush_on_write && defined $self->_fh;
920             }
921 11         85 return 1;
922             }
923              
924             =head2 _print_EMBL_FTHelper
925              
926             Title : _print_EMBL_FTHelper
927             Usage :
928             Function: Internal function
929             Returns : 1 if writing succeeded, otherwise undef
930             Args :
931              
932              
933             =cut
934              
935             sub _print_EMBL_FTHelper {
936 38     38   81 my ($self,$fth) = @_;
937              
938 38 50 33     238 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939 0         0 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940             }
941              
942              
943             #$self->_print( "FH Key Location/Qualifiers\n");
944             #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945             # let
946 38 50       102 if ( $fth->key eq 'CONTIG' ) {
947 0 0       0 $self->_print("XX\n") || return;
948 0 0       0 $self->_write_line_EMBL_regex("CO ",
949             "CO ",$fth->loc,
950             '\,|$',80) || return; #'
951 0         0 return 1;
952             }
953 38 50       110 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954             "FT ",$fth->loc,
955             '\,|$',80) || return; #'
956 38         76 foreach my $tag (sort keys %{$fth->field} ) {
  38         105  
957 108 50       249 if ( ! defined $fth->field->{$tag} ) {
958 0         0 next;
959             }
960 108         138 foreach my $value ( @{$fth->field->{$tag}} ) {
  108         190  
961 139         243 $value =~ s/\"/\"\"/g;
962 139 50 66     412 if ($value eq "_no_value") {
    50          
963 0 0       0 $self->_write_line_EMBL_regex("FT ",
964             "FT ",
965             "/$tag",'.|$',80) || return; #'
966             }
967             # there are almost 3x more quoted qualifier values and they
968             # are more common too so we take quoted ones first
969             #
970             # Long qualifiers, that will be line wrapped, are always quoted
971             elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972 139 100       413 my $pat = $value =~ /\s+/ ? '\s+|\-|$' : '.|\-|$';
973 139 50       414 $self->_write_line_EMBL_regex("FT ",
974             "FT ",
975             "/$tag=\"$value\"",$pat,80) || return;
976             } else {
977 0 0       0 $self->_write_line_EMBL_regex("FT ",
978             "FT ",
979             "/$tag=$value",'.|$',80) || return; #'
980             }
981             }
982             }
983              
984 38         273 return 1;
985             }
986              
987              
988              
989             =head2 _read_EMBL_Contig()
990              
991             Title : _read_EMBL_Contig
992             Usage :
993             Function: convert CO lines into annotations
994             Returns :
995             Args :
996              
997             =cut
998              
999             sub _read_EMBL_Contig {
1000 4     4   11 my ($self, $buffer) = @_;
1001 4         6 my @ret;
1002 4 50       18 if ( $$buffer !~ /^CO/ ) {
1003 0         0 warn("Not parsing line '$$buffer' which maybe important");
1004             }
1005 4         14 $self->_pushback($$buffer);
1006 4         12 while ( defined ($_ = $self->_readline) ) {
1007 356 100       906 /^C/ || last;
1008 353 50       898 /^CO\s+(.*)/ && do {
1009 353         907 push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010             -value => $1);
1011             };
1012             }
1013 4         12 $$buffer = $_;
1014 4         59 return @ret;
1015              
1016             }
1017             #'
1018             =head2 _read_EMBL_References
1019              
1020             Title : _read_EMBL_References
1021             Usage :
1022             Function: Reads references from EMBL format. Internal function really
1023             Example :
1024             Returns :
1025             Args :
1026              
1027              
1028             =cut
1029              
1030             sub _read_EMBL_References {
1031 99     99   184 my ($self,$buffer) = @_;
1032 99         130 my (@refs);
1033              
1034             # assume things are starting with RN
1035              
1036 99 50       289 if ( $$buffer !~ /^RN/ ) {
1037 0         0 warn("Not parsing line '$$buffer' which maybe important");
1038             }
1039 99         519 my $b1;
1040             my $b2;
1041 99         0 my $title;
1042 99         0 my $loc;
1043 99         0 my $au;
1044 99         0 my $med;
1045 99         0 my $pm;
1046 99         0 my $com;
1047              
1048 99         185 while ( defined ($_ = $self->_readline) ) {
1049 727 100       1820 /^R/ || last;
1050 628 100       1147 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
  40         153  
  40         78  
1051 628 100       977 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
  55         134  
1052 628 100       935 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
  10         25  
1053 628 100       1148 /^RA (.*)/ && do {
1054 193         529 $au = $self->_concatenate_lines($au,$1); next;
  193         384  
1055             };
1056 435 100       759 /^RT (.*)/ && do {
1057 164         323 $title = $self->_concatenate_lines($title,$1); next;
  164         326  
1058             };
1059 271 100       621 /^RL (.*)/ && do {
1060 156         323 $loc = $self->_concatenate_lines($loc,$1); next;
  156         351  
1061             };
1062 115 100       272 /^RC (.*)/ && do {
1063 3         12 $com = $self->_concatenate_lines($com,$1); next;
  3         7  
1064             };
1065             }
1066              
1067 99         426 my $ref = Bio::Annotation::Reference->new();
1068 99         540 $au =~ s/;\s*$//g;
1069 99         325 $title =~ s/;\s*$//g;
1070              
1071 99         295 $ref->start($b1);
1072 99         228 $ref->end($b2);
1073 99         253 $ref->authors($au);
1074 99         224 $ref->title($title);
1075 99         217 $ref->location($loc);
1076 99         278 $ref->medline($med);
1077 99         286 $ref->comment($com);
1078 99         226 $ref->pubmed($pm);
1079              
1080 99         133 push(@refs,$ref);
1081 99         170 $$buffer = $_;
1082              
1083 99         258 return @refs;
1084             }
1085              
1086             =head2 _read_EMBL_Species
1087              
1088             Title : _read_EMBL_Species
1089             Usage :
1090             Function: Reads the EMBL Organism species and classification
1091             lines.
1092             Example :
1093             Returns : A Bio::Species object
1094             Args : a reference to the current line buffer, accession number
1095              
1096             =cut
1097              
1098             sub _read_EMBL_Species {
1099 21     21   74 my( $self, $buffer, $acc ) = @_;
1100 21         39 my $org;
1101              
1102 21         50 $_ = $$buffer;
1103 21         49 my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104 21   66     96 while (defined( $_ ||= $self->_readline )) {
1105 92 100       446 if (/^OS\s+(.+)/) {
    100          
    50          
1106 21 50       87 $sci_name .= ($sci_name) ? ' '.$1 : $1;
1107             } elsif (s/^OC\s+(.+)$//) {
1108 50         181 $class_lines .= $1;
1109             } elsif (/^OG\s+(.*)/) {
1110 0         0 $org = $1;
1111             } else {
1112 21         59 last;
1113             }
1114              
1115 71         250 $_ = undef; # Empty $_ to trigger read of next line
1116             }
1117              
1118             # $$buffer = $_;
1119 21         96 $self->_pushback($_);
1120            
1121 21         40 $sci_name =~ s{\.$}{};
1122 21 50       60 $sci_name || return;
1123              
1124             # Convert data in classification lines into classification array.
1125             # only split on ';' or '.' so that classification that is 2 or more words
1126             # will still get matched, use map() to remove trailing/leading/intervening
1127             # spaces
1128 21         242 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
  224         401  
  224         326  
  224         244  
  224         333  
1129              
1130             # do we have a genus?
1131 21         64 my $possible_genus = $class[-1];
1132 21 50       80 $possible_genus .= "|$class[-2]" if $class[-2];
1133 21 50       686 if ($sci_name =~ /^($possible_genus)/) {
1134 21         74 $genus = $1;
1135 21         263 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1136             } else {
1137 0         0 $species = $sci_name;
1138             }
1139              
1140             # Don't make a species object if it is "Unknown" or "None"
1141 21 50       86 if ($genus) {
1142 21 50       111 return if $genus =~ /^(Unknown|None)$/i;
1143             }
1144              
1145             # is this organism of rank species or is it lower?
1146             # (doesn't catch everything, but at least the guess isn't dangerous)
1147 21 50       120 if ($species =~ /subsp\.|var\./) {
1148 0         0 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1149             }
1150              
1151             # sometimes things have common name in brackets, like
1152             # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153             # name bit. Probably dangerous if real scientific species name ends in
1154             # bracketed bit.
1155 21 50       67 unless ($class[-1] eq 'Viruses') {
1156 21         110 ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157 21 100       110 $sci_name =~ s/\s+\(.+\)$// if $common;
1158             }
1159              
1160             # Bio::Species array needs array in Species -> Kingdom direction
1161 21 50       65 unless ($class[-1] eq $sci_name) {
1162 21         59 push(@class, $sci_name);
1163             }
1164 21         44 @class = reverse @class;
1165              
1166             # do minimal sanity checks before we hand off to Bio::Species which won't
1167             # be able to give informative throw messages if it has to throw because
1168             # of problems here
1169 21 50       67 $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170 21         35 my %names;
1171 21         90 foreach my $i (0..$#class) {
1172 245         270 my $name = $class[$i];
1173 245         406 $names{$name}++;
1174             # this code breaks examples like: Xenopus (Silurana) tropicalis
1175             # commenting out, see bug 3158
1176            
1177             #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178             # $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179             #}
1180             }
1181 21         170 my $make = Bio::Species->new();
1182 21         112 $make->scientific_name($sci_name);
1183 21         93 $make->classification(@class);
1184 21 50       97 unless ($class[-1] eq 'Viruses') {
1185 21 50       136 $make->genus($genus) if $genus;
1186 21 100       96 $make->species($species) if $species;
1187 21 50       66 $make->sub_species($sub_species) if $sub_species;
1188 21 100       89 $make->common_name($common) if $common;
1189             }
1190 21 50       56 $make->organelle($org) if $org;
1191 21         208 return $make;
1192             }
1193              
1194             =head2 _read_EMBL_DBLink
1195              
1196             Title : _read_EMBL_DBLink
1197             Usage :
1198             Function: Reads the EMBL database cross reference ("DR") lines
1199             Example :
1200             Returns : A list of Bio::Annotation::DBLink objects
1201             Args :
1202              
1203             =cut
1204              
1205             sub _read_EMBL_DBLink {
1206 5     5   11 my( $self,$buffer ) = @_;
1207 5         7 my( @db_link );
1208              
1209 5         15 $_ = $$buffer;
1210 5   66     23 while (defined( $_ ||= $self->_readline )) {
1211 30 100       145 if ( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212 25         72 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213 25         77 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214             -primary_id => $prim_id,
1215             -optional_id => $sec_id);
1216              
1217 25         45 push(@db_link, $link);
1218             } else {
1219 5         8 last;
1220             }
1221 25         68 $_ = undef; # Empty $_ to trigger read of next line
1222             }
1223              
1224 5         11 $$buffer = $_;
1225 5         12 return @db_link;
1226             }
1227              
1228             =head2 _read_EMBL_TaxID_DBLink
1229              
1230             Title : _read_EMBL_TaxID_DBLink
1231             Usage :
1232             Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233             Example :
1234             Returns : A list of Bio::Annotation::DBLink objects
1235             Args :
1236              
1237             =cut
1238              
1239             sub _read_EMBL_TaxID_DBLink {
1240 3     3   9 my( $self,$buffer ) = @_;
1241 3         5 my( @db_link );
1242              
1243 3         6 $_ = $$buffer;
1244 3   66     16 while (defined( $_ ||= $self->_readline )) {
1245 4 100       68 if ( /^OX (\S+)=(\d+);$/ ) {
1246 1         10 my ($databse, $prim_id) = ($1,$2);
1247 1         20 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248             -primary_id => $prim_id,);
1249 1         6 push(@db_link, $link);
1250             } else {
1251 3         8 last;
1252             }
1253 1         8 $_ = undef; # Empty $_ to trigger read of next line
1254             }
1255              
1256 3         6 $$buffer = $_;
1257 3         8 return @db_link;
1258             }
1259              
1260             =head2 _filehandle
1261              
1262             Title : _filehandle
1263             Usage : $obj->_filehandle($newval)
1264             Function:
1265             Example :
1266             Returns : value of _filehandle
1267             Args : newvalue (optional)
1268              
1269              
1270             =cut
1271              
1272             sub _filehandle{
1273 0     0   0 my ($obj,$value) = @_;
1274 0 0       0 if ( defined $value) {
1275 0         0 $obj->{'_filehandle'} = $value;
1276             }
1277 0         0 return $obj->{'_filehandle'};
1278              
1279             }
1280              
1281             =head2 _read_FTHelper_EMBL
1282              
1283             Title : _read_FTHelper_EMBL
1284             Usage : _read_FTHelper_EMBL($buffer)
1285             Function: reads the next FT key line
1286             Example :
1287             Returns : Bio::SeqIO::FTHelper object
1288             Args : filehandle and reference to a scalar
1289              
1290              
1291             =cut
1292              
1293             sub _read_FTHelper_EMBL {
1294 256     256   380 my ($self,$buffer) = @_;
1295              
1296 256         344 my ($key, # The key of the feature
1297             $loc, # The location line from the feature
1298             @qual, # An arrray of lines making up the qualifiers
1299             );
1300              
1301 256 50       1017 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
    0          
1302 256         600 $key = $1;
1303 256         437 $loc = $2;
1304             # Read all the lines up to the next feature
1305 256         587 while ( defined($_ = $self->_readline) ) {
1306 2038 100       11902 if (/^FT(\s+)(.+?)\s*$/) {
1307             # Lines inside features are preceeded by 19 spaces
1308             # A new feature is preceeded by 3 spaces
1309 2016 100       3895 if (length($1) > 4) {
1310             # Add to qualifiers if we're in the qualifiers
1311 1782 100       2838 if (@qual) {
    100          
1312 1523         3891 push(@qual, $2);
1313             }
1314             # Start the qualifier list if it's the first qualifier
1315             elsif (substr($2, 0, 1) eq '/') {
1316 254         752 @qual = ($2);
1317             }
1318             # We're still in the location line, so append to location
1319             else {
1320 5         23 $loc .= $2;
1321             }
1322             } else {
1323             # We've reached the start of the next feature
1324 234         328 last;
1325             }
1326             } else {
1327             # We're at the end of the feature table
1328 22         44 last;
1329             }
1330             }
1331             } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332 0         0 $key = 'CONTIG';
1333 0         0 $loc = $1;
1334             # Read all the lines up to the next feature
1335 0         0 while ( defined($_ = $self->_readline) ) {
1336 0 0       0 if (/^CO\s+(\S+)\s*$/) {
1337 0         0 $loc .= $1;
1338             } else {
1339             # We've reached the start of the next feature
1340 0         0 last;
1341             }
1342             }
1343             } else {
1344             # No feature key
1345 0         0 return;
1346             }
1347              
1348             # Put the first line of the next feature into the buffer
1349 256         371 $$buffer = $_;
1350              
1351             # Make the new FTHelper object
1352 256         797 my $out = Bio::SeqIO::FTHelper->new();
1353 256         631 $out->verbose($self->verbose());
1354 256         604 $out->key($key);
1355 256         551 $out->loc($loc);
1356              
1357             # Now parse and add any qualifiers. (@qual is kept
1358             # intact to provide informative error messages.)
1359 256         272 my $last_unquoted_qualifier;
1360             QUAL:
1361 256         518 for (my $i = 0; $i < @qual; $i++) {
1362 1091         1481 my $data = $qual[$i];
1363 1091         1127 my ( $qualifier, $value );
1364            
1365 1091 50       4997 unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.*))?})) {
1366 0 0       0 if ( defined $last_unquoted_qualifier ) {
1367             # handle case of unquoted multiline - read up everything until the next qualifier
1368 0   0     0 do {
1369             # Protein sequence translations need to be joined without spaces,
1370             # other qualifiers need those.
1371 0 0       0 $value .= ' ' if $qualifier ne "translation";
1372 0         0 $value .= $data;
1373             } while defined($data = $qual[++$i]) && $data !~ m[^/];
1374 0         0 $i--;
1375 0         0 $out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376 0         0 $last_unquoted_qualifier = undef;
1377 0         0 next QUAL;
1378             } else {
1379 0         0 $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380             . join('', map "$_\n", @qual));
1381             }
1382             }
1383 1091 50       1979 $qualifier = '' if not defined $qualifier;
1384              
1385 1091         1420 $last_unquoted_qualifier = undef;
1386 1091 50       1447 if (defined $value) {
1387             # Do we have a quoted value?
1388 1091 100       1672 if (substr($value, 0, 1) eq '"') {
1389             # Keep adding to value until we find the trailing quote
1390             # and the quotes are balanced
1391             QUOTES:
1392 971   66     3663 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393 686         729 $i++;
1394 686         770 my $next = $qual[$i];
1395 686 50       971 if (!defined($next)) {
1396 0         0 $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397             "\nAdding quote to close...".
1398             "Check sequence quality!");
1399 0         0 $value .= '"';
1400 0         0 last QUOTES;
1401             }
1402              
1403             # Protein sequence translations need to be joined without spaces,
1404             # other qualifiers need those.
1405 686 100       880 if ($qualifier eq "translation") {
1406 569         1600 $value .= $next;
1407             } else {
1408 117         489 $value .= " $next";
1409             }
1410             }
1411             # Trim leading and trailing quotes
1412 971         8297 $value =~ s/^"|"$//g;
1413             # Undouble internal quotes
1414 971         1680 $value =~ s/""/"/g; #"
1415             } else {
1416 120         169 $last_unquoted_qualifier = $qualifier;
1417             }
1418             } else {
1419 0         0 $value = '_no_value';
1420             }
1421              
1422             # Store the qualifier
1423 1091   100     2178 $out->field->{$qualifier} ||= [];
1424 1091         1173 push(@{$out->field->{$qualifier}},$value);
  1091         1684  
1425             }
1426              
1427 256         657 return $out;
1428             }
1429              
1430             =head2 _write_line_EMBL
1431              
1432             Title : _write_line_EMBL
1433             Usage :
1434             Function: internal function
1435             Example :
1436             Returns : 1 if writing succeeded, else undef
1437             Args :
1438              
1439              
1440             =cut
1441              
1442             sub _write_line_EMBL {
1443 0     0   0 my ($self,$pre1,$pre2,$line,$length) = @_;
1444              
1445 0 0       0 $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446 0         0 my $subl = $length - length $pre2;
1447 0         0 my $linel = length $line;
1448 0         0 my $i;
1449              
1450 0         0 my $sub = substr($line,0,$length - length $pre1);
1451              
1452 0 0       0 $self->_print( "$pre1$sub\n") || return;
1453              
1454 0         0 for ($i= ($length - length $pre1);$i < $linel;) {
1455 0         0 $sub = substr($line,$i,($subl));
1456 0 0       0 $self->_print( "$pre2$sub\n") || return;
1457 0         0 $i += $subl;
1458             }
1459              
1460 0         0 return 1;
1461             }
1462              
1463             =head2 _write_line_EMBL_regex
1464              
1465             Title : _write_line_EMBL_regex
1466             Usage :
1467             Function: internal function for writing lines of specified
1468             length, with different first and the next line
1469             left hand headers and split at specific points in the
1470             text
1471             Example :
1472             Returns : nothing
1473             Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474              
1475              
1476             =cut
1477              
1478             sub _write_line_EMBL_regex {
1479 263     263   549 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480              
1481             #print STDOUT "Going to print with $line!\n";
1482              
1483 263 50       456 $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484              
1485 263         378 my $subl = $length - (length $pre1) -1 ;
1486 263         257 my( @lines );
1487              
1488 263         410 CHUNK: while($line) {
1489 401         633 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1490 401 50       4174 if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491 401         1077 my $l = $1.$2;
1492 401 100       682 $l =~ s/#/ /g # remove word wrap protection char '#'
1493             if $pre1 eq "RA ";
1494 401         566 my $newl = $3;
1495 401         681 $line = substr($line,length($l));
1496             # be strict about not padding spaces according to
1497             # genbank format
1498 401         1135 $l =~ s/\s+$//;
1499 401 50       654 next CHUNK if ($l eq '');
1500 401         528 push(@lines, $l);
1501 401         895 next CHUNK;
1502             }
1503             }
1504             # if we get here none of the patterns matched $subl or less chars
1505 0         0 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1506             "of $subl chars or less - this tag won't print right");
1507             # insert a space char to prevent infinite loops
1508 0         0 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1509             }
1510 263         397 my $s = shift @lines;
1511 263 100 50     1072 ($self->_print("$pre1$s\n") || return) if $s;
1512 263         462 foreach my $s ( @lines ) {
1513 145 50       300 $self->_print("$pre2$s\n") || return;
1514             }
1515              
1516 263         707 return 1;
1517             }
1518              
1519             =head2 _post_sort
1520              
1521             Title : _post_sort
1522             Usage : $obj->_post_sort($newval)
1523             Function:
1524             Returns : value of _post_sort
1525             Args : newvalue (optional)
1526              
1527              
1528             =cut
1529              
1530             sub _post_sort{
1531 6     6   11 my $obj = shift;
1532 6 50       21 if ( @_ ) {
1533 0         0 my $value = shift;
1534 0         0 $obj->{'_post_sort'} = $value;
1535             }
1536 6         24 return $obj->{'_post_sort'};
1537              
1538             }
1539              
1540             =head2 _show_dna
1541              
1542             Title : _show_dna
1543             Usage : $obj->_show_dna($newval)
1544             Function:
1545             Returns : value of _show_dna
1546             Args : newvalue (optional)
1547              
1548              
1549             =cut
1550              
1551             sub _show_dna{
1552 42     42   77 my $obj = shift;
1553 42 100       119 if ( @_ ) {
1554 31         60 my $value = shift;
1555 31         79 $obj->{'_show_dna'} = $value;
1556             }
1557 42         121 return $obj->{'_show_dna'};
1558              
1559             }
1560              
1561             =head2 _id_generation_func
1562              
1563             Title : _id_generation_func
1564             Usage : $obj->_id_generation_func($newval)
1565             Function:
1566             Returns : value of _id_generation_func
1567             Args : newvalue (optional)
1568              
1569              
1570             =cut
1571              
1572             sub _id_generation_func{
1573 11     11   24 my $obj = shift;
1574 11 50       34 if ( @_ ) {
1575 0         0 my $value = shift;
1576 0         0 $obj->{'_id_generation_func'} = $value;
1577             }
1578 11         39 return $obj->{'_id_generation_func'};
1579              
1580             }
1581              
1582             =head2 _ac_generation_func
1583              
1584             Title : _ac_generation_func
1585             Usage : $obj->_ac_generation_func($newval)
1586             Function:
1587             Returns : value of _ac_generation_func
1588             Args : newvalue (optional)
1589              
1590              
1591             =cut
1592              
1593             sub _ac_generation_func{
1594 11     11   28 my $obj = shift;
1595 11 50       32 if ( @_ ) {
1596 0         0 my $value = shift;
1597 0         0 $obj->{'_ac_generation_func'} = $value;
1598             }
1599 11         190 return $obj->{'_ac_generation_func'};
1600              
1601             }
1602              
1603             =head2 _sv_generation_func
1604              
1605             Title : _sv_generation_func
1606             Usage : $obj->_sv_generation_func($newval)
1607             Function:
1608             Returns : value of _sv_generation_func
1609             Args : newvalue (optional)
1610              
1611              
1612             =cut
1613              
1614             sub _sv_generation_func{
1615 0     0   0 my $obj = shift;
1616 0 0       0 if ( @_ ) {
1617 0         0 my $value = shift;
1618 0         0 $obj->{'_sv_generation_func'} = $value;
1619             }
1620 0         0 return $obj->{'_sv_generation_func'};
1621              
1622             }
1623              
1624             =head2 _kw_generation_func
1625              
1626             Title : _kw_generation_func
1627             Usage : $obj->_kw_generation_func($newval)
1628             Function:
1629             Returns : value of _kw_generation_func
1630             Args : newvalue (optional)
1631              
1632              
1633             =cut
1634              
1635             sub _kw_generation_func{
1636 11     11   24 my $obj = shift;
1637 11 50       30 if ( @_ ) {
1638 0         0 my $value = shift;
1639 0         0 $obj->{'_kw_generation_func'} = $value;
1640             }
1641 11         103 return $obj->{'_kw_generation_func'};
1642              
1643             }
1644              
1645             1;