File Coverage

Bio/SeqIO/embl.pm
Criterion Covered Total %
statement 538 651 82.6
branch 267 434 61.5
condition 90 128 70.3
subroutine 30 33 90.9
pod 2 2 100.0
total 927 1248 74.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::EMBL
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::embl - EMBL sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler system. Go:
22              
23             $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24              
25             while ( (my $seq = $stream->next_seq()) ) {
26             # do something with $seq
27             }
28              
29             =head1 DESCRIPTION
30              
31             This object can transform Bio::Seq objects to and from EMBL flat
32             file databases.
33              
34             There is a lot of flexibility here about how to dump things which
35             should be documented more fully.
36              
37             There should be a common object that this and Genbank share (probably
38             with Swissprot). Too much of the magic is identical.
39              
40             =head2 Optional functions
41              
42             =over 3
43              
44             =item _show_dna()
45              
46             (output only) shows the dna or not
47              
48             =item _post_sort()
49              
50             (output only) provides a sorting func which is applied to the FTHelpers
51             before printing
52              
53             =item _id_generation_func()
54              
55             This is function which is called as
56              
57             print "ID ", $func($annseq), "\n";
58              
59             To generate the ID line. If it is not there, it generates a sensible ID
60             line using a number of tools.
61              
62             If you want to output annotations in EMBL format they need to be
63             stored in a Bio::Annotation::Collection object which is accessible
64             through the Bio::SeqI interface method L.
65              
66             The following are the names of the keys which are polled from a
67             L object.
68              
69             reference - Should contain Bio::Annotation::Reference objects
70             comment - Should contain Bio::Annotation::Comment objects
71             dblink - Should contain Bio::Annotation::DBLink objects
72              
73             =back
74              
75             =head1 FEEDBACK
76              
77             =head2 Mailing Lists
78              
79             User feedback is an integral part of the evolution of this and other
80             Bioperl modules. Send your comments and suggestions preferably to one
81             of the Bioperl mailing lists. Your participation is much appreciated.
82              
83             bioperl-l@bioperl.org - General discussion
84             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85              
86             =head2 Support
87              
88             Please direct usage questions or support issues to the mailing list:
89              
90             I
91              
92             rather than to the module maintainer directly. Many experienced and
93             reponsive experts will be able look at the problem and quickly
94             address it. Please include a thorough description of the problem
95             with code and data examples if at all possible.
96              
97             =head2 Reporting Bugs
98              
99             Report bugs to the Bioperl bug tracking system to help us keep track
100             the bugs and their resolution. Bug reports can be submitted via
101             the web:
102              
103             https://github.com/bioperl/bioperl-live/issues
104              
105             =head1 AUTHOR - Ewan Birney
106              
107             Email birney@ebi.ac.uk
108              
109             =head1 APPENDIX
110              
111             The rest of the documentation details each of the object
112             methods. Internal methods are usually preceded with a _
113              
114             =cut
115              
116              
117             # Let the code begin...
118              
119              
120             package Bio::SeqIO::embl;
121 7     7   651 use vars qw(%FTQUAL_NO_QUOTE);
  7         10  
  7         405  
122 7     7   32 use strict;
  7         7  
  7         177  
123 7     7   1443 use Bio::SeqIO::FTHelper;
  7         13  
  7         160  
124 7     7   29 use Bio::SeqFeature::Generic;
  7         10  
  7         124  
125 7     7   1290 use Bio::Species;
  7         10  
  7         163  
126 7     7   33 use Bio::Seq::SeqFactory;
  7         9  
  7         142  
127 7     7   26 use Bio::Annotation::Collection;
  7         9  
  7         151  
128 7     7   1140 use Bio::Annotation::Comment;
  7         8  
  7         143  
129 7     7   1530 use Bio::Annotation::Reference;
  7         9  
  7         181  
130 7     7   35 use Bio::Annotation::DBLink;
  7         9  
  7         196  
131              
132 7     7   25 use base qw(Bio::SeqIO);
  7         11  
  7         42142  
133              
134             # Note that a qualifier that exceeds one line (i.e. a long label) will
135             # automatically be quoted regardless:
136             %FTQUAL_NO_QUOTE=(
137             'anticodon'=>1,
138             'citation'=>1,
139             'codon'=>1,
140             'codon_start'=>1,
141             'cons_splice'=>1,
142             'direction'=>1,
143             'evidence'=>1,
144             'label'=>1,
145             'mod_base'=> 1,
146             'number'=> 1,
147             'rpt_type'=> 1,
148             'rpt_unit'=> 1,
149             'transl_except'=> 1,
150             'transl_table'=> 1,
151             'usedin'=> 1,
152             );
153              
154             sub _initialize {
155 31     31   75 my($self,@args) = @_;
156              
157 31         110 $self->SUPER::_initialize(@args);
158             # hash for functions for decoding keys.
159 31         92 $self->{'_func_ftunit_hash'} = {};
160             # sets this to one by default. People can change it
161 31         105 $self->_show_dna(1);
162 31 50       103 if ( ! defined $self->sequence_factory ) {
163 31         131 $self->sequence_factory(Bio::Seq::SeqFactory->new
164             (-verbose => $self->verbose(),
165             -type => 'Bio::Seq::RichSeq'));
166             }
167             }
168              
169             =head2 next_seq
170              
171             Title : next_seq
172             Usage : $seq = $stream->next_seq()
173             Function: returns the next sequence in the stream
174             Returns : Bio::Seq object
175             Args :
176              
177             =cut
178              
179             sub next_seq {
180 25     25 1 86 my ($self,@args) = @_;
181 25         30 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182             $date, $comment, @date_arr);
183              
184 25         150 my ($annotation, %params, @features) =
185             Bio::Annotation::Collection->new();
186              
187 25         78 $line = $self->_readline;
188             # This needs to be before the first eof() test
189              
190 25 100       67 if ( !defined $line ) {
191 1         3 return; # no throws - end of file
192             }
193              
194 24 100       124 if ( $line =~ /^\s+$/ ) {
195 1         4 while ( defined ($line = $self->_readline) ) {
196 1 50       6 $line =~/^\S/ && last;
197             }
198             # return without error if the whole next sequence was just a single
199             # blank line and then eof
200 1 50       3 return unless $line;
201             }
202              
203             # no ID as 1st non-blank line, need short circuit and exit routine
204 24 50       105 $self->throw("EMBL stream with no ID. Not embl in my book")
205             unless $line =~ /^ID\s+\S+/;
206              
207             # At this point we are sure that $line contains an ID header line
208 24         27 my $alphabet;
209 24 100       69 if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210              
211             # New style header (EMBL Release >= 87, after June 2006)
212 10         16 my $topology;
213             my $sv;
214              
215             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216             # This regexp comes from the new2old.pl conversion script, from EBI
217 10 100       56 if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218 7         43 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219             }
220 10 100       28 if (defined $sv) {
221 7         13 $params{'-seq_version'} = $sv;
222 7         13 $params{'-version'} = $sv;
223             }
224              
225 10 50 66     48 if (defined $topology && $topology eq 'circular') {
226 0         0 $params{'-is_circular'} = 1;
227             }
228            
229 10 100       25 if (defined $mol ) {
230 7 100       26 if ($mol =~ /DNA/) {
    50          
    0          
231 6         11 $alphabet = 'dna';
232             } elsif ($mol =~ /RNA/) {
233 1         3 $alphabet = 'rna';
234             } elsif ($mol =~ /AA/) {
235 0         0 $alphabet = 'protein';
236             }
237             }
238             } else {
239            
240             # Old style header (EMBL Release < 87, before June 2006)
241 14 100       109 if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
242 12         55 ($name, $mol, $div) = ($1, $2, $3);
243             }
244            
245 14 100       33 if ($mol) {
246 12 50       32 if ( $mol =~ /circular/ ) {
247 0         0 $params{'-is_circular'} = 1;
248 0         0 $mol =~ s|circular ||;
249             }
250 12 50       29 if (defined $mol ) {
251 12 100       38 if ($mol =~ /DNA/) {
    50          
    0          
252 10         22 $alphabet='dna';
253             } elsif ($mol =~ /RNA/) {
254 2         3 $alphabet='rna';
255             } elsif ($mol =~ /AA/) {
256 0         0 $alphabet='protein';
257             }
258             }
259             }
260             }
261              
262 24 100 66     119 unless( defined $name && length($name) ) {
263 5         10 $name = "unknown_id";
264             }
265              
266             # $self->warn("not parsing upper annotation in EMBL file yet!");
267 24         27 my $buffer = $line;
268 24         29 local $_;
269 24         28 BEFORE_FEATURE_TABLE :
270             my $ncbi_taxid;
271 24         63 until ( !defined $buffer ) {
272 499         404 $_ = $buffer;
273             # Exit at start of Feature table
274 499 100       1093 if ( /^(F[HT]|SQ)/ ) {
275 24 100 100     150 $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
276 24         35 last;
277             }
278             # Description line(s)
279 475 100       701 if (/^DE\s+(\S.*\S)/) {
280 22 100       91 $desc .= $desc ? " $1" : $1;
281             }
282              
283             #accession number
284 475 100 100     1418 if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
285 24         109 my @accs = split(/[; ]+/, $1); # allow space in addition
286             $params{'-accession_number'} = shift @accs
287 24 100       82 unless defined $params{'-accession_number'};
288 24         32 push @{$params{'-secondary_accessions'}}, @accs;
  24         67  
289             }
290              
291             #version number
292 475 100       652 if ( /^SV\s+\S+\.(\d+);?/ ) {
293 6         11 my $sv = $1;
294             #$sv =~ s/\;//;
295 6         13 $params{'-seq_version'} = $sv;
296 6         9 $params{'-version'} = $sv;
297             }
298              
299             #date (NOTE: takes last date line)
300 475 100       663 if ( /^DT\s+(.+)$/ ) {
301 26         55 my $line = $1;
302 26         66 my ($date, $version) = split(' ', $line, 2);
303 26         41 $date =~ tr/,//d; # remove comma if new version
304 26 100       54 if ($version) {
305 24 100       150 if ($version =~ /\(Rel\. (\d+), Created\)/ms ) {
    100          
306 11         88 my $release = Bio::Annotation::SimpleValue->new(
307             -tagname => 'creation_release',
308             -value => $1
309             );
310 11         37 $annotation->add_Annotation($release);
311             } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/ms ) {
312 11         47 my $release = Bio::Annotation::SimpleValue->new(
313             -tagname => 'update_release',
314             -value => $1
315             );
316 11         29 $annotation->add_Annotation($release);
317              
318 11         53 my $update = Bio::Annotation::SimpleValue->new(
319             -tagname => 'update_version',
320             -value => $2
321             );
322 11         27 $annotation->add_Annotation($update);
323             }
324             }
325 26         33 push @{$params{'-dates'}}, $date;
  26         57  
326             }
327              
328             #keywords
329 475 100       1856 if ( /^KW (.*)\S*$/ ) {
    100          
    100          
    100          
    100          
    100          
330 29         154 my @kw = split(/\s*\;\s*/,$1);
331 29         33 push @{$params{'-keywords'}}, @kw;
  29         110  
332             }
333              
334             # Organism name and phylogenetic information
335             elsif (/^O[SC]/) {
336             # pass the accession number so we can give an informative throw message if necessary
337 21         79 my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338 21         50 $params{'-species'}= $species;
339             }
340              
341             # NCBI TaxID Xref
342             elsif (/^OX/) {
343 3 50       19 if (/NCBI_TaxID=(\d+)/) {
344 3         12 $ncbi_taxid=$1;
345             }
346              
347 3         12 my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348 3         9 foreach my $dblink ( @links ) {
349 1         6 $annotation->add_Annotation('dblink',$dblink);
350             }
351             }
352              
353             # References
354             elsif (/^R/) {
355 99         228 my @refs = $self->_read_EMBL_References(\$buffer);
356 99         136 foreach my $ref ( @refs ) {
357 99         235 $annotation->add_Annotation('reference',$ref);
358             }
359             }
360              
361             # DB Xrefs
362             elsif (/^DR/) {
363 5         19 my @links = $self->_read_EMBL_DBLink(\$buffer);
364 5         7 foreach my $dblink ( @links ) {
365 25         36 $annotation->add_Annotation('dblink',$dblink);
366             }
367             }
368              
369             # Comments
370             elsif (/^CC\s+(.*)/) {
371 15         39 $comment .= $1;
372 15         23 $comment .= " ";
373 15         30 while (defined ($_ = $self->_readline) ) {
374 257 100       569 if (/^CC\s+(.*)/) {
375 242         313 $comment .= $1;
376 242         356 $comment .= " ";
377             } else {
378 15         20 last;
379             }
380             }
381 15         103 my $commobj = Bio::Annotation::Comment->new();
382 15         46 $commobj->text($comment);
383 15         31 $annotation->add_Annotation('comment',$commobj);
384 15         15 $comment = "";
385             }
386              
387             # Get next line.
388 475         897 $buffer = $self->_readline;
389             }
390              
391 24         54 while ( defined ($_ = $self->_readline) ) {
392 46 100       126 /^FT\s{3}\w/ && last;
393 24 100       50 /^SQ / && last;
394 22 50       64 /^CO / && last;
395             }
396 24         41 $buffer = $_;
397              
398 24 100 66     125 if (defined($buffer) && $buffer =~ /^FT /) {
399 22         59 until ( !defined ($buffer) ) {
400 256         558 my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401              
402             # process ftunit
403 256         538 my $feat =
404             $ftunit->_generic_seqfeature($self->location_factory(), $name);
405              
406             # add taxon_id from source if available
407             # Notice, this will override what is found in the OX line.
408             # this is by design as this seems to be the official way
409             # of specifying a TaxID
410 256 100 100     705 if ($params{'-species'} && ($feat->primary_tag eq 'source')
      100        
      66        
411             && $feat->has_tag('db_xref')
412             && (! $params{'-species'}->ncbi_taxid())) {
413 12         31 foreach my $tagval ($feat->get_tag_values('db_xref')) {
414 12 50       45 if (index($tagval,"taxon:") == 0) {
415 12         39 $params{'-species'}->ncbi_taxid(substr($tagval,6));
416 12         23 last;
417             }
418             }
419             }
420              
421             # add feature to list of features
422 256         312 push(@features, $feat);
423              
424 256 100       1157 if ( $buffer !~ /^FT/ ) {
425 22         60 last;
426             }
427             }
428             }
429             # Set taxid found in OX line
430 24 100 100     129 if ($params{'-species'} && defined $ncbi_taxid
      100        
431             && (! $params{'-species'}->ncbi_taxid())) {
432 2         5 $params{'-species'}->ncbi_taxid($ncbi_taxid);
433             }
434              
435             # skip comments
436 24   66     143 while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437 20         48 $buffer = $self->_readline();
438             }
439            
440 24 100       107 if ( $buffer =~ /^CO/ ) {
441             # bug#2982
442             # special : create contig as annotation
443 4         11 while ( defined ($buffer) ) {
444 4         14 $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445 4 50 66     27 if ( !$buffer || $buffer !~ /^CO/ ) {
446 4         5 last;
447             }
448             }
449 4   100     13 $buffer ||= '';
450             }
451 24 100       98 if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452 20 100       67 if ( $buffer !~ /^SQ/ ) {
453 1         4 while ( defined ($_ = $self->_readline) ) {
454 0 0       0 /^SQ/ && last;
455             }
456             }
457 20         23 $seqc = "";
458 20         50 while ( defined ($_ = $self->_readline) ) {
459 2924 100       3835 m{^//} && last;
460 2906         2635 $_ = uc($_);
461 2906         21416 s/[^A-Za-z]//g;
462 2906         5507 $seqc .= $_;
463             }
464             }
465 24         88 my $seq = $self->sequence_factory->create
466             (-verbose => $self->verbose(),
467             -division => $div,
468             -seq => $seqc,
469             -desc => $desc,
470             -display_id => $name,
471             -annotation => $annotation,
472             -molecule => $mol,
473             -alphabet => $alphabet,
474             -features => \@features,
475             %params);
476 24         228 return $seq;
477             }
478              
479              
480              
481             =head2 _write_ID_line
482              
483             Title : _write_ID_line
484             Usage : $self->_write_ID_line($seq);
485             Function: Writes the EMBL Release 87 format ID line to the stream, unless
486             : there is a user-supplied ID line generation function in which
487             : case that is used instead.
488             : ( See Bio::SeqIO::embl::_id_generation_function(). )
489             Returns : nothing
490             Args : Bio::Seq object
491              
492             =cut
493              
494             sub _write_ID_line {
495              
496 11     11   18 my ($self, $seq) = @_;
497              
498 11         14 my $id_line;
499             # If there is a user-supplied ID generation function, use it.
500 11 50       35 if ( $self->_id_generation_func ) {
501 0         0 $id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
  0         0  
502             }
503             # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504             else {
505              
506             # The sequence name is supposed to be the primary accession number,
507 11         45 my $name = $seq->accession_number();
508 11 100 66     75 if ( not(defined $name) || $name eq 'unknown') {
509             # but if it is not present, use the sequence ID or the empty string
510 5   100     16 $name = $seq->id() || '';
511             }
512            
513 11 50       46 $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514              
515             # Use the sequence version, or default to 1.
516 11   50     44 my $version = $seq->version() || 1;
517              
518 11         47 my $len = $seq->length();
519              
520             # Taxonomic division.
521 11         18 my $div;
522 11 100 100     157 if ( $seq->can('division') && defined($seq->division) &&
      66        
523             $self->_is_valid_division($seq->division) ) {
524 2         7 $div = $seq->division();
525             } else {
526 9   50     50 $div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527             }
528              
529 11         19 my $mol;
530             # If the molecule type is a valid EMBL type, use it.
531 11 50 100     120 if ( $seq->can('molecule')
    100 66        
      100        
532             && defined($seq->molecule)
533             && $self->_is_valid_molecule_type($seq->molecule)
534             ) {
535 0         0 $mol = $seq->molecule();
536             }
537             # Otherwise, choose unassigned DNA or RNA based on the alphabet.
538             elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539 6         16 my $alphabet =$seq->primary_seq->alphabet;
540 6 50       21 if ($alphabet eq 'dna') {
    0          
    0          
541 6         12 $mol ='unassigned DNA';
542             } elsif ($alphabet eq 'rna') {
543 0         0 $mol='unassigned RNA';
544             } elsif ($alphabet eq 'protein') {
545 0         0 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
546 0         0 $mol='AA'; # AA is not a valid EMBL molecule type.
547             }
548             }
549              
550 11         19 my $topology = 'linear';
551 11 50       38 if ($seq->is_circular) {
552 0         0 $topology = 'circular';
553             }
554              
555 11   100     44 $mol ||= ''; # 'unassigned'; ?
556 11         68 $id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557             }
558 11         77 $self->_print($id_line);
559             }
560              
561             =head2 _is_valid_division
562              
563             Title : _is_valid_division
564             Usage : $self->_is_valid_division($div)
565             Function: tests division code for validity
566             Returns : true if $div is a valid EMBL release 87 taxonomic division.
567             Args : taxonomic division code string
568              
569             =cut
570              
571             sub _is_valid_division {
572 2     2   6 my ($self, $division) = @_;
573              
574 2         30 my %EMBL_divisions = (
575             "PHG" => 1, # Bacteriophage
576             "ENV" => 1, # Environmental Sample
577             "FUN" => 1, # Fungal
578             "HUM" => 1, # Human
579             "INV" => 1, # Invertebrate
580             "MAM" => 1, # Other Mammal
581             "VRT" => 1, # Other Vertebrate
582             "MUS" => 1, # Mus musculus
583             "PLN" => 1, # Plant
584             "PRO" => 1, # Prokaryote
585             "ROD" => 1, # Other Rodent
586             "SYN" => 1, # Synthetic
587             "UNC" => 1, # Unclassified
588             "VRL" => 1 # Viral
589             );
590              
591 2         17 return exists($EMBL_divisions{$division});
592             }
593              
594             =head2 _is_valid_molecule_type
595              
596             Title : _is_valid_molecule_type
597             Usage : $self->_is_valid_molecule_type($mol)
598             Function: tests molecule type for validity
599             Returns : true if $mol is a valid EMBL release 87 molecule type.
600             Args : molecule type string
601              
602             =cut
603              
604             sub _is_valid_molecule_type {
605 2     2   6 my ($self, $moltype) = @_;
606              
607 2         31 my %EMBL_molecule_types = (
608             "genomic DNA" => 1,
609             "genomic RNA" => 1,
610             "mRNA" => 1,
611             "tRNA" => 1,
612             "rRNA" => 1,
613             "snoRNA" => 1,
614             "snRNA" => 1,
615             "scRNA" => 1,
616             "pre-RNA" => 1,
617             "other RNA" => 1,
618             "other DNA" => 1,
619             "unassigned DNA" => 1,
620             "unassigned RNA" => 1
621             );
622              
623 2         41 return exists($EMBL_molecule_types{$moltype});
624             }
625              
626             =head2 write_seq
627              
628             Title : write_seq
629             Usage : $stream->write_seq($seq)
630             Function: writes the $seq object (must be seq) to the stream
631             Returns : 1 for success and undef for error
632             Args : array of 1 to n Bio::SeqI objects
633              
634              
635             =cut
636              
637             sub write_seq {
638 13     13 1 69 my ($self,@seqs) = @_;
639              
640 13         30 foreach my $seq ( @seqs ) {
641 13 50       38 $self->throw("Attempting to write with no seq!") unless defined $seq;
642 13 100 100     116 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643 4 50       15 $self->warn("$seq is not a SeqI compliant sequence object!")
644             if $self->verbose >= 0;
645 4 100 66     23 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646 2         23 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647             }
648             }
649 11   100     50 my $str = $seq->seq || '';
650              
651             # Write the ID line.
652 11         41 $self->_write_ID_line($seq);
653              
654              
655             # Write the accession line if present
656 11         13 my( $acc );
657             {
658 11 50 66     12 if ( my $func = $self->_ac_generation_func ) {
  11 100       38  
    50          
659 0         0 $acc = &{$func}($seq);
  0         0  
660             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661             defined($seq->accession_number) ) {
662 4         12 $acc = $seq->accession_number;
663 4         21 $acc = join("; ", $acc, $seq->get_secondary_accessions);
664             } elsif ( $seq->can('accession_number') ) {
665 7         23 $acc = $seq->accession_number;
666             }
667              
668 11 50       57 if (defined $acc) {
669 11 50       52 $self->_print("AC $acc;\n",
670             "XX\n") || return;
671             }
672             }
673              
674             # Date lines
675 11         20 my $switch=0;
676 11 100       64 if ( $seq->can('get_dates') ) {
677 4         14 my @dates = $seq->get_dates();
678 4         9 my $ct = 1;
679 4         7 my $date_flag = 0;
680 4         21 my ($cr) = $seq->annotation->get_Annotations("creation_release");
681 4         12 my ($ur) = $seq->annotation->get_Annotations("update_release");
682 4         12 my ($uv) = $seq->annotation->get_Annotations("update_version");
683              
684 4 0 33     15 unless ($cr && $ur && $ur) {
      33        
685 4         6 $date_flag = 1;
686             }
687              
688 4         10 foreach my $dt (@dates) {
689 0 0       0 if (!$date_flag) {
690 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
691             $dt." (Rel. ".($cr->value()).", Created)",
692             '\s+|$',80) if $ct == 1;
693 0 0       0 $self->_write_line_EMBL_regex("DT ","DT ",
694             $dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695             '\s+|$',80) if $ct == 2;
696             } else { # other formats?
697 0         0 $self->_write_line_EMBL_regex("DT ","DT ",
698             $dt,'\s+|$',80);
699             }
700 0         0 $switch =1;
701 0         0 $ct++;
702             }
703 4 50       11 if ($switch == 1) {
704 0 0       0 $self->_print("XX\n") || return;
705             }
706             }
707              
708             # Description lines
709 11 50       44 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
710 11 50       33 $self->_print( "XX\n") || return;
711              
712             # if there, write the kw line
713             {
714 11         17 my( $kw );
  11         914  
715 11 50       47 if ( my $func = $self->_kw_generation_func ) {
    100          
716 0         0 $kw = &{$func}($seq);
  0         0  
717             } elsif ( $seq->can('keywords') ) {
718 4         19 $kw = $seq->keywords;
719             }
720 11 100       38 if (defined $kw) {
721 4 50       12 $self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
722 4 50       12 $self->_print( "XX\n") || return;
723             }
724             }
725              
726             # Organism lines
727              
728 11 100 100     73 if ($seq->can('species') && (my $spec = $seq->species)) {
729 6         31 my @class = $spec->classification();
730 6         11 shift @class; # get rid of species name. Some embl files include
731             # the species name in the OC lines, but this seems
732             # more like an error than something we need to
733             # emulate
734 6         22 my $OS = $spec->scientific_name;
735 6 50       22 if ($spec->common_name) {
736 0         0 $OS .= ' ('.$spec->common_name.')';
737             }
738 6 50       29 $self->_print("OS $OS\n") || return;
739 6         33 my $OC = join('; ', reverse(@class)) .'.';
740 6 50       22 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
741 6 50       26 if ($spec->organelle) {
742 0 0       0 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
743             }
744 6         20 my $ncbi_taxid = $spec->ncbi_taxid;
745 6 100       26 if ($ncbi_taxid) {
746 2 50       7 $self->_print("OX NCBI_TaxID=$ncbi_taxid\n") || return;
747             }
748 6 50       18 $self->_print("XX\n") || return;
749             }
750             # Reference lines
751 11         20 my $t = 1;
752 11 100 66     80 if ( $seq->can('annotation') && defined $seq->annotation ) {
753 9         26 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754 6 50       24 $self->_print( "RN [$t]\n") || return;
755              
756             # Having no RP line is legal, but we need both
757             # start and end for a valid location.
758 6 50       23 if ($ref->comment) {
759 0 0       0 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
760             }
761 6         23 my $start = $ref->start;
762 6         23 my $end = $ref->end;
763 6 50 33     33 if ($start and $end) {
    0 0        
764 6 50       28 $self->_print( "RP $start-$end\n") || return;
765             } elsif ($start or $end) {
766 0         0 $self->throw("Both start and end are needed for a valid RP line.".
767             " Got: start='$start' end='$end'");
768             }
769              
770 6 50       21 if (my $med = $ref->medline) {
771 0 0       0 $self->_print( "RX MEDLINE; $med.\n") || return;
772             }
773 6 50       17 if (my $pm = $ref->pubmed) {
774 0 0       0 $self->_print( "RX PUBMED; $pm.\n") || return;
775             }
776 6         19 my $authors = $ref->authors;
777 6         100 $authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778              
779 6 50       31 $self->_write_line_EMBL_regex("RA ", "RA ",
780             $authors . ";",
781             '\s+|$', 80) || return; #'
782              
783             # If there is no title to the reference, it appears
784             # as a single semi-colon. All titles must end in
785             # a semi-colon.
786 6   100     22 my $ref_title = $ref->title || '';
787 6         52 $ref_title =~ s/[\s;]*$/;/;
788 6 50       19 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
789 6 50       22 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
790 6 50       14 $self->_print("XX\n") || return;
791 6         19 $t++;
792             }
793              
794             # DB Xref lines
795 9 100       30 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796 2         4 for my $dr (@db_xref) {
797 43         70 my $db_name = $dr->database;
798 43         66 my $prim = $dr->primary_id;
799              
800 43   50     57 my $opt = $dr->optional_id || '';
801 43 50       82 my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802 43 50       55 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
803             }
804 2 50       6 $self->_print("XX\n") || return;
805             }
806            
807             # Comment lines
808 9         28 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809 4 50       16 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
810 4 50       11 $self->_print("XX\n") || return;
811             }
812             }
813             # "\\s\+\|\$"
814              
815             ## FEATURE TABLE
816              
817 11 50       35 $self->_print("FH Key Location/Qualifiers\n") || return;
818 11 50       28 $self->_print("FH\n") || return;
819              
820 11 100       88 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821 11 100       38 if ($feats[0]) {
822 6 50       19 if ( defined $self->_post_sort ) {
823             # we need to read things into an array.
824             # Process. Sort them. Print 'em
825              
826 0         0 my $post_sort_func = $self->_post_sort();
827 0         0 my @fth;
828              
829 0         0 foreach my $sf ( @feats ) {
830 0         0 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831             }
832              
833 0         0 @fth = sort { &$post_sort_func($a,$b) } @fth;
  0         0  
834              
835 0         0 foreach my $fth ( @fth ) {
836 0 0       0 $self->_print_EMBL_FTHelper($fth) || return;
837             }
838             } else {
839             # not post sorted. And so we can print as we get them.
840             # lower memory load...
841              
842 6         12 foreach my $sf ( @feats ) {
843 38         95 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844 38         47 foreach my $fth ( @fth ) {
845 38 50       84 if ( $fth->key eq 'CONTIG') {
846 0         0 $self->_show_dna(0);
847             }
848 38 50       92 $self->_print_EMBL_FTHelper($fth) || return;
849             }
850             }
851             }
852             }
853              
854 11 50       35 if ( $self->_show_dna() == 0 ) {
855 0 0       0 $self->_print( "//\n") || return;
856 0         0 return;
857             }
858 11 50       36 $self->_print( "XX\n") || return;
859              
860             # finished printing features.
861              
862             # print contig if present : bug#2982
863 11 100 66     87 if ( $seq->can('annotation') && defined $seq->annotation) {
864 9         27 foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865 0 0       0 if ($ctg->value) {
866 0 0       0 $self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867             '[,]|$', 80) || return;
868             }
869             }
870             }
871             # print sequence lines only if sequence is present! bug#2982
872 11 100       36 if (length($str)) {
873 8         42 $str =~ tr/A-Z/a-z/;
874              
875             # Count each nucleotide
876 8         25 my $alen = $str =~ tr/a/a/;
877 8         22 my $clen = $str =~ tr/c/c/;
878 8         27 my $glen = $str =~ tr/g/g/;
879 8         21 my $tlen = $str =~ tr/t/t/;
880            
881 8         20 my $len = $seq->length();
882 8         17 my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883 8 50       24 if ( $olen < 0 ) {
884 0         0 $self->warn("Weird. More atgc than bases. Problem!");
885             }
886            
887 8 50       56 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
888              
889 8         16 my $nuc = 60; # Number of nucleotides per line
890 8         14 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891 8         34 my $out_pat = 'A11' x 6; # Pattern for packing a line
892 8         11 my $length = length($str);
893            
894             # Calculate the number of nucleotides which fit on whole lines
895 8         37 my $whole = int($length / $nuc) * $nuc;
896              
897             # Print the whole lines
898 8         11 my( $i );
899 8         26 for ($i = 0; $i < $whole; $i += $nuc) {
900 162         687 my $blocks = pack $out_pat,
901             unpack $whole_pat,
902             substr($str, $i, $nuc);
903 162 50       660 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
904             }
905              
906             # Print the last line
907 8 50       33 if (my $last = substr($str, $i)) {
908 8         14 my $last_len = length($last);
909 8         36 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910 8         72 my $blocks = pack $out_pat,
911             unpack($last_pat, $last);
912 8 50       55 $self->_print(sprintf(" $blocks%9d\n", $length)) ||
913             return; # Add the length to the end
914             }
915             }
916            
917 11 50       32 $self->_print( "//\n") || return;
918            
919 11 50 33     34 $self->flush if $self->_flush_on_write && defined $self->_fh;
920             }
921 11         55 return 1;
922             }
923              
924             =head2 _print_EMBL_FTHelper
925              
926             Title : _print_EMBL_FTHelper
927             Usage :
928             Function: Internal function
929             Returns : 1 if writing suceeded, otherwise undef
930             Args :
931              
932              
933             =cut
934              
935             sub _print_EMBL_FTHelper {
936 38     38   47 my ($self,$fth) = @_;
937              
938 38 50 33     231 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939 0         0 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940             }
941              
942              
943             #$self->_print( "FH Key Location/Qualifiers\n");
944             #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945             # let
946 38 50       72 if ( $fth->key eq 'CONTIG' ) {
947 0 0       0 $self->_print("XX\n") || return;
948 0 0       0 $self->_write_line_EMBL_regex("CO ",
949             "CO ",$fth->loc,
950             '\,|$',80) || return; #'
951 0         0 return 1;
952             }
953 38 50       77 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954             "FT ",$fth->loc,
955             '\,|$',80) || return; #'
956 38         50 foreach my $tag ( keys %{$fth->field} ) {
  38         88  
957 108 50       196 if ( ! defined $fth->field->{$tag} ) {
958 0         0 next;
959             }
960 108         215 foreach my $value ( @{$fth->field->{$tag}} ) {
  108         147  
961 139         175 $value =~ s/\"/\"\"/g;
962 139 50 66     407 if ($value eq "_no_value") {
    50          
963 0 0       0 $self->_write_line_EMBL_regex("FT ",
964             "FT ",
965             "/$tag",'.|$',80) || return; #'
966             }
967             # there are almost 3x more quoted qualifier values and they
968             # are more common too so we take quoted ones first
969             #
970             # Long qualifiers, that will be line wrapped, are always quoted
971             elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972 139 100       492 my $pat = $value =~ /\s+/ ? '\s+|\-|$' : '.|\-|$';
973 139 50       367 $self->_write_line_EMBL_regex("FT ",
974             "FT ",
975             "/$tag=\"$value\"",$pat,80) || return;
976             } else {
977 0 0       0 $self->_write_line_EMBL_regex("FT ",
978             "FT ",
979             "/$tag=$value",'.|$',80) || return; #'
980             }
981             }
982             }
983              
984 38         229 return 1;
985             }
986              
987              
988              
989             =head2 _read_EMBL_Contig()
990              
991             Title : _read_EMBL_Contig
992             Usage :
993             Function: convert CO lines into annotations
994             Returns :
995             Args :
996              
997             =cut
998              
999             sub _read_EMBL_Contig {
1000 4     4   5 my ($self, $buffer) = @_;
1001 4         5 my @ret;
1002 4 50       16 if ( $$buffer !~ /^CO/ ) {
1003 0         0 warn("Not parsing line '$$buffer' which maybe important");
1004             }
1005 4         11 $self->_pushback($$buffer);
1006 4         9 while ( defined ($_ = $self->_readline) ) {
1007 356 100       749 /^C/ || last;
1008 353 50       756 /^CO\s+(.*)/ && do {
1009 353         861 push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010             -value => $1);
1011             };
1012             }
1013 4         8 $$buffer = $_;
1014 4         54 return @ret;
1015              
1016             }
1017             #'
1018             =head2 _read_EMBL_References
1019              
1020             Title : _read_EMBL_References
1021             Usage :
1022             Function: Reads references from EMBL format. Internal function really
1023             Example :
1024             Returns :
1025             Args :
1026              
1027              
1028             =cut
1029              
1030             sub _read_EMBL_References {
1031 99     99   107 my ($self,$buffer) = @_;
1032 99         65 my (@refs);
1033              
1034             # assume things are starting with RN
1035              
1036 99 50       233 if ( $$buffer !~ /^RN/ ) {
1037 0         0 warn("Not parsing line '$$buffer' which maybe important");
1038             }
1039 99         88 my $b1;
1040             my $b2;
1041 0         0 my $title;
1042 0         0 my $loc;
1043 0         0 my $au;
1044 0         0 my $med;
1045 0         0 my $pm;
1046 0         0 my $com;
1047              
1048 99         169 while ( defined ($_ = $self->_readline) ) {
1049 727 100       1411 /^R/ || last;
1050 628 100       965 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
  40         87  
  40         68  
1051 628 100       806 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
  55         96  
1052 628 100       762 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
  10         18  
1053 628 100       880 /^RA (.*)/ && do {
1054 193         357 $au = $self->_concatenate_lines($au,$1); next;
  193         320  
1055             };
1056 435 100       656 /^RT (.*)/ && do {
1057 164         258 $title = $self->_concatenate_lines($title,$1); next;
  164         294  
1058             };
1059 271 100       459 /^RL (.*)/ && do {
1060 156         223 $loc = $self->_concatenate_lines($loc,$1); next;
  156         231  
1061             };
1062 115 100       274 /^RC (.*)/ && do {
1063 3         11 $com = $self->_concatenate_lines($com,$1); next;
  3         8  
1064             };
1065             }
1066              
1067 99         312 my $ref = Bio::Annotation::Reference->new();
1068 99         385 $au =~ s/;\s*$//g;
1069 99         206 $title =~ s/;\s*$//g;
1070              
1071 99         216 $ref->start($b1);
1072 99         168 $ref->end($b2);
1073 99         159 $ref->authors($au);
1074 99         163 $ref->title($title);
1075 99         150 $ref->location($loc);
1076 99         144 $ref->medline($med);
1077 99         154 $ref->comment($com);
1078 99         169 $ref->pubmed($pm);
1079              
1080 99         103 push(@refs,$ref);
1081 99         125 $$buffer = $_;
1082              
1083 99         178 return @refs;
1084             }
1085              
1086             =head2 _read_EMBL_Species
1087              
1088             Title : _read_EMBL_Species
1089             Usage :
1090             Function: Reads the EMBL Organism species and classification
1091             lines.
1092             Example :
1093             Returns : A Bio::Species object
1094             Args : a reference to the current line buffer, accession number
1095              
1096             =cut
1097              
1098             sub _read_EMBL_Species {
1099 21     21   39 my( $self, $buffer, $acc ) = @_;
1100 21         27 my $org;
1101              
1102 21         27 $_ = $$buffer;
1103 21         34 my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104 21   66     76 while (defined( $_ ||= $self->_readline )) {
1105 92 100       332 if (/^OS\s+(.+)/) {
    100          
    50          
1106 21 50       79 $sci_name .= ($sci_name) ? ' '.$1 : $1;
1107             } elsif (s/^OC\s+(.+)$//) {
1108 50         104 $class_lines .= $1;
1109             } elsif (/^OG\s+(.*)/) {
1110 0         0 $org = $1;
1111             } else {
1112 21         35 last;
1113             }
1114              
1115 71         194 $_ = undef; # Empty $_ to trigger read of next line
1116             }
1117              
1118             # $$buffer = $_;
1119 21         71 $self->_pushback($_);
1120            
1121 21         32 $sci_name =~ s{\.$}{};
1122 21 50       103 $sci_name || return;
1123              
1124             # Convert data in classification lines into classification array.
1125             # only split on ';' or '.' so that classification that is 2 or more words
1126             # will still get matched, use map() to remove trailing/leading/intervening
1127             # spaces
1128 21         188 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
  224         308  
  224         204  
  224         182  
  224         252  
1129              
1130             # do we have a genus?
1131 21         41 my $possible_genus = $class[-1];
1132 21 50       66 $possible_genus .= "|$class[-2]" if $class[-2];
1133 21 50       626 if ($sci_name =~ /^($possible_genus)/) {
1134 21         49 $genus = $1;
1135 21         213 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1136             } else {
1137 0         0 $species = $sci_name;
1138             }
1139              
1140             # Don't make a species object if it is "Unknown" or "None"
1141 21 50       63 if ($genus) {
1142 21 50       122 return if $genus =~ /^(Unknown|None)$/i;
1143             }
1144              
1145             # is this organism of rank species or is it lower?
1146             # (doesn't catch everything, but at least the guess isn't dangerous)
1147 21 50       98 if ($species =~ /subsp\.|var\./) {
1148 0         0 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1149             }
1150              
1151             # sometimes things have common name in brackets, like
1152             # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153             # name bit. Probably dangerous if real scientific species name ends in
1154             # bracketed bit.
1155 21 50       59 unless ($class[-1] eq 'Viruses') {
1156 21         81 ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157 21 100       82 $sci_name =~ s/\s+\(.+\)$// if $common;
1158             }
1159              
1160             # Bio::Species array needs array in Species -> Kingdom direction
1161 21 50       52 unless ($class[-1] eq $sci_name) {
1162 21         38 push(@class, $sci_name);
1163             }
1164 21         33 @class = reverse @class;
1165              
1166             # do minimal sanity checks before we hand off to Bio::Species which won't
1167             # be able to give informative throw messages if it has to throw because
1168             # of problems here
1169 21 50       44 $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170 21         28 my %names;
1171 21         86 foreach my $i (0..$#class) {
1172 245         187 my $name = $class[$i];
1173 245         336 $names{$name}++;
1174             # this code breaks examples like: Xenopus (Silurana) tropicalis
1175             # commenting out, see bug 3158
1176            
1177             #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178             # $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179             #}
1180             }
1181 21         155 my $make = Bio::Species->new();
1182 21         61 $make->scientific_name($sci_name);
1183 21         64 $make->classification(@class);
1184 21 50       75 unless ($class[-1] eq 'Viruses') {
1185 21 50       100 $make->genus($genus) if $genus;
1186 21 100       60 $make->species($species) if $species;
1187 21 50       46 $make->sub_species($sub_species) if $sub_species;
1188 21 100       108 $make->common_name($common) if $common;
1189             }
1190 21 50       50 $make->organelle($org) if $org;
1191 21         113 return $make;
1192             }
1193              
1194             =head2 _read_EMBL_DBLink
1195              
1196             Title : _read_EMBL_DBLink
1197             Usage :
1198             Function: Reads the EMBL database cross reference ("DR") lines
1199             Example :
1200             Returns : A list of Bio::Annotation::DBLink objects
1201             Args :
1202              
1203             =cut
1204              
1205             sub _read_EMBL_DBLink {
1206 5     5   8 my( $self,$buffer ) = @_;
1207 5         5 my( @db_link );
1208              
1209 5         5 $_ = $$buffer;
1210 5   66     24 while (defined( $_ ||= $self->_readline )) {
1211 30 100       125 if ( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212 25         68 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213 25         76 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214             -primary_id => $prim_id,
1215             -optional_id => $sec_id);
1216              
1217 25         39 push(@db_link, $link);
1218             } else {
1219 5         9 last;
1220             }
1221 25         72 $_ = undef; # Empty $_ to trigger read of next line
1222             }
1223              
1224 5         13 $$buffer = $_;
1225 5         16 return @db_link;
1226             }
1227              
1228             =head2 _read_EMBL_TaxID_DBLink
1229              
1230             Title : _read_EMBL_TaxID_DBLink
1231             Usage :
1232             Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233             Example :
1234             Returns : A list of Bio::Annotation::DBLink objects
1235             Args :
1236              
1237             =cut
1238              
1239             sub _read_EMBL_TaxID_DBLink {
1240 3     3   5 my( $self,$buffer ) = @_;
1241 3         4 my( @db_link );
1242              
1243 3         4 $_ = $$buffer;
1244 3   66     14 while (defined( $_ ||= $self->_readline )) {
1245 4 100       17 if ( /^OX (\S+)=(\d+);$/ ) {
1246 1         4 my ($databse, $prim_id) = ($1,$2);
1247 1         15 my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248             -primary_id => $prim_id,);
1249 1         3 push(@db_link, $link);
1250             } else {
1251 3         6 last;
1252             }
1253 1         7 $_ = undef; # Empty $_ to trigger read of next line
1254             }
1255              
1256 3         5 $$buffer = $_;
1257 3         7 return @db_link;
1258             }
1259              
1260             =head2 _filehandle
1261              
1262             Title : _filehandle
1263             Usage : $obj->_filehandle($newval)
1264             Function:
1265             Example :
1266             Returns : value of _filehandle
1267             Args : newvalue (optional)
1268              
1269              
1270             =cut
1271              
1272             sub _filehandle{
1273 0     0   0 my ($obj,$value) = @_;
1274 0 0       0 if ( defined $value) {
1275 0         0 $obj->{'_filehandle'} = $value;
1276             }
1277 0         0 return $obj->{'_filehandle'};
1278              
1279             }
1280              
1281             =head2 _read_FTHelper_EMBL
1282              
1283             Title : _read_FTHelper_EMBL
1284             Usage : _read_FTHelper_EMBL($buffer)
1285             Function: reads the next FT key line
1286             Example :
1287             Returns : Bio::SeqIO::FTHelper object
1288             Args : filehandle and reference to a scalar
1289              
1290              
1291             =cut
1292              
1293             sub _read_FTHelper_EMBL {
1294 256     256   220 my ($self,$buffer) = @_;
1295              
1296 256         194 my ($key, # The key of the feature
1297             $loc, # The location line from the feature
1298             @qual, # An arrray of lines making up the qualifiers
1299             );
1300              
1301 256 50       859 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
    0          
1302 256         420 $key = $1;
1303 256         313 $loc = $2;
1304             # Read all the lines up to the next feature
1305 256         539 while ( defined($_ = $self->_readline) ) {
1306 2038 100       9833 if (/^FT(\s+)(.+?)\s*$/) {
1307             # Lines inside features are preceeded by 19 spaces
1308             # A new feature is preceeded by 3 spaces
1309 2016 100       2899 if (length($1) > 4) {
1310             # Add to qualifiers if we're in the qualifiers
1311 1782 100       2172 if (@qual) {
    100          
1312 1523         3437 push(@qual, $2);
1313             }
1314             # Start the qualifier list if it's the first qualifier
1315             elsif (substr($2, 0, 1) eq '/') {
1316 254         692 @qual = ($2);
1317             }
1318             # We're still in the location line, so append to location
1319             else {
1320 5         17 $loc .= $2;
1321             }
1322             } else {
1323             # We've reached the start of the next feature
1324 234         292 last;
1325             }
1326             } else {
1327             # We're at the end of the feature table
1328 22         40 last;
1329             }
1330             }
1331             } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332 0         0 $key = 'CONTIG';
1333 0         0 $loc = $1;
1334             # Read all the lines up to the next feature
1335 0         0 while ( defined($_ = $self->_readline) ) {
1336 0 0       0 if (/^CO\s+(\S+)\s*$/) {
1337 0         0 $loc .= $1;
1338             } else {
1339             # We've reached the start of the next feature
1340 0         0 last;
1341             }
1342             }
1343             } else {
1344             # No feature key
1345 0         0 return;
1346             }
1347              
1348             # Put the first line of the next feature into the buffer
1349 256         268 $$buffer = $_;
1350              
1351             # Make the new FTHelper object
1352 256         607 my $out = Bio::SeqIO::FTHelper->new();
1353 256         495 $out->verbose($self->verbose());
1354 256         458 $out->key($key);
1355 256         429 $out->loc($loc);
1356              
1357             # Now parse and add any qualifiers. (@qual is kept
1358             # intact to provide informative error messages.)
1359 256         218 my $last_unquoted_qualifier;
1360             QUAL:
1361 256         500 for (my $i = 0; $i < @qual; $i++) {
1362 1091         895 my $data = $qual[$i];
1363 1091         779 my ( $qualifier, $value );
1364            
1365 1091 50       3869 unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.*))?})) {
1366 0 0       0 if ( defined $last_unquoted_qualifier ) {
1367             # handle case of unquoted multiline - read up everything until the next qualifier
1368 0   0     0 do {
1369             # Protein sequence translations need to be joined without spaces,
1370             # other qualifiers need those.
1371 0 0       0 $value .= ' ' if $qualifier ne "translation";
1372 0         0 $value .= $data;
1373             } while defined($data = $qual[++$i]) && $data !~ m[^/];
1374 0         0 $i--;
1375 0         0 $out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376 0         0 $last_unquoted_qualifier = undef;
1377 0         0 next QUAL;
1378             } else {
1379 0         0 $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380             . join('', map "$_\n", @qual));
1381             }
1382             }
1383 1091 50       1423 $qualifier = '' if not defined $qualifier;
1384              
1385 1091         760 $last_unquoted_qualifier = undef;
1386 1091 50       1081 if (defined $value) {
1387             # Do we have a quoted value?
1388 1091 100       1315 if (substr($value, 0, 1) eq '"') {
1389             # Keep adding to value until we find the trailing quote
1390             # and the quotes are balanced
1391             QUOTES:
1392 971   66     3450 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393 686         458 $i++;
1394 686         484 my $next = $qual[$i];
1395 686 50       817 if (!defined($next)) {
1396 0         0 $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397             "\nAdding quote to close...".
1398             "Check sequence quality!");
1399 0         0 $value .= '"';
1400 0         0 last QUOTES;
1401             }
1402              
1403             # Protein sequence translations need to be joined without spaces,
1404             # other qualifiers need those.
1405 686 100       663 if ($qualifier eq "translation") {
1406 569         1436 $value .= $next;
1407             } else {
1408 117         522 $value .= " $next";
1409             }
1410             }
1411             # Trim leading and trailing quotes
1412 971         7023 $value =~ s/^"|"$//g;
1413             # Undouble internal quotes
1414 971         1014 $value =~ s/""/"/g; #"
1415             } else {
1416 120         136 $last_unquoted_qualifier = $qualifier;
1417             }
1418             } else {
1419 0         0 $value = '_no_value';
1420             }
1421              
1422             # Store the qualifier
1423 1091   100     1681 $out->field->{$qualifier} ||= [];
1424 1091         814 push(@{$out->field->{$qualifier}},$value);
  1091         1438  
1425             }
1426              
1427 256         564 return $out;
1428             }
1429              
1430             =head2 _write_line_EMBL
1431              
1432             Title : _write_line_EMBL
1433             Usage :
1434             Function: internal function
1435             Example :
1436             Returns : 1 if writing suceeded, else undef
1437             Args :
1438              
1439              
1440             =cut
1441              
1442             sub _write_line_EMBL {
1443 0     0   0 my ($self,$pre1,$pre2,$line,$length) = @_;
1444              
1445 0 0       0 $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446 0         0 my $subl = $length - length $pre2;
1447 0         0 my $linel = length $line;
1448 0         0 my $i;
1449              
1450 0         0 my $sub = substr($line,0,$length - length $pre1);
1451              
1452 0 0       0 $self->_print( "$pre1$sub\n") || return;
1453              
1454 0         0 for ($i= ($length - length $pre1);$i < $linel;) {
1455 0         0 $sub = substr($line,$i,($subl));
1456 0 0       0 $self->_print( "$pre2$sub\n") || return;
1457 0         0 $i += $subl;
1458             }
1459              
1460 0         0 return 1;
1461             }
1462              
1463             =head2 _write_line_EMBL_regex
1464              
1465             Title : _write_line_EMBL_regex
1466             Usage :
1467             Function: internal function for writing lines of specified
1468             length, with different first and the next line
1469             left hand headers and split at specific points in the
1470             text
1471             Example :
1472             Returns : nothing
1473             Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474              
1475              
1476             =cut
1477              
1478             sub _write_line_EMBL_regex {
1479 263     263   363 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480              
1481             #print STDOUT "Going to print with $line!\n";
1482              
1483 263 50       390 $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484              
1485 263         294 my $subl = $length - (length $pre1) -1 ;
1486 263         205 my( @lines );
1487              
1488 263         370 CHUNK: while($line) {
1489 401         558 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1490 401 50       4130 if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491 401         890 my $l = $1.$2;
1492 401 100       587 $l =~ s/#/ /g # remove word wrap protection char '#'
1493             if $pre1 eq "RA ";
1494 401         501 my $newl = $3;
1495 401         483 $line = substr($line,length($l));
1496             # be strict about not padding spaces according to
1497             # genbank format
1498 401         1056 $l =~ s/\s+$//;
1499 401 50       668 next CHUNK if ($l eq '');
1500 401         424 push(@lines, $l);
1501 401         1076 next CHUNK;
1502             }
1503             }
1504             # if we get here none of the patterns matched $subl or less chars
1505 0         0 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1506             "of $subl chars or less - this tag won't print right");
1507             # insert a space char to prevent infinite loops
1508 0         0 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1509             }
1510 263         293 my $s = shift @lines;
1511 263 100 50     952 ($self->_print("$pre1$s\n") || return) if $s;
1512 263         389 foreach my $s ( @lines ) {
1513 145 50       332 $self->_print("$pre2$s\n") || return;
1514             }
1515              
1516 263         692 return 1;
1517             }
1518              
1519             =head2 _post_sort
1520              
1521             Title : _post_sort
1522             Usage : $obj->_post_sort($newval)
1523             Function:
1524             Returns : value of _post_sort
1525             Args : newvalue (optional)
1526              
1527              
1528             =cut
1529              
1530             sub _post_sort{
1531 6     6   12 my $obj = shift;
1532 6 50       25 if ( @_ ) {
1533 0         0 my $value = shift;
1534 0         0 $obj->{'_post_sort'} = $value;
1535             }
1536 6         17 return $obj->{'_post_sort'};
1537              
1538             }
1539              
1540             =head2 _show_dna
1541              
1542             Title : _show_dna
1543             Usage : $obj->_show_dna($newval)
1544             Function:
1545             Returns : value of _show_dna
1546             Args : newvalue (optional)
1547              
1548              
1549             =cut
1550              
1551             sub _show_dna{
1552 42     42   62 my $obj = shift;
1553 42 100       223 if ( @_ ) {
1554 31         45 my $value = shift;
1555 31         65 $obj->{'_show_dna'} = $value;
1556             }
1557 42         79 return $obj->{'_show_dna'};
1558              
1559             }
1560              
1561             =head2 _id_generation_func
1562              
1563             Title : _id_generation_func
1564             Usage : $obj->_id_generation_func($newval)
1565             Function:
1566             Returns : value of _id_generation_func
1567             Args : newvalue (optional)
1568              
1569              
1570             =cut
1571              
1572             sub _id_generation_func{
1573 11     11   13 my $obj = shift;
1574 11 50       31 if ( @_ ) {
1575 0         0 my $value = shift;
1576 0         0 $obj->{'_id_generation_func'} = $value;
1577             }
1578 11         28 return $obj->{'_id_generation_func'};
1579              
1580             }
1581              
1582             =head2 _ac_generation_func
1583              
1584             Title : _ac_generation_func
1585             Usage : $obj->_ac_generation_func($newval)
1586             Function:
1587             Returns : value of _ac_generation_func
1588             Args : newvalue (optional)
1589              
1590              
1591             =cut
1592              
1593             sub _ac_generation_func{
1594 11     11   18 my $obj = shift;
1595 11 50       31 if ( @_ ) {
1596 0         0 my $value = shift;
1597 0         0 $obj->{'_ac_generation_func'} = $value;
1598             }
1599 11         153 return $obj->{'_ac_generation_func'};
1600              
1601             }
1602              
1603             =head2 _sv_generation_func
1604              
1605             Title : _sv_generation_func
1606             Usage : $obj->_sv_generation_func($newval)
1607             Function:
1608             Returns : value of _sv_generation_func
1609             Args : newvalue (optional)
1610              
1611              
1612             =cut
1613              
1614             sub _sv_generation_func{
1615 0     0   0 my $obj = shift;
1616 0 0       0 if ( @_ ) {
1617 0         0 my $value = shift;
1618 0         0 $obj->{'_sv_generation_func'} = $value;
1619             }
1620 0         0 return $obj->{'_sv_generation_func'};
1621              
1622             }
1623              
1624             =head2 _kw_generation_func
1625              
1626             Title : _kw_generation_func
1627             Usage : $obj->_kw_generation_func($newval)
1628             Function:
1629             Returns : value of _kw_generation_func
1630             Args : newvalue (optional)
1631              
1632              
1633             =cut
1634              
1635             sub _kw_generation_func{
1636 11     11   18 my $obj = shift;
1637 11 50       34 if ( @_ ) {
1638 0         0 my $value = shift;
1639 0         0 $obj->{'_kw_generation_func'} = $value;
1640             }
1641 11         89 return $obj->{'_kw_generation_func'};
1642              
1643             }
1644              
1645             1;