File Coverage

Bio/SeqIO/genbank.pm
Criterion Covered Total %
statement 614 723 84.9
branch 308 404 76.2
condition 74 111 66.6
subroutine 25 26 96.1
pod 2 2 100.0
total 1023 1266 80.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::genbank
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Bioperl project bioperl-l(at)bioperl.org
7             #
8             # Copyright Elia Stupka and contributors see AUTHORS section
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::genbank - GenBank sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler:
22              
23             $stream = Bio::SeqIO->new(-file => $filename,
24             -format => 'GenBank');
25              
26             while ( my $seq = $stream->next_seq ) {
27             # do something with $seq
28             }
29              
30              
31             =head1 DESCRIPTION
32              
33             This object can transform Bio::Seq objects to and from GenBank flat
34             file databases.
35              
36             There is some flexibility here about how to write GenBank output
37             that is not fully documented.
38              
39             =head2 Optional functions
40              
41             =over 3
42              
43             =item _show_dna()
44              
45             (output only) shows the dna or not
46              
47             =item _post_sort()
48              
49             (output only) provides a sorting func which is applied to the FTHelpers
50             before printing
51              
52             =item _id_generation_func()
53              
54             This is function which is called as
55              
56             print "ID ", $func($seq), "\n";
57              
58             To generate the ID line. If it is not there, it generates a sensible ID
59             line using a number of tools.
60              
61             If you want to output annotations in Genbank format they need to be
62             stored in a Bio::Annotation::Collection object which is accessible
63             through the Bio::SeqI interface method L.
64              
65             The following are the names of the keys which are pulled from a
66             L object:
67              
68             reference - Should contain Bio::Annotation::Reference objects
69             comment - Should contain Bio::Annotation::Comment objects
70             dblink - Should contain a Bio::Annotation::DBLink object
71             segment - Should contain a Bio::Annotation::SimpleValue object
72             origin - Should contain a Bio::Annotation::SimpleValue object
73             wgs - Should contain a Bio::Annotation::SimpleValue object
74              
75             =back
76              
77             =head1 Where does the data go?
78              
79             Data parsed in Bio::SeqIO::genbank is stored in a variety of data
80             fields in the sequence object that is returned. Here is a partial list
81             of fields.
82              
83             Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
84             the top level object which defines a function called NAME() which
85             stores this information.
86              
87             Items listed as Annotation 'NAME' tell you the data is stored the
88             associated Bio::AnnotationCollectionI object which is associated with
89             Bio::Seq objects. If it is explicitly requested that no annotations
90             should be stored when parsing a record of course they will not be
91             available when you try and get them. If you are having this problem
92             look at the type of SeqBuilder that is being used to construct your
93             sequence object.
94              
95             Comments Annotation 'comment'
96             References Annotation 'reference'
97             Segment Annotation 'segment'
98             Origin Annotation 'origin'
99             Dbsource Annotation 'dblink'
100              
101             Accessions PrimarySeq accession_number()
102             Secondary accessions RichSeq get_secondary_accessions()
103             GI number PrimarySeq primary_id()
104             LOCUS PrimarySeq display_id()
105             Keywords RichSeq get_keywords()
106             Dates RichSeq get_dates()
107             Molecule RichSeq molecule()
108             Seq Version RichSeq seq_version()
109             PID RichSeq pid()
110             Division RichSeq division()
111             Features Seq get_SeqFeatures()
112             Alphabet PrimarySeq alphabet()
113             Definition PrimarySeq description() or desc()
114             Version PrimarySeq version()
115              
116             Sequence PrimarySeq seq()
117              
118             There is more information in the Feature-Annotation HOWTO about each
119             field and how it is mapped to the Sequence object
120             L.
121              
122             =head1 FEEDBACK
123              
124             =head2 Mailing Lists
125              
126             User feedback is an integral part of the evolution of this and other
127             Bioperl modules. Send your comments and suggestions preferably to one
128             of the Bioperl mailing lists. Your participation is much appreciated.
129              
130             bioperl-l@bioperl.org - General discussion
131             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
132              
133             =head2 Support
134              
135             Please direct usage questions or support issues to the mailing list:
136              
137             I
138              
139             rather than to the module maintainer directly. Many experienced and
140             reponsive experts will be able look at the problem and quickly
141             address it. Please include a thorough description of the problem
142             with code and data examples if at all possible.
143              
144             =head2 Reporting Bugs
145              
146             Report bugs to the Bioperl bug tracking system to help us keep track
147             the bugs and their resolution. Bug reports can be submitted via the web:
148              
149             https://github.com/bioperl/bioperl-live/issues
150              
151             =head1 AUTHOR - Bioperl Project
152              
153             bioperl-l at bioperl.org
154              
155             Original author Elia Stupka, elia -at- tigem.it
156              
157             =head1 CONTRIBUTORS
158              
159             Ewan Birney birney at ebi.ac.uk
160             Jason Stajich jason at bioperl.org
161             Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
162             Lincoln Stein lstein at cshl.org
163             Heikki Lehvaslaiho, heikki at ebi.ac.uk
164             Hilmar Lapp, hlapp at gmx.net
165             Donald G. Jackson, donald.jackson at bms.com
166             James Wasmuth, james.wasmuth at ed.ac.uk
167             Brian Osborne, bosborne at alum.mit.edu
168             Chris Fields, cjfields at bioperl dot org
169              
170             =head1 APPENDIX
171              
172             The rest of the documentation details each of the object
173             methods. Internal methods are usually preceded with a _
174              
175             =cut
176              
177             # Let the code begin...
178              
179             package Bio::SeqIO::genbank;
180 16     16   758 use strict;
  16         28  
  16         478  
181              
182 16     16   3800 use Bio::SeqIO::FTHelper;
  16         45  
  16         391  
183 16     16   92 use Bio::SeqFeature::Generic;
  16         23  
  16         278  
184 16     16   5687 use Bio::Species;
  16         38  
  16         407  
185 16     16   90 use Bio::Seq::SeqFactory;
  16         214  
  16         327  
186 16     16   68 use Bio::Annotation::Collection;
  16         25  
  16         405  
187 16     16   4520 use Bio::Annotation::Comment;
  16         32  
  16         427  
188 16     16   4014 use Bio::Annotation::Reference;
  16         32  
  16         436  
189 16     16   93 use Bio::Annotation::DBLink;
  16         29  
  16         335  
190              
191 16     16   69 use base qw(Bio::SeqIO);
  16         26  
  16         115043  
192              
193             # Note that a qualifier that exceeds one line (i.e. a long label) will
194             # automatically be quoted regardless:
195              
196             our $FTQUAL_LINE_LENGTH = 60;
197              
198             our %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
199             anticodon citation
200             codon codon_start
201             cons_splice direction
202             evidence label
203             mod_base number
204             rpt_type rpt_unit
205             transl_except transl_table
206             usedin
207             );
208              
209             our %DBSOURCE = map {$_ => 1} qw(
210             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
211             TIGR GO InterPro Pfam PROSITE SGD GermOnline
212             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
213             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
214             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
215             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
216             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
217             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
218             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
219             PhotoList Gramene WormBase WormPep Genew ZFIN
220             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
221             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
222             DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
223             Project);
224              
225             our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
226             mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT);
227              
228             our %VALID_ALPHABET = (
229             'bp' => 'dna',
230             'aa' => 'protein',
231             'rc' => '' # rc = release candidate; file has no sequences
232             );
233              
234             sub _initialize {
235 115     115   392 my($self, @args) = @_;
236              
237 115         551 $self->SUPER::_initialize(@args);
238             # hash for functions for decoding keys.
239 115         437 $self->{'_func_ftunit_hash'} = {};
240 115         465 $self->_show_dna(1); # sets this to one by default. People can change it
241 115 50       563 if ( not defined $self->sequence_factory ) {
242 115         407 $self->sequence_factory
243             (Bio::Seq::SeqFactory->new(-verbose => $self->verbose,
244             -type => 'Bio::Seq::RichSeq'));
245             }
246             }
247              
248             =head2 next_seq
249              
250             Title : next_seq
251             Usage : $seq = $stream->next_seq()
252             Function: returns the next sequence in the stream
253             Returns : Bio::Seq object
254             Args :
255              
256             =cut
257              
258             sub next_seq {
259 117     117 1 5914 my ($self, @args) = @_;
260 117         280 my %args = @args;
261 117         399 my $builder = $self->sequence_builder;
262 117         280 my $seq;
263             my %params;
264              
265             RECORDSTART:
266 117         162 while (1) {
267 119         162 my $buffer;
268 119         502 my ( @acc, @features );
269 119         0 my ( $display_id, $annotation );
270 119         0 my $species;
271              
272             # initialize; we may come here because of starting over
273 119         210 @features = ();
274 119         224 $annotation = undef;
275 119         207 @acc = ();
276 119         163 $species = undef;
277 119         366 %params = ( -verbose => $self->verbose ); # reset hash
278 119         661 local ($/) = "\n";
279 119         551 while ( defined( $buffer = $self->_readline ) ) {
280 189 100       716 last if index( $buffer, 'LOCUS ' ) == 0;
281             }
282 119 100       397 return unless defined $buffer; # end of file
283 113 50       706 $buffer =~ /^LOCUS\s+(\S.*)$/o
284             or $self->throw( "GenBank stream with bad LOCUS line. "
285             . "Not GenBank in my book. Got '$buffer'");
286 113         793 my @tokens = split( ' ', $1 );
287              
288             # this is important to have the id for display in e.g. FTHelper,
289             # otherwise you won't know which entry caused an error
290 113         249 $display_id = shift @tokens;
291 113         296 $params{'-display_id'} = $display_id;
292              
293             # may still be useful if we don't want the seq
294 113         268 my $seqlength = shift @tokens;
295 113 50       461 if ( exists $VALID_ALPHABET{$seqlength} ) {
296             # moved one token too far. No locus name?
297 0         0 $self->warn( "Bad LOCUS name? Changing [$params{'-display_id'}] "
298             . "to 'unknown' and length to '$display_id'"
299             );
300 0         0 $params{'-display_id'} = 'unknown';
301 0         0 $params{'-length'} = $display_id;
302              
303             # add token back...
304 0         0 unshift @tokens, $seqlength;
305             }
306             else {
307 113         279 $params{'-length'} = $seqlength;
308             }
309              
310             # the alphabet of the entry
311             # shouldn't assign alphabet unless one
312             # is specifically designated (such as for rc files)
313 113         287 my $alphabet = lc( shift @tokens );
314             $params{'-alphabet'} =
315             ( exists $VALID_ALPHABET{$alphabet} )
316 113 50       481 ? $VALID_ALPHABET{$alphabet}
317             : $self->warn("Unknown alphabet: $alphabet");
318              
319             # for aa there is usually no 'molecule' (mRNA etc)
320 113 100       352 if ( $params{'-alphabet'} eq 'protein' ) {
321 10         28 $params{'-molecule'} = 'PRT';
322             }
323             else {
324 103         217 $params{'-molecule'} = shift(@tokens);
325             }
326              
327             # take care of lower case issues
328 113 100 66     711 if ( $params{'-molecule'} eq 'dna' or $params{'-molecule'} eq 'rna' ) {
329 4         14 $params{'-molecule'} = uc $params{'-molecule'};
330             }
331             $self->debug( "Unrecognized molecule type: " . $params{'-molecule'} )
332 113 50       488 if not exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
333              
334 113         226 my $circ = shift @tokens;
335 113 100       280 if ( $circ eq 'circular' ) {
336 8         27 $params{'-is_circular'} = 1;
337 8         26 $params{'-division'} = shift @tokens;
338             }
339             else {
340             # 'linear' or 'circular' may actually be omitted altogether
341 105 100       385 $params{'-division'} =
342             ( CORE::length($circ) == 3 ) ? $circ : shift @tokens;
343             }
344 113         355 my $date = join( ' ', @tokens ); # we lump together the rest
345              
346             # this is per request bug #1513
347             # we can handle:
348             # 9-10-2003
349             # 9-10-03
350             # 09-10-2003
351             # 09-10-03
352 113 100       949 if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
353 107 50       338 if ( length($date) < 11 ) {
354             # improperly formatted date
355             # But we'll be nice and fix it for them
356 0         0 my ( $d, $m, $y ) = ( $2, $3, $4 );
357 0 0       0 if ( length($d) == 1 ) {
358 0         0 $d = "0$d";
359             }
360              
361             # guess the century here
362 0 0       0 if ( length($y) == 2 ) {
363 0 0       0 if ( $y > 60 ) { # arbitrarily guess that '60' means 1960
364 0         0 $y = "19$y";
365             }
366             else {
367 0         0 $y = "20$y";
368             }
369 0         0 $self->warn( "Date was malformed, guessing the "
370             . "century for $date to be $y\n"
371             );
372             }
373 0         0 $params{'-dates'} = [ join( '-', $d, $m, $y ) ];
374             }
375             else {
376 107         313 $params{'-dates'} = [$date];
377             }
378             }
379              
380             # set them all at once
381 113         703 $builder->add_slot_value(%params);
382 113         369 %params = ();
383              
384             # parse the rest if desired, otherwise start over
385 113 100       368 if ( not $builder->want_object ) {
386 2         5 $builder->make_object;
387 2         8 next RECORDSTART;
388             }
389              
390             # set up annotation depending on what the builder wants
391 111 100       407 if ( $builder->want_slot('annotation') ) {
392 106         787 $annotation = Bio::Annotation::Collection->new;
393             }
394              
395 111         365 $buffer = $self->_readline;
396 111         466 while ( defined( my $line = $buffer ) ) {
397             # Description line(s)
398 739 100       2126 if ($line =~ /^DEFINITION\s+(\S.*\S)/) {
399 110         366 my @desc = ($1);
400 110         332 while ( defined( $line = $self->_readline ) ) {
401 172 100       614 if ($line =~ /^\s+(.*)/) {
402 62         173 push( @desc, $1 );
403 62         161 next;
404             }
405 110         189 last;
406             }
407 110         688 $builder->add_slot_value( -desc => join( ' ', @desc ) );
408              
409             # we'll continue right here because DEFINITION
410             # always comes at the top of the entry
411 110         227 $buffer = $line;
412             }
413              
414             # accession number (there can be multiple accessions)
415 739 100       5739 if ($line =~ /^ACCESSION\s+(\S.*\S)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
416 110         555 push( @acc, split( /\s+/, $1 ) );
417 110         359 while ( defined( $line = $self->_readline ) ) {
418 113 100       448 if ($line =~ /^\s+(.*)/) {
419 3         18 push( @acc, split( /\s+/, $1 ) );
420 3         9 next;
421             }
422 110         219 last;
423             }
424 110         187 $buffer = $line;
425 110         293 next;
426             }
427              
428             # PID
429             elsif ($line =~ /^PID\s+(\S+)/) {
430 1         5 $params{'-pid'} = $1;
431             }
432              
433             # Version number
434             elsif ($line =~ /^VERSION\s+(\S.+)$/) {
435 91         402 my ( $acc, $gi ) = split( ' ', $1 );
436 91 100       496 if ( $acc =~ /^\w+\.(\d+)/ ) {
437 87         310 $params{'-version'} = $1;
438 87         248 $params{'-seq_version'} = $1;
439             }
440 91 100 66     598 if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
441 89         310 $params{'-primary_id'} = substr( $gi, 3 );
442             }
443             }
444              
445             # Keywords
446             elsif ($line =~ /^KEYWORDS\s+(\S.*)/) {
447 107         500 my @kw = split( /\s*\;\s*/, $1 );
448 107         327 while ( defined( $line = $self->_readline ) ) {
449 138         299 chomp $line;
450 138 100       445 if ($line =~ /^\s+(.*)/) {
451 31         1311 push( @kw, split( /\s*\;\s*/, $1 ) );
452 31         80 next;
453             }
454 107         177 last;
455             }
456              
457 107 50       584 @kw && $kw[-1] =~ s/\.$//;
458 107         333 $params{'-keywords'} = \@kw;
459 107         221 $buffer = $line;
460 107         274 next;
461             }
462              
463             # Organism name and phylogenetic information
464             elsif ($line =~ /^SOURCE\s+\S/) {
465 106 100       398 if ( $builder->want_slot('species') ) {
466 101         461 $species = $self->_read_GenBank_Species( \$buffer );
467 101         771 $builder->add_slot_value( -species => $species );
468             }
469             else {
470 5         11 while ( defined( $buffer = $self->_readline ) ) {
471 18 100       44 last if substr( $buffer, 0, 1 ) ne ' ';
472             }
473             }
474 106         604 next;
475             }
476              
477             # References
478             elsif ($line =~ /^REFERENCE\s+\S/) {
479 109 100       384 if ($annotation) {
480 100         525 my @refs = $self->_read_GenBank_References( \$buffer );
481 100         266 foreach my $ref (@refs) {
482 417         1122 $annotation->add_Annotation( 'reference', $ref );
483             }
484             }
485             else {
486 9         16 while ( defined( $buffer = $self->_readline ) ) {
487 47 100       99 last if substr( $buffer, 0, 1 ) ne ' ';
488             }
489             }
490 109         325 next;
491             }
492              
493             # Project
494             elsif ($line =~ /^PROJECT\s+(\S.*)/) {
495 2 50       9 if ($annotation) {
496 2         14 my $project =
497             Bio::Annotation::SimpleValue->new( -value => $1 );
498 2         9 $annotation->add_Annotation( 'project', $project );
499             }
500             }
501              
502             # Comments may be plain text or Structured Comments.
503             # Structured Comments are made up of tag/value pairs and have beginning
504             # and end delimiters like ##*-Data-START## and ##*-Data-END##
505             elsif ($line =~ /^COMMENT\s+(\S.*)/) {
506 60 100       170 if ($annotation) {
507 58         196 my $comment = $1;
508 58         207 while ( defined( $line = $self->_readline ) ) {
509 481 100       1106 last if ($line =~ /^\S/);
510 423         970 $comment .= $line;
511             }
512 58         703 $comment =~ s/ +/ /g;
513             # Structured Comment, do not remove returns in the tabular section
514 58 100       264 if ( my ( $text, $table )= $comment
515             =~ /([^#]*)(##\S+Data-START##.+?##\S+Data-END##)/is
516             ) {
517 2 100       19 $text =~ s/\n/ /g if $text;
518 2         15 $table =~ s/START##/START##\n/;
519 2         25 $table =~ s/^\s+//gm;
520 2         15 $comment = $text . "\n" . $table;
521             }
522             # Plain text, remove returns
523             else {
524 56         348 $comment =~ s/\n/ /g;
525             }
526 58         647 $annotation->add_Annotation(
527             'comment',
528             Bio::Annotation::Comment->new(
529             -text => $comment,
530             -tagname => 'comment'
531             )
532             );
533 58         216 $buffer = $line;
534             }
535             else {
536 2         6 while ( defined( $buffer = $self->_readline ) ) {
537 7 100       18 last if substr( $buffer, 0, 1 ) ne ' ';
538             }
539             }
540 60         164 next;
541             }
542              
543             # Corresponding Genbank nucleotide id, Genpept only
544             elsif ($line =~ /^DB(?:SOURCE|LINK)\s+(\S.+)/) {
545 20 50       76 if ($annotation) {
546 20         58 my $dbsource = $1;
547 20         76 while ( defined( $line = $self->_readline ) ) {
548 85 100       228 last if ($line =~ /^\S/);
549 65         171 $dbsource .= $line;
550             }
551              
552             # deal with UniProKB dbsources
553 20 100       114 if ( $dbsource =~
554             s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n//
555             ) {
556 5         61 $annotation->add_Annotation(
557             'dblink',
558             Bio::Annotation::DBLink->new(
559             -primary_id => $2,
560             -database => $1,
561             -tagname => 'dblink'
562             )
563             );
564 5 50       60 if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
565 5         44 $annotation->add_Annotation(
566             'swissprot_dates',
567             Bio::Annotation::SimpleValue->new(
568             -tagname => 'date_created',
569             -value => $1
570             )
571             );
572             }
573 5         91 while ( $dbsource =~
574             s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
575             ) {
576 5         35 $annotation->add_Annotation(
577             'swissprot_dates',
578             Bio::Annotation::SimpleValue->new(
579             -tagname => 'date_updated',
580             -value => $2
581             )
582             );
583             }
584 5         48 $dbsource =~ s/\n/ /g;
585 5 50       78 if ( $dbsource =~
    0          
586             s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/
587             ) {
588             # will use $i to determine even or odd
589             # for swissprot the accessions are paired
590 5         14 my $i = 0;
591 5         51 for my $dbsrc ( split( /,\s+/, $1 ) ) {
592 49 50 66     292 if ( $dbsrc =~ /(\S+)\.(\d+)/
593             or $dbsrc =~ /(\S+)/
594             ) {
595 49         143 my ( $id, $version ) = ( $1, $2 );
596 49 100       104 $version = '' unless defined $version;
597 49 100       157 my $db = ( $id =~ /^\d\S{3}/ ) ? 'PDB'
    100          
598             : ( $i++ % 2 ) ? 'GenPept'
599             : 'GenBank';
600              
601 49         179 $annotation->add_Annotation(
602             'dblink',
603             Bio::Annotation::DBLink->new(
604             -primary_id => $id,
605             -version => $version,
606             -database => $db,
607             -tagname => 'dblink'
608             )
609             );
610             }
611             }
612             }
613             elsif ( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
614             # download screwed up and ncbi didn't put acc in for gi numbers
615 0         0 my $i = 0;
616 0         0 for my $id ( split( /\,\s+/, $1 ) ) {
617 0         0 my ( $acc, $db );
618 0 0       0 if ( $id =~ /gi:\s+(\d+)/ ) {
    0          
619 0         0 $acc = $1;
620 0 0       0 $db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
621             }
622             elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
623 0         0 $acc = $1;
624 0         0 $db = 'PDB';
625             }
626             else {
627 0         0 $acc = $id;
628 0         0 $db = '';
629             }
630 0         0 $annotation->add_Annotation(
631             'dblink',
632             Bio::Annotation::DBLink->new(
633             -primary_id => $acc,
634             -database => $db,
635             -tagname => 'dblink'
636             )
637             );
638             }
639             }
640             else {
641 0         0 $self->debug("Cannot match $dbsource\n");
642             }
643 5 50       102 if ( $dbsource =~ s/xrefs\s+
644             \(non\-sequence\s+databases\):\s+
645             ((?:\S+,\s+)+\S+)//x
646             ) {
647 5         83 for my $id ( split( /\,\s+/, $1 ) ) {
648 86         143 my $db;
649              
650             # this is because GenBank dropped the spaces!!!
651             # I'm sure we're not going to get this right
652             ##if ( $id =~ s/^://i ) {
653             ## $db = $1;
654             ##}
655 86         186 $db = substr( $id, 0, index( $id, ':' ) );
656 86 100       207 if ( not exists $DBSOURCE{$db} ) {
657 16         27 $db = ''; # do we want 'GenBank' here?
658             }
659 86         140 $id = substr( $id, index( $id, ':' ) + 1 );
660 86         266 $annotation->add_Annotation(
661             'dblink',
662             Bio::Annotation::DBLink->new(
663             -primary_id => $id,
664             -database => $db,
665             -tagname => 'dblink'
666             )
667             );
668             }
669             }
670             }
671             else {
672 15 100       133 if ( $dbsource =~
    100          
    50          
673             /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/
674             ) {
675 4         21 my ( $db, $id, $version ) = ( $1, $2, $3 );
676 4   100     40 $annotation->add_Annotation(
677             'dblink',
678             Bio::Annotation::DBLink->new(
679             -primary_id => $id,
680             -version => $version,
681             -database => $db || 'GenBank',
682             -tagname => 'dblink'
683             )
684             );
685             }
686             elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
687 1         4 my ( $db, $id ) = ( $1, $2 );
688 1   50     11 $annotation->add_Annotation(
689             'dblink',
690             Bio::Annotation::DBLink->new(
691             -primary_id => $id,
692             -database => $db || 'GenBank',
693             -tagname => 'dblink'
694             )
695             );
696             }
697             elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
698 10         18 my ( $db, $version );
699 10         21 my @ids = ();
700 10 100       41 if ( $2 eq ':' ) {
701 9         18 $db = $1;
702              
703             # Genbank 192 release notes say this: "The second
704             # field can consist of multiple comma-separated
705             # identifiers, if a sequence record has multiple
706             # DBLINK cross-references of a given type."
707             # For example: DBLINK Project:100,200,300"
708 9         45 @ids = split( /,/, $3 );
709             }
710             else {
711 1         4 ( $db, $version ) = ( 'GenBank', $3 );
712 1         3 $ids[0] = $1;
713             }
714              
715 10         26 foreach my $id (@ids) {
716 12         104 $annotation->add_Annotation(
717             'dblink',
718             Bio::Annotation::DBLink->new(
719             -primary_id => $id,
720             -version => $version,
721             -database => $db,
722             -tagname => 'dblink'
723             )
724             );
725             }
726             }
727             else {
728 0         0 $self->warn(
729             "Unrecognized DBSOURCE data: $dbsource\n");
730             }
731             }
732 20         88 $buffer = $line;
733             }
734             else {
735 0         0 while ( defined( $buffer = $self->_readline ) ) {
736 0 0       0 last if substr( $buffer, 0, 1 ) ne ' ';
737             }
738             }
739 20         61 next;
740             }
741              
742             # Exit at start of Feature table, or start of sequence
743 227 100       1174 if ($line =~ /^(FEATURES|ORIGIN)/) {
744 111         206 my $trap;
745             }
746 227 100       908 last if ($line =~ /^(FEATURES|ORIGIN)/);
747              
748             # Get next line and loop again
749 116         335 $buffer = $self->_readline;
750             }
751 111 50       467 return unless defined $buffer;
752              
753             # add them all at once for efficiency
754 111         965 $builder->add_slot_value(
755             -accession_number => shift(@acc),
756             -secondary_accessions => \@acc,
757             %params
758             );
759 111 100       561 $builder->add_slot_value( -annotation => $annotation ) if $annotation;
760 111         328 %params = (); # reset before possible re-use to avoid setting twice
761              
762             # start over if we don't want to continue with this entry
763 111 50       432 if ( not $builder->want_object ) {
764 0         0 $builder->make_object;
765 0         0 next RECORDSTART;
766             }
767              
768             # some "minimal" formats may not necessarily have a feature table
769 111 100 66     445 if ( $builder->want_slot('features')
      100        
770             and defined $buffer
771             and $buffer =~ /^FEATURES/o
772             ) {
773             # need to read the first line of the feature table
774 104         394 $buffer = $self->_readline;
775              
776             # DO NOT read lines in the while condition -- this is done
777             # as a side effect in _read_FTHelper_GenBank!
778              
779             # part of new circular spec:
780             # commented out for now until kinks worked out
781             #my $sourceEnd = 0;
782             #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
783              
784 104         335 while ( defined $buffer ) {
785             # check immediately -- not at the end of the loop
786             # note: GenPept entries obviously do not have a BASE line
787 8384 100       58514 last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
788              
789             # slurp in one feature at a time -- at return, the start of
790             # the next feature will have been read already, so we need
791             # to pass a reference, and the called method must set this
792             # to the last line read before returning
793 8280         19335 my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );
794              
795             # implement new circular spec: features that cross the origin are now
796             # seamless instead of being 2 separate joined features
797             # commented out until kinks get worked out
798             #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
799             #&& $sourceEnd == $2 && $3 == 1) {
800             #my $start = $1;
801             #my $end = $2 + $4;
802             #$ftunit->{'loc'} = "$start..$end";
803             #}
804              
805             # fix suggested by James Diggans
806              
807 8280 50       13220 if ( not defined $ftunit ) {
808             # GRRRR. We have fallen over. Try to recover
809             $self->warn( "Unexpected error in feature table for "
810 0         0 . $params{'-display_id'}
811             . " Skipping feature, attempting to recover" );
812              
813 0 0 0     0 unless ( $buffer =~ /^\s{5,5}\S+/o
814             or $buffer =~ /^\S+/o
815             ) {
816 0         0 $buffer = $self->_readline;
817             }
818 0         0 next; # back to reading FTHelpers
819             }
820              
821             # process ftunit
822 8280         20579 my $feat =
823             $ftunit->_generic_seqfeature( $self->location_factory,
824             $display_id );
825              
826             # add taxon_id from source if available
827 8280 100 100     21167 if ( $species
      100        
      66        
      66        
828             and $feat->primary_tag eq 'source'
829             and $feat->has_tag('db_xref')
830             and ( not $species->ncbi_taxid
831             or ( $species->ncbi_taxid
832             and $species->ncbi_taxid =~ /^list/ ) )
833             ) {
834 95         352 foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
835 96 100       368 if ( index( $tagval, "taxon:" ) == 0 ) {
836 95         470 $species->ncbi_taxid( substr( $tagval, 6 ) );
837 95         308 last;
838             }
839             }
840             }
841              
842             # add feature to list of features
843 8280         28555 push( @features, $feat );
844             }
845 104         628 $builder->add_slot_value( -features => \@features );
846             }
847              
848 111 50       380 if ( defined $buffer ) {
849             # CONTIG lines: TODO, this needs to be cleaned up
850 111 100       1321 if ($buffer =~/^CONTIG\s+(.*)/o) {
    100          
    100          
851 7         24 my $ctg = $1;
852 7         29 while ( defined( $buffer = $self->_readline ) ) {
853 29 100       174 last if $buffer =~ m{^ORIGIN|//}o;
854 22         68 $buffer =~ s/\s+(.*)/$1/;
855 22         45 $ctg .= $buffer;
856             }
857 7 50       27 if ($ctg) {
858 7         71 $annotation->add_Annotation(
859             Bio::Annotation::SimpleValue->new(
860             -tagname => 'contig',
861             -value => $ctg
862             )
863             );
864             }
865             }
866             elsif ($buffer =~ /^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines
867 1         6 while ( $buffer =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines
868 3         8 chomp $buffer;
869 3         20 $annotation->add_Annotation(
870             Bio::Annotation::SimpleValue->new(
871             -value => $buffer,
872             -tagname => lc $1
873             )
874             );
875 3         12 $buffer = $self->_readline;
876             }
877             }
878             elsif ( $buffer !~ m{^ORIGIN|//}o ) { # advance to the sequence, if any
879 53         209 while ( defined( $buffer = $self->_readline ) ) {
880 148 100       548 last if $buffer =~ m{^(ORIGIN|//)};
881             }
882             }
883             }
884 111 50       469 if ( not $builder->want_object ) {
885 0         0 $builder->make_object; # implicit end-of-object
886 0         0 next RECORDSTART;
887             }
888 111 100 33     386 if ( $builder->want_slot('seq') ) {
    50          
889             # the fact that we want a sequence does not necessarily mean that
890             # there also is a sequence ...
891 101 100 100     954 if ( defined $buffer and $buffer =~ s/^ORIGIN\s+// ) {
892 93 100 66     574 if ( $annotation and length($buffer) > 0 ) {
893 1         13 $annotation->add_Annotation(
894             'origin',
895             Bio::Annotation::SimpleValue->new(
896             -tagname => 'origin',
897             -value => $buffer
898             )
899             );
900             }
901 93         226 my $seqc = '';
902 93         324 while ( defined( $buffer = $self->_readline ) ) {
903 32735 100       48518 last if $buffer =~ m{^//};
904 32643         38116 $buffer = uc $buffer;
905 32643         214194 $buffer =~ s/[^A-Za-z]//g;
906 32643         64543 $seqc .= $buffer;
907             }
908              
909 93         390 $builder->add_slot_value( -seq => $seqc );
910             }
911             }
912             elsif ( defined($buffer) and ( substr( $buffer, 0, 2 ) ne '//' ) ) {
913             # advance to the end of the record
914 10         26 while ( defined( $buffer = $self->_readline ) ) {
915 172 100       347 last if substr( $buffer, 0, 2 ) eq '//';
916             }
917             }
918              
919             # Unlikely, but maybe the sequence is so weird that we don't want it
920             # anymore. We don't want to return undef if the stream's not exhausted
921             # yet.
922 111         466 $seq = $builder->make_object;
923 111 50       333 next RECORDSTART unless $seq;
924 111         2408 last RECORDSTART;
925             } # end while RECORDSTART
926              
927 111         872 return $seq;
928             }
929              
930             =head2 write_seq
931              
932             Title : write_seq
933             Usage : $stream->write_seq($seq)
934             Function: writes the $seq object (must be seq) to the stream
935             Returns : 1 for success and 0 for error
936             Args : array of 1 to n Bio::SeqI objects
937              
938             =cut
939              
940             sub write_seq {
941 25     25 1 198 my ($self,@seqs) = @_;
942              
943 25         77 foreach my $seq ( @seqs ) {
944 25 50       115 $self->throw("Attempting to write with no seq!") unless defined $seq;
945              
946 25 50 33     228 if ( not ref $seq or not $seq->isa('Bio::SeqI') ) {
947 0         0 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
948             }
949              
950 25         124 my $str = $seq->seq;
951 25         205 my $len = $seq->length;
952 25         126 my $alpha = $seq->alphabet;
953              
954 25         122 my ($div, $mol);
955 25 100 66     268 if ( not $seq->can('division')
956             or not defined($div = $seq->division)
957             ) {
958 2         6 $div = 'UNK';
959             }
960 25 100 66     318 if ( not $seq->can('molecule')
961             or not defined ($mol = $seq->molecule)
962             ) {
963 2   100     10 $mol = $alpha || 'DNA';
964             }
965              
966 25 100       117 my $circular = ($seq->is_circular) ? 'circular' : 'linear ';
967              
968 25         153 local($^W) = 0; # suppressing warnings about uninitialized fields.
969              
970 25         55 my $temp_line;
971 25 50       91 if ( $self->_id_generation_func ) {
972 0         0 $temp_line = &{$self->_id_generation_func}($seq);
  0         0  
973             }
974             else {
975 25         57 my $date = '';
976 25 100       168 if ( $seq->can('get_dates') ) {
977 23         94 ($date) = $seq->get_dates;
978             }
979              
980 25 50       115 $self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]")
981             if $seq->display_id =~ /\s/;
982              
983 25 100       154 my @data = ( lc($alpha) eq 'protein' ) ? ('aa', '', '') : ('bp', '', $mol);
984 25         114 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n",
985             'LOCUS', $seq->id, $len,
986             @data, $circular, $div, $date);
987             }
988              
989 25         177 $self->_print($temp_line);
990 25         112 $self->_write_line_GenBank_regex("DEFINITION ", " ",
991             $seq->desc, "\\s\+\|\$",80);
992              
993             # if there, write the accession line
994              
995 25 50       95 if ( $self->_ac_generation_func ) {
996 0         0 $temp_line = &{$self->_ac_generation_func}($seq);
  0         0  
997 0         0 $self->_print("ACCESSION $temp_line\n");
998             }
999             else {
1000 25         64 my @acc = ();
1001 25         113 push @acc, $seq->accession_number;
1002 25 100       142 if ( $seq->isa('Bio::Seq::RichSeqI') ) {
1003 23         99 push @acc, $seq->get_secondary_accessions;
1004             }
1005 25         135 $self->_print("ACCESSION ", join(" ", @acc), "\n");
1006             # otherwise - cannot print
1007             }
1008              
1009             # if PID defined, print it
1010 25 50 66     219 if ($seq->isa('Bio::Seq::RichSeqI') and $seq->pid) {
1011 0         0 $self->_print("PID ", $seq->pid, "\n");
1012             }
1013              
1014             # if there, write the version line
1015 25 50 100     95 if ( defined $self->_sv_generation_func ) {
    100          
1016 0         0 $temp_line = &{$self->_sv_generation_func}($seq);
  0         0  
1017 0 0       0 if ( $temp_line ) {
1018 0         0 $self->_print("VERSION $temp_line\n");
1019             }
1020             }
1021             elsif ($seq->isa('Bio::Seq::RichSeqI') and defined($seq->seq_version)) {
1022 18         72 my $id = $seq->primary_id; # this may be a GI number
1023 18 50 33     212 my $data = (defined $id and $id =~ /^\d+$/) ? " GI:$id" : "";
1024 18         97 $self->_print("VERSION ",
1025             $seq->accession_number, ".",
1026             $seq->seq_version, $data, "\n");
1027             }
1028              
1029             # if there, write the PROJECT line
1030 25         88 for my $proj ( $seq->annotation->get_Annotations('project') ) {
1031 2         12 $self->_print("PROJECT ".$proj->value."\n");
1032             }
1033              
1034             # if there, write the DBSOURCE line
1035 25         85 foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
1036 2         9 my ($db, $id) = ($ref->database, $ref->primary_id);
1037 2 100       7 my $prefix = $db eq 'Project' ? 'DBLINK' : 'DBSOURCE';
1038 2 100       9 my $text = $db eq 'GenBank' ? ''
    50          
1039             : $db eq 'Project' ? "$db:$id"
1040             : "$db accession $id";
1041 2         11 $self->_print(sprintf ("%-11s %s\n", $prefix, $text));
1042             }
1043              
1044             # if there, write the keywords line
1045 25 50       84 if ( defined $self->_kw_generation_func ) {
    100          
1046 0         0 $temp_line = &{$self->_kw_generation_func}($seq);
  0         0  
1047 0         0 $self->_print("KEYWORDS $temp_line\n");
1048             }
1049             elsif ( $seq->can('keywords') ) {
1050 23         112 my $kw = $seq->keywords;
1051 23 50       101 $kw .= '.' if ( $kw !~ /\.$/ );
1052 23         108 $self->_print("KEYWORDS $kw\n");
1053             }
1054              
1055             # SEGMENT if it exists
1056 25         78 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
1057 0         0 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
1058             $ref->value));
1059             }
1060              
1061             # Organism lines
1062 25 100       87 if (my $spec = $seq->species) {
1063 22 50       218 my ($on, $sn, $cn) = ($spec->can('organelle') ? $spec->organelle : '',
1064             $spec->scientific_name,
1065             $spec->common_name);
1066 22         57 my @classification;
1067 22 50       122 if ($spec->isa('Bio::Species')) {
1068 22         101 @classification = $spec->classification;
1069 22         45 shift @classification;
1070             }
1071             else {
1072             # Bio::Taxon should have a DB handle of some type attached, so
1073             # derive the classification from that
1074 0         0 my $node = $spec;
1075 0         0 while ($node) {
1076 0   0     0 $node = $node->ancestor || last;
1077 0         0 unshift @classification, $node->node_name;
1078             #$node eq $root && last;
1079             }
1080 0         0 @classification = reverse @classification;
1081             }
1082 22 50       87 my $abname = $spec->name('abbreviated') ? # from genbank file
1083             $spec->name('abbreviated')->[0] : $sn;
1084 22 100       80 my $sl = $on ? "$on " : '';
1085 22 100       80 $sl .= $cn ? "$abname ($cn)" : $abname;
1086              
1087 22         87 $self->_write_line_GenBank_regex("SOURCE ", ' 'x12, $sl, "\\s\+\|\$", 80);
1088 22         66 $self->_print(" ORGANISM ", $spec->scientific_name, "\n");
1089 22         110 my $OC = join('; ', reverse @classification) . '.';
1090 22         72 $self->_write_line_GenBank_regex(' 'x12,' 'x12, $OC, "\\s\+\|\$", 80);
1091             }
1092              
1093             # Reference lines
1094 25         57 my $count = 1;
1095 25         85 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
1096 47         112 $temp_line = "REFERENCE $count";
1097 47 50       198 if ($ref->start) {
    0          
1098 47 100       133 $temp_line .= sprintf (" (%s %d to %d)",
1099             ($seq->alphabet() eq "protein" ?
1100             "residues" : "bases"),
1101             $ref->start, $ref->end);
1102             }
1103             elsif ($ref->gb_reference) {
1104 0         0 $temp_line .= sprintf (" (%s)", $ref->gb_reference);
1105             }
1106 47         283 $self->_print("$temp_line\n");
1107 47         143 $self->_write_line_GenBank_regex(" AUTHORS ", ' 'x12,
1108             $ref->authors, "\\s\+\|\$", 80);
1109 47 100       152 $self->_write_line_GenBank_regex(" CONSRTM ", ' 'x12,
1110             $ref->consortium, "\\s\+\|\$", 80) if $ref->consortium;
1111 47         172 $self->_write_line_GenBank_regex(" TITLE ", ' 'x12,
1112             $ref->title, "\\s\+\|\$", 80);
1113 47         137 $self->_write_line_GenBank_regex(" JOURNAL ", ' 'x12,
1114             $ref->location, "\\s\+\|\$", 80);
1115 47 100       134 if ( $ref->medline) {
1116 5         20 $self->_write_line_GenBank_regex(" MEDLINE ", ' 'x12,
1117             $ref->medline, "\\s\+\|\$", 80);
1118             # I am assuming that pubmed entries only exist when there
1119             # are also MEDLINE entries due to the indentation
1120             }
1121             # This could be a wrong assumption
1122 47 100       109 if ( $ref->pubmed ) {
1123 13         46 $self->_write_line_GenBank_regex(" PUBMED ", ' 'x12,
1124             $ref->pubmed, "\\s\+\|\$", 80);
1125             }
1126             # put remark at the end
1127 47 50       175 if ($ref->comment) {
1128 0         0 $self->_write_line_GenBank_regex(" REMARK ", ' 'x12,
1129             $ref->comment, "\\s\+\|\$", 80);
1130             }
1131 47         93 $count++;
1132             }
1133              
1134             # Comment lines
1135 25         104 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
1136 13         41 $self->_write_line_GenBank_regex("COMMENT ", ' 'x12,
1137             $comment->text, "\\s\+\|\$", 80);
1138             }
1139              
1140             # FEATURES section
1141 25         97 $self->_print("FEATURES Location/Qualifiers\n");
1142              
1143 25 50       180 if ( defined $self->_post_sort ) {
1144             # we need to read things into an array. Process. Sort them. Print 'em
1145 0         0 my $post_sort_func = $self->_post_sort;
1146 0         0 my @fth;
1147              
1148 0         0 foreach my $sf ( $seq->top_SeqFeatures ) {
1149 0         0 push @fth, Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1150             }
1151              
1152 0         0 @fth = sort { &$post_sort_func($a, $b) } @fth;
  0         0  
1153              
1154 0         0 foreach my $fth ( @fth ) {
1155 0         0 $self->_print_GenBank_FTHelper($fth);
1156             }
1157             }
1158             else {
1159             # not post sorted. And so we can print as we get them.
1160             # lower memory load...
1161 25         157 foreach my $sf ( $seq->top_SeqFeatures ) {
1162 430         1113 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1163 430         564 foreach my $fth ( @fth ) {
1164 430 50       865 if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1165 0         0 $sf->throw("Cannot process FTHelper... $fth");
1166             }
1167 430         908 $self->_print_GenBank_FTHelper($fth);
1168             }
1169             }
1170             }
1171              
1172             # deal with WGS; WGS_SCAFLD present only if WGS is also present
1173 25 50       210 if ($seq->annotation->get_Annotations('wgs')) {
1174 0         0 foreach my $wgs (map {$seq->annotation->get_Annotations($_)}
  0         0  
1175             qw(wgs wgs_scaffold)
1176             ) {
1177 0         0 $self->_print(sprintf ("%-11s %s\n",
1178             uc($wgs->tagname),
1179             $wgs->value));
1180             }
1181 0         0 $self->_show_dna(0);
1182             }
1183 25 100       109 if ($seq->annotation->get_Annotations('contig')) {
1184 1         4 my $ct = 0;
1185 1         4 my $cline;
1186 1         6 foreach my $contig ($seq->annotation->get_Annotations('contig')) {
1187 1 50       5 unless ($ct) {
1188 1         3 $cline = uc($contig->tagname) . " " . $contig->value . "\n";
1189             }
1190             else {
1191 0         0 $cline = " " . $contig->value . "\n";
1192             }
1193 1         6 $self->_print($cline);
1194 1         3 $ct++;
1195             }
1196             }
1197 25 100       107 if ( $seq->length == 0 ) {
1198 1         3 $self->_show_dna(0);
1199             }
1200              
1201 25 100       106 if ( $self->_show_dna == 0 ) {
1202 1         3 $self->_print("\n//\n");
1203 1         5 return;
1204             }
1205              
1206             # finished printing features.
1207              
1208 24         388 $str =~ tr/A-Z/a-z/;
1209              
1210 24         79 my ($o) = $seq->annotation->get_Annotations('origin');
1211 24 50       197 $self->_print(sprintf("%-12s%s\n",
1212             'ORIGIN', $o ? $o->value : ''));
1213             # print out the sequence
1214 24         46 my $nuc = 60; # Number of nucleotides per line
1215 24         53 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
1216 24         53 my $out_pat = 'A11' x 6; # Pattern for packing a line
1217 24         44 my $length = length $str;
1218              
1219             # Calculate the number of nucleotides which fit on whole lines
1220 24         134 my $whole = int($length / $nuc) * $nuc;
1221              
1222             # Print the whole lines
1223 24         37 my $i;
1224 24         96 for ($i = 0; $i < $whole; $i += $nuc) {
1225 1953         6604 my $blocks = pack $out_pat,
1226             unpack $whole_pat,
1227             substr($str, $i, $nuc);
1228 1953         3024 chop $blocks;
1229 1953         5501 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
1230             }
1231              
1232             # Print the last line
1233 24 100       93 if (my $last = substr($str, $i)) {
1234 19         36 my $last_len = length($last);
1235 19         114 my $last_pat = 'a10' x int($last_len / 10)
1236             . 'a' . $last_len % 10;
1237 19         107 my $blocks = pack $out_pat,
1238             unpack($last_pat, $last);
1239 19         143 $blocks =~ s/ +$//;
1240 19         130 $self->_print(sprintf("%9d $blocks\n",
1241             $length - $last_len + 1));
1242             }
1243              
1244 24         79 $self->_print("//\n");
1245              
1246 24 50 33     100 $self->flush if $self->_flush_on_write && defined $self->_fh;
1247 24         198 return 1;
1248             }
1249             }
1250              
1251             =head2 _print_GenBank_FTHelper
1252              
1253             Title : _print_GenBank_FTHelper
1254             Usage :
1255             Function:
1256             Example :
1257             Returns :
1258             Args :
1259              
1260             =cut
1261              
1262             sub _print_GenBank_FTHelper {
1263 430     430   619 my ( $self, $fth ) = @_;
1264              
1265 430 50 33     1519 if ( not ref $fth or not $fth->isa('Bio::SeqIO::FTHelper') ) {
1266 0         0 $fth->warn(
1267             "$fth is not a FTHelper class. Attempting to print but there could be issues"
1268             );
1269             }
1270              
1271 430 50       753 my $spacer = ( length $fth->key >= 15 ) ? ' ' : '';
1272 430         775 $self->_write_line_GenBank_regex(
1273             sprintf( " %-16s%s", $fth->key, $spacer ),
1274             " " x 21, $fth->loc, "\,\|\$", 80 );
1275              
1276 430         656 foreach my $tag ( sort keys %{ $fth->field } ) {
  430         875  
1277             # Account for hash structure in Annotation::DBLink, not the expected array
1278 997 50 66     2132 if ( $tag eq 'db_xref' and grep /HASH/, @{ $fth->field->{$tag} }) {
  140         259  
1279 0         0 for my $ref ( @{ $fth->field->{$tag} } ) {
  0         0  
1280 0         0 my $db = $ref->{'database'};
1281 0         0 my $id = $ref->{'primary_id'};
1282 0         0 $self->_write_line_GenBank_regex
1283             ( " " x 21, " " x 21,
1284             "/$tag=\"$db:$id\"", "\.\|\$", 80 );
1285             }
1286             }
1287             # The usual case, where all values are found in an array
1288             else {
1289 997         1072 foreach my $value ( @{ $fth->field->{$tag} } ) {
  997         1528  
1290 1085         1497 $value =~ s/\"/\"\"/g;
1291 1085 50 100     2752 if ( $value eq "_no_value" ) {
    100          
1292 0         0 $self->_write_line_GenBank_regex
1293             ( " " x 21, " " x 21,
1294             "/$tag", "\.\|\$", 80 );
1295             }
1296              
1297             # There are almost 3x more quoted qualifier values and they
1298             # are more common too so we take quoted ones first.
1299             # Long qualifiers, that will be line wrapped, are always quoted
1300             elsif ( not $FTQUAL_NO_QUOTE{$tag}
1301             or length("/$tag=$value") >= $FTQUAL_LINE_LENGTH
1302             ) {
1303 976 100       2254 my ($pat) = ( $value =~ /\s/ ? '\s|$' : '.|$' );
1304 976         2764 $self->_write_line_GenBank_regex
1305             ( " " x 21, " " x 21,
1306             "/$tag=\"$value\"", $pat, 80 );
1307             }
1308             else {
1309 109         310 $self->_write_line_GenBank_regex
1310             ( " " x 21, " " x 21,
1311             "/$tag=$value", "\.\|\$", 80 );
1312             }
1313             }
1314             }
1315             }
1316             }
1317              
1318             =head2 _read_GenBank_References
1319              
1320             Title : _read_GenBank_References
1321             Usage :
1322             Function: Reads references from GenBank format. Internal function really
1323             Returns :
1324             Args :
1325              
1326             =cut
1327              
1328             sub _read_GenBank_References {
1329 100     100   271 my ($self, $buffer) = @_;
1330 100         191 my (@refs);
1331             my $ref;
1332              
1333             # assume things are starting with RN
1334 100 50       499 if ( $$buffer !~ /^REFERENCE/ ) {
1335 0         0 warn("Not parsing line '$$buffer' which maybe important");
1336             }
1337              
1338 100         219 my $line = $$buffer;
1339              
1340 100         252 my (@title,@loc,@authors,@consort,@com,@medline,@pubmed);
1341              
1342             REFLOOP:
1343 100   66     702 while( defined($line) or defined($line = $self->_readline) ) {
1344 1379 100       3185 if ($line =~ /^\s{2}AUTHORS\s+(.*)/o) {
1345 402         1053 push @authors, $1;
1346 402         827 while ( defined($line = $self->_readline) ) {
1347 1040 100       2660 if ($line =~ /^\s{9,}(.*)/o) {
1348 638         1260 push @authors, $1;
1349 638         1184 next;
1350             }
1351 402         529 last;
1352             }
1353 402         1740 $ref->authors(join(' ', @authors));
1354             }
1355              
1356 1379 100       2211 if ($line =~ /^\s{2}CONSRTM\s+(.*)/o) {
1357 35         96 push @consort, $1;
1358 35         84 while ( defined($line = $self->_readline) ) {
1359 36 100       137 if ($line =~ /^\s{9,}(.*)/o) {
1360 1         6 push @consort, $1;
1361 1         5 next;
1362             }
1363 35         145 last;
1364             }
1365 35         188 $ref->consortium(join(' ', @consort));
1366             }
1367              
1368 1379 100       2688 if ($line =~ /^\s{2}TITLE\s+(.*)/o) {
1369 408         881 push @title, $1;
1370 408         879 while ( defined($line = $self->_readline) ) {
1371 693 100       1758 if ($line =~ /^\s{9,}(.*)/o) {
1372 285         605 push @title, $1;
1373 285         598 next;
1374             }
1375 408         463 last;
1376             }
1377 408         1525 $ref->title(join(' ', @title));
1378             }
1379              
1380 1379 100       2677 if ($line =~ /^\s{2}JOURNAL\s+(.*)/o) {
1381 416         952 push @loc, $1;
1382 416         836 while ( defined($line = $self->_readline) ) {
1383             # we only match when there are at least 4 spaces
1384             # there is probably a better way to match this
1385             # as it assumes that the describing tag is short enough
1386 565 100       1433 if ($line =~ /^\s{9,}(.*)/o) {
1387 149         348 push @loc, $1;
1388 149         313 next;
1389             }
1390 416         538 last;
1391             }
1392 416         1562 $ref->location(join(' ', @loc));
1393 416         671 redo REFLOOP;
1394             }
1395              
1396 963 100       1501 if ($line =~ /^\s{2}REMARK\s+(.*)/o) {
1397 59         139 push @com, $1;
1398 59         132 while ( defined($line = $self->_readline) ) {
1399 83 100       214 if ($line =~ /^\s{9,}(.*)/o) {
1400 24         53 push @com, $1;
1401 24         51 next;
1402             }
1403 59         73 last;
1404             }
1405 59         279 $ref->comment(join(' ', @com));
1406 59         93 redo REFLOOP;
1407             }
1408              
1409 904 100       1625 if ( $line =~ /^\s{2}MEDLINE\s+(.*)/ ) {
1410 130         287 push @medline, $1;
1411 130         284 while ( defined($line = $self->_readline) ) {
1412 130 50       354 if ($line =~ /^\s{9,}(.*)/) {
1413 0         0 push @medline, $1;
1414 0         0 next;
1415             }
1416 130         161 last;
1417             }
1418 130         402 $ref->medline(join(' ', @medline));
1419 130         171 redo REFLOOP;
1420             }
1421              
1422 774 100       1536 if ( $line =~ /^\s{3}PUBMED\s+(.*)/ ) {
1423 237         505 push @pubmed, $1;
1424 237         476 while ( defined($line = $self->_readline) ) {
1425 237 50       617 if ($line =~ /^\s{9,}(.*)/) {
1426 0         0 push @pubmed, $1;
1427 0         0 next;
1428             }
1429 237         275 last;
1430             }
1431 237         736 $ref->pubmed(join(' ', @pubmed));
1432 237         310 redo REFLOOP;
1433             }
1434              
1435 537 100       1282 if ( $line =~ /^REFERENCE/o ) {
1436             # store current reference
1437 417 100       1162 $self->_add_ref_to_array(\@refs,$ref) if defined $ref;
1438              
1439             # reset
1440 417         701 @authors = ();
1441 417         544 @title = ();
1442 417         463 @loc = ();
1443 417         535 @com = ();
1444 417         493 @pubmed = ();
1445 417         472 @medline = ();
1446              
1447             # create the new reference object
1448 417         1898 $ref = Bio::Annotation::Reference->new(-tagname => 'reference');
1449              
1450             # check whether start and end base is given
1451 417 100       1877 if ($line =~ /^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
    100          
1452 286         960 $ref->start($1);
1453 286         946 $ref->end($2);
1454             }
1455             elsif ($line =~ /^REFERENCE\s+\d+\s+\((.*)\)/) {
1456 14         38 $ref->gb_reference($1);
1457             }
1458             }
1459              
1460 537 100       2474 last if ($line =~ /^(FEATURES)|(COMMENT)/o);
1461              
1462 437         1848 $line = undef; # Empty $line to trigger read of next line
1463             }
1464              
1465             # store last reference
1466 100 50       559 $self->_add_ref_to_array(\@refs, $ref) if defined $ref;
1467              
1468 100         317 $$buffer = $line;
1469              
1470             #print "\nnumber of references found: ", $#refs+1,"\n";
1471              
1472 100         445 return @refs;
1473             }
1474              
1475             =head2 _add_ref_to_array
1476              
1477             Title: _add_ref_to_array
1478             Usage:
1479             Function: Adds a Reference object to an array of Reference objects, takes
1480             care of possible cleanups to be done (currently, only author and title
1481             will be chopped of trailing semicolons).
1482             Args: A reference to an array of Reference objects and
1483             the Reference object to be added
1484             Returns: nothing
1485              
1486             =cut
1487              
1488             sub _add_ref_to_array {
1489 417     417   673 my ($self, $refs, $ref) = @_;
1490              
1491             # first, polish author and title by removing possible trailing semicolons
1492 417         755 my $au = $ref->authors;
1493 417         834 my $title = $ref->title;
1494 417 100       1032 $au =~ s/;\s*$//g if $au;
1495 417 100       835 $title =~ s/;\s*$//g if $title;
1496 417         883 $ref->authors($au);
1497 417         939 $ref->title($title);
1498             # the rest should be clean already, so go ahead and add it
1499 417         451 push @{$refs}, $ref;
  417         809  
1500             }
1501              
1502             =head2 _read_GenBank_Species
1503              
1504             Title : _read_GenBank_Species
1505             Usage :
1506             Function: Reads the GenBank Organism species and classification
1507             lines. Able to deal with unconvential Organism naming
1508             formats, and varietas in plants
1509             Example : ORGANISM unknown marine gamma proteobacterium NOR5
1510             $genus = undef
1511             $species = unknown marine gamma proteobacterium NOR5
1512              
1513             ORGANISM Drosophila sp. 'white tip scutellum'
1514             $genus = Drosophila
1515             $species = sp. 'white tip scutellum'
1516             (yes, this really is a species and that is its name)
1517             $subspecies = undef
1518              
1519             ORGANISM Ajellomyces capsulatus var. farciminosus
1520             $genus = Ajellomyces
1521             $species = capsulatus
1522             $subspecies = var. farciminosus
1523              
1524             ORGANISM Hepatitis delta virus
1525             $genus = undef (though this virus has a genus in its lineage, we
1526             cannot know that without a database lookup)
1527             $species = Hepatitis delta virus
1528              
1529             Returns : A Bio::Species object
1530             Args : A reference to the current line buffer
1531              
1532             =cut
1533              
1534             sub _read_GenBank_Species {
1535 101     101   264 my ($self, $buffer) = @_;
1536              
1537 101         448 my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
1538             'Unspecified', 'Unknown', 'None', 'unclassified',
1539             'unidentified organism', 'not supplied');
1540             # dictionary of synonyms for taxid 32644
1541 101         287 my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified');
1542             # all above can be part of valid species name
1543              
1544 101         215 my $line = $$buffer;
1545              
1546 101         235 my( $sub_species, $species, $genus, $sci_name, $common,
1547             $class_lines, $source_flag, $abbr_name, $organelle, $sl );
1548 101         236 my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
  303         797  
1549              
1550             # upon first entering the loop, we must not read a new line -- the SOURCE
1551             # line is already in the buffer (HL 05/10/2000)
1552 101         211 my ($ann, $tag, $data);
1553 101   66     385 while (defined($line) or defined($line = $self->_readline)) {
1554             # de-HTMLify (links that may be encountered here don't contain
1555             # escaped '>', so a simple-minded approach suffices)
1556 517         881 $line =~ s{<[^>]+>}{}g;
1557 517 100       2055 if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
1558 303   50     1263 ($tag, $data) = ($1, $2 || '');
1559 303 100 66     1224 last if ($tag and not exists $source{$tag});
1560             }
1561             else {
1562 214 50       510 return unless $tag;
1563 214         816 ($data = $line) =~ s{^\s+}{};
1564 214         392 chomp $data;
1565 214 100 100     1245 $tag = 'CLASSIFICATION' if ( $tag ne 'CLASSIFICATION'
      100        
1566             and $tag eq 'ORGANISM'
1567             # Don't match "str." or "var." (fix NC_021815),
1568             # and don't match ".1" (fix NC_021902)
1569             and $line =~ m{(?
1570             }
1571 416 100       1414 (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
1572 416         2451 $line = undef;
1573             }
1574              
1575 101         388 ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
1576              
1577 101         211 $$buffer = $line;
1578              
1579 101 50       281 $sci_name or return;
1580              
1581             # parse out organelle, common name, abbreviated name if present;
1582             # this should catch everything, but falls back to
1583             # entire SOURCE line just in case
1584 101 50       1126 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?
1585             \s*(.*?)
1586             \s*(?: \( (.*?) \) )?\.?
1587             $
1588             }xms
1589             ) {
1590 101         407 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
1591             }
1592             else {
1593 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
1594             }
1595              
1596             # Convert data in classification lines into classification array.
1597             # only split on ';' or '.' so that classification that is 2 or more words will
1598             # still get matched, use map() to remove trailing/leading/intervening spaces
1599 101         945 my @class = map { $_ =~ s/^\s+//;
  955         1816  
1600 955         1398 $_ =~ s/\s+$//;
1601 955         1134 $_ =~ s/\s{2,}/ /g;
1602 955         1404 $_; }
1603             split /(?
1604              
1605             # do we have a genus?
1606 101 50       575 my $possible_genus = quotemeta($class[-1])
1607             . ($class[-2] ? "|" . quotemeta($class[-2]) : '');
1608 101 100       4231 if ($sci_name =~ /^($possible_genus)/) {
1609 88         272 $genus = $1;
1610 88         1131 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1611             }
1612             else {
1613 13         43 $species = $sci_name;
1614             }
1615              
1616             # is this organism of rank species or is it lower?
1617             # (we don't catch everything lower than species, but it doesn't matter -
1618             # this is just so we abide by previous behaviour whilst not calling a
1619             # species a subspecies)
1620 101 100 66     1425 if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
1621 5         19 ($species, $sub_species) = ($1, $2);
1622             }
1623              
1624             # Don't make a species object if it's empty or "Unknown" or "None"
1625             # return unless $genus and $genus !~ /^(Unknown|None)$/oi;
1626             # Don't make a species object if it belongs to taxid 32644
1627             # my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
1628 101         253 my $unkn = grep { $_ eq $sl } @unkn_names;
  1010         1424  
1629 101 50 33     564 return unless (defined $species or defined $genus) and $unkn == 0;
      33        
1630              
1631             # Bio::Species array needs array in Species -> Kingdom direction
1632 101         242 push @class, $sci_name;
1633 101         197 @class = reverse @class;
1634              
1635 101         775 my $make = Bio::Species->new;
1636 101         426 $make->scientific_name($sci_name);
1637 101 50       717 $make->classification(@class) if @class > 0;
1638 101 100       469 $make->common_name( $common ) if $common;
1639 101 50       583 $make->name('abbreviated', $abbr_name) if $abbr_name;
1640 101 100       361 $make->organelle($organelle) if $organelle;
1641             #$make->sub_species( $sub_species ) if $sub_species;
1642 101         1311 return $make;
1643             }
1644              
1645             =head2 _read_FTHelper_GenBank
1646              
1647             Title : _read_FTHelper_GenBank
1648             Usage : _read_FTHelper_GenBank($buffer)
1649             Function: reads the next FT key line
1650             Example :
1651             Returns : Bio::SeqIO::FTHelper object
1652             Args : filehandle and reference to a scalar
1653              
1654             =cut
1655              
1656             sub _read_FTHelper_GenBank {
1657 8280     8280   12021 my ($self, $buffer) = @_;
1658              
1659 8280         10144 my ($key, # The key of the feature
1660             $loc # The location line from the feature
1661             );
1662 8280         9524 my @qual = (); # An array of lines making up the qualifiers
1663              
1664 8280 50       52960 if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
1665 8280         19631 $key = $1;
1666 8280         12436 $loc = $2;
1667             # Read all the lines up to the next feature
1668 8280         20540 while ( defined(my $line = $self->_readline) ) {
1669 88592 100       396003 if ($line =~ /^(\s+)(.+?)\s*$/o) {
1670             # Lines inside features are preceded by 21 spaces
1671             # A new feature is preceded by 5 spaces
1672 88489 100       145149 if (length($1) > 6) {
1673             # Add to qualifiers if we're in the qualifiers, or if it's
1674             # the first qualifier
1675 80312 100 100     159431 if (@qual or (index($2,'/') == 0)) {
1676 68121         164611 push @qual, $2;
1677             }
1678             # We're still in the location line, so append to location
1679             else {
1680 12191         32051 $loc .= $2;
1681             }
1682             }
1683             else {
1684             # We've reached the start of the next feature
1685             # Put the first line of the next feature into the buffer
1686 8177         10835 $$buffer = $line;
1687 8177         12839 last;
1688             }
1689             }
1690             else {
1691             # We're at the end of the feature table
1692             # Put the first line of the next feature into the buffer
1693 103         189 $$buffer = $line;
1694 103         181 last;
1695             }
1696             }
1697             }
1698             else {
1699             # No feature key
1700 0         0 $self->debug("no feature key!\n");
1701             # change suggested by JDiggans to avoid infinite loop-
1702             # see bugreport 1062.
1703             # reset buffer to prevent infinite loop
1704 0         0 $$buffer = $self->_readline;
1705 0         0 return;
1706             }
1707              
1708             # Make the new FTHelper object
1709 8280         21210 my $out = Bio::SeqIO::FTHelper->new;
1710 8280         17808 $out->verbose($self->verbose);
1711 8280         17124 $out->key($key);
1712 8280         16320 $out->loc($loc);
1713              
1714             # Now parse and add any qualifiers. (@qual is kept
1715             # intact to provide informative error messages.)
1716             QUAL:
1717 8280         16130 for (my $i = 0; $i < @qual; $i++) {
1718 55807         67013 my $data = $qual[$i];
1719 55807 50       210304 my ( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.+))?})
1720             or $self->warn( "cannot see new qualifier in feature $key: "
1721             . $data);
1722 55807 50       87055 $qualifier = '' if not defined $qualifier;
1723              
1724 55807 100       64904 if (defined $value) {
1725             # Do we have a quoted value?
1726 54903 100       81539 if (substr($value, 0, 1) eq '"') {
    100          
1727             # Keep adding to value until we find the trailing quote
1728             # and the quotes are balanced
1729 52083   66     166172 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
1730 12312 50       18491 if ($i >= $#qual) {
1731 0         0 $self->warn( "Unbalanced quote in:\n"
1732             . join("\n", @qual)
1733             . "No further qualifiers will "
1734             . "be added for this feature");
1735 0         0 last QUAL;
1736             }
1737             # modifying a for-loop variable inside of the loop
1738             # is not the best programming style ...
1739 12312         11478 $i++;
1740 12312         13454 my $next = $qual[$i];
1741              
1742             # add to value with a space unless the value appears
1743             # to be a sequence (translation for example)
1744             # if (($value.$next) =~ /[^A-Za-z\"\-]/o) {
1745             # changed to explicitly look for translation tag - cjf 06/8/29
1746 12312 100       18027 if ($qualifier !~ /^translation$/i ) {
1747 9870         14569 $value .= " ";
1748             }
1749 12312         41785 $value .= $next;
1750             }
1751             # Trim leading and trailing quotes
1752 52083         257882 $value =~ s/^"|"$//g;
1753             # Undouble internal quotes
1754 52083         76821 $value =~ s/""/\"/g;
1755             }
1756             elsif ( $value =~ /^\(/ ) { # values quoted by ()s
1757             # Keep adding to value until we find the trailing bracket
1758             # and the ()s are balanced
1759 27         42 my $left = ($value =~ tr/\(/\(/); # count left parens
1760 27         32 my $right = ($value =~ tr/\)/\)/); # count right parens
1761              
1762 27         83 while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right"
1763 2 50       8 if ( $i >= $#qual) {
1764 0         0 $self->warn( "Unbalanced parens in:\n"
1765             . join("\n", @qual)
1766             . "\nNo further qualifiers will "
1767             . "be added for this feature");
1768 0         0 last QUAL;
1769             }
1770 2         5 $i++;
1771 2         5 my $next = $qual[$i];
1772 2         5 $value .= $next;
1773 2         4 $left += ($next =~ tr/\(/\(/);
1774 2         5 $right += ($next =~ tr/\)/\)/);
1775             }
1776             }
1777             }
1778             else {
1779 904         1159 $value = '_no_value';
1780             }
1781             # Store the qualifier
1782 55807   100     86996 $out->field->{$qualifier} ||= [];
1783 55807         53394 push @{$out->field->{$qualifier}}, $value;
  55807         69021  
1784             }
1785 8280         18704 return $out;
1786             }
1787              
1788             =head2 _write_line_GenBank
1789              
1790             Title : _write_line_GenBank
1791             Usage :
1792             Function: internal function
1793             Example :
1794             Returns :
1795             Args :
1796              
1797             =cut
1798              
1799             sub _write_line_GenBank {
1800 0     0   0 my ($self, $pre1, $pre2, $line, $length) = @_;
1801              
1802 0 0       0 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1803 0         0 my $subl = $length - length $pre2;
1804 0         0 my $linel = length $line;
1805 0         0 my $i;
1806              
1807 0         0 my $subr = substr($line,0,$length - length $pre1);
1808              
1809 0         0 $self->_print("$pre1$subr\n");
1810 0         0 for($i = ($length - length $pre1); $i < $linel; $i += $subl) {
1811 0         0 $subr = substr($line, $i, $subl);
1812 0         0 $self->_print("$pre2$subr\n");
1813             }
1814             }
1815              
1816             =head2 _write_line_GenBank_regex
1817              
1818             Title : _write_line_GenBank_regex
1819             Usage :
1820             Function: internal function for writing lines of specified
1821             length, with different first and the next line
1822             left hand headers and split at specific points in the
1823             text
1824             Example :
1825             Returns : nothing
1826             Args : file handle,
1827             first header,
1828             second header,
1829             text-line,
1830             regex for line breaks,
1831             total line length
1832              
1833             =cut
1834              
1835             sub _write_line_GenBank_regex {
1836 1765     1765   3018 my ($self, $pre1, $pre2, $line, $regex, $length) = @_;
1837              
1838             #print STDOUT "Going to print with $line!\n";
1839              
1840 1765 50       2734 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1841              
1842 1765         2147 my $subl = $length - (length $pre1) - 2;
1843 1765         1929 my @lines = ();
1844              
1845             CHUNK:
1846 1765         2402 while ($line) {
1847 2129         3230 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1848 2129 50       23520 if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1849 2129         5665 my $l = $1 . $2;
1850 2129         3154 $line = substr($line, length $l);
1851             # be strict about not padding spaces according to
1852             # genbank format
1853 2129         4141 $l =~ s/\s+$//;
1854 2129 50       3214 next CHUNK if ($l eq '');
1855 2129         2571 push @lines, $l;
1856 2129         4942 next CHUNK;
1857             }
1858             }
1859             # if we get here none of the patterns matched $subl or less chars
1860 0         0 $self->warn( "trouble dissecting \"$line\"\n into chunks "
1861             . "of $subl chars or less - this tag won't print right");
1862             # insert a space char to prevent infinite loops
1863 0         0 $line = substr($line, 0, $subl) . " " . substr($line, $subl);
1864             }
1865 1765         2184 my $s = shift @lines;
1866 1765 100       6341 $self->_print("$pre1$s\n") if $s;
1867 1765         4788 foreach my $s ( @lines ) {
1868 369         727 $self->_print("$pre2$s\n");
1869             }
1870             }
1871              
1872             =head2 _post_sort
1873              
1874             Title : _post_sort
1875             Usage : $obj->_post_sort($newval)
1876             Function:
1877             Returns : value of _post_sort
1878             Args : newvalue (optional)
1879              
1880             =cut
1881              
1882             sub _post_sort {
1883 25     25   57 my ($obj,$value) = @_;
1884 25 50       110 if ( defined $value) {
1885 0         0 $obj->{'_post_sort'} = $value;
1886             }
1887 25         94 return $obj->{'_post_sort'};
1888             }
1889              
1890             =head2 _show_dna
1891              
1892             Title : _show_dna
1893             Usage : $obj->_show_dna($newval)
1894             Function:
1895             Returns : value of _show_dna
1896             Args : newvalue (optional)
1897              
1898             =cut
1899              
1900             sub _show_dna {
1901 141     141   312 my ($obj,$value) = @_;
1902 141 100       362 if ( defined $value) {
1903 116         293 $obj->{'_show_dna'} = $value;
1904             }
1905 141         278 return $obj->{'_show_dna'};
1906             }
1907              
1908             =head2 _id_generation_func
1909              
1910             Title : _id_generation_func
1911             Usage : $obj->_id_generation_func($newval)
1912             Function:
1913             Returns : value of _id_generation_func
1914             Args : newvalue (optional)
1915              
1916             =cut
1917              
1918             sub _id_generation_func {
1919 25     25   58 my ($obj,$value) = @_;
1920 25 50       78 if ( defined $value ) {
1921 0         0 $obj->{'_id_generation_func'} = $value;
1922             }
1923 25         109 return $obj->{'_id_generation_func'};
1924             }
1925              
1926             =head2 _ac_generation_func
1927              
1928             Title : _ac_generation_func
1929             Usage : $obj->_ac_generation_func($newval)
1930             Function:
1931             Returns : value of _ac_generation_func
1932             Args : newvalue (optional)
1933              
1934             =cut
1935              
1936             sub _ac_generation_func {
1937 25     25   70 my ($obj,$value) = @_;
1938 25 50       83 if ( defined $value ) {
1939 0         0 $obj->{'_ac_generation_func'} = $value;
1940             }
1941 25         81 return $obj->{'_ac_generation_func'};
1942             }
1943              
1944             =head2 _sv_generation_func
1945              
1946             Title : _sv_generation_func
1947             Usage : $obj->_sv_generation_func($newval)
1948             Function:
1949             Returns : value of _sv_generation_func
1950             Args : newvalue (optional)
1951              
1952             =cut
1953              
1954             sub _sv_generation_func {
1955 25     25   66 my ($obj,$value) = @_;
1956 25 50       82 if ( defined $value ) {
1957 0         0 $obj->{'_sv_generation_func'} = $value;
1958             }
1959 25         216 return $obj->{'_sv_generation_func'};
1960             }
1961              
1962             =head2 _kw_generation_func
1963              
1964             Title : _kw_generation_func
1965             Usage : $obj->_kw_generation_func($newval)
1966             Function:
1967             Returns : value of _kw_generation_func
1968             Args : newvalue (optional)
1969              
1970             =cut
1971              
1972             sub _kw_generation_func {
1973 25     25   59 my ($obj,$value) = @_;
1974 25 50       72 if ( defined $value ) {
1975 0         0 $obj->{'_kw_generation_func'} = $value;
1976             }
1977 25         250 return $obj->{'_kw_generation_func'};
1978             }
1979              
1980             1;