File Coverage

Bio/SeqIO/genbank.pm
Criterion Covered Total %
statement 612 723 84.6
branch 308 404 76.2
condition 74 111 66.6
subroutine 25 26 96.1
pod 2 2 100.0
total 1021 1266 80.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::genbank
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Bioperl project bioperl-l(at)bioperl.org
7             #
8             # Copyright Elia Stupka and contributors see AUTHORS section
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::genbank - GenBank sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler:
22              
23             $stream = Bio::SeqIO->new(-file => $filename,
24             -format => 'GenBank');
25              
26             while ( my $seq = $stream->next_seq ) {
27             # do something with $seq
28             }
29              
30              
31             =head1 DESCRIPTION
32              
33             This object can transform Bio::Seq objects to and from GenBank flat
34             file databases.
35              
36             There is some flexibility here about how to write GenBank output
37             that is not fully documented.
38              
39             =head2 Optional functions
40              
41             =over 3
42              
43             =item _show_dna()
44              
45             (output only) shows the dna or not
46              
47             =item _post_sort()
48              
49             (output only) provides a sorting func which is applied to the FTHelpers
50             before printing
51              
52             =item _id_generation_func()
53              
54             This is function which is called as
55              
56             print "ID ", $func($seq), "\n";
57              
58             To generate the ID line. If it is not there, it generates a sensible ID
59             line using a number of tools.
60              
61             If you want to output annotations in Genbank format they need to be
62             stored in a Bio::Annotation::Collection object which is accessible
63             through the Bio::SeqI interface method L.
64              
65             The following are the names of the keys which are pulled from a
66             L object:
67              
68             reference - Should contain Bio::Annotation::Reference objects
69             comment - Should contain Bio::Annotation::Comment objects
70             dblink - Should contain a Bio::Annotation::DBLink object
71             segment - Should contain a Bio::Annotation::SimpleValue object
72             origin - Should contain a Bio::Annotation::SimpleValue object
73             wgs - Should contain a Bio::Annotation::SimpleValue object
74              
75             =back
76              
77             =head1 Where does the data go?
78              
79             Data parsed in Bio::SeqIO::genbank is stored in a variety of data
80             fields in the sequence object that is returned. Here is a partial list
81             of fields.
82              
83             Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
84             the top level object which defines a function called NAME() which
85             stores this information.
86              
87             Items listed as Annotation 'NAME' tell you the data is stored the
88             associated Bio::AnnotationCollectionI object which is associated with
89             Bio::Seq objects. If it is explicitly requested that no annotations
90             should be stored when parsing a record of course they will not be
91             available when you try and get them. If you are having this problem
92             look at the type of SeqBuilder that is being used to contruct your
93             sequence object.
94              
95             Comments Annotation 'comment'
96             References Annotation 'reference'
97             Segment Annotation 'segment'
98             Origin Annotation 'origin'
99             Dbsource Annotation 'dblink'
100              
101             Accessions PrimarySeq accession_number()
102             Secondary accessions RichSeq get_secondary_accessions()
103             GI number PrimarySeq primary_id()
104             LOCUS PrimarySeq display_id()
105             Keywords RichSeq get_keywords()
106             Dates RichSeq get_dates()
107             Molecule RichSeq molecule()
108             Seq Version RichSeq seq_version()
109             PID RichSeq pid()
110             Division RichSeq division()
111             Features Seq get_SeqFeatures()
112             Alphabet PrimarySeq alphabet()
113             Definition PrimarySeq description() or desc()
114             Version PrimarySeq version()
115              
116             Sequence PrimarySeq seq()
117              
118             There is more information in the Feature-Annotation HOWTO about each
119             field and how it is mapped to the Sequence object
120             L.
121              
122             =head1 FEEDBACK
123              
124             =head2 Mailing Lists
125              
126             User feedback is an integral part of the evolution of this and other
127             Bioperl modules. Send your comments and suggestions preferably to one
128             of the Bioperl mailing lists. Your participation is much appreciated.
129              
130             bioperl-l@bioperl.org - General discussion
131             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
132              
133             =head2 Support
134              
135             Please direct usage questions or support issues to the mailing list:
136              
137             I
138              
139             rather than to the module maintainer directly. Many experienced and
140             reponsive experts will be able look at the problem and quickly
141             address it. Please include a thorough description of the problem
142             with code and data examples if at all possible.
143              
144             =head2 Reporting Bugs
145              
146             Report bugs to the Bioperl bug tracking system to help us keep track
147             the bugs and their resolution. Bug reports can be submitted via the web:
148              
149             https://github.com/bioperl/bioperl-live/issues
150              
151             =head1 AUTHOR - Bioperl Project
152              
153             bioperl-l at bioperl.org
154              
155             Original author Elia Stupka, elia -at- tigem.it
156              
157             =head1 CONTRIBUTORS
158              
159             Ewan Birney birney at ebi.ac.uk
160             Jason Stajich jason at bioperl.org
161             Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
162             Lincoln Stein lstein at cshl.org
163             Heikki Lehvaslaiho, heikki at ebi.ac.uk
164             Hilmar Lapp, hlapp at gmx.net
165             Donald G. Jackson, donald.jackson at bms.com
166             James Wasmuth, james.wasmuth at ed.ac.uk
167             Brian Osborne, bosborne at alum.mit.edu
168             Chris Fields, cjfields at bioperl dot org
169              
170             =head1 APPENDIX
171              
172             The rest of the documentation details each of the object
173             methods. Internal methods are usually preceded with a _
174              
175             =cut
176              
177             # Let the code begin...
178              
179             package Bio::SeqIO::genbank;
180 16     16   602 use strict;
  16         21  
  16         455  
181              
182 16     16   4065 use Bio::SeqIO::FTHelper;
  16         26  
  16         322  
183 16     16   66 use Bio::SeqFeature::Generic;
  16         18  
  16         276  
184 16     16   4378 use Bio::Species;
  16         26  
  16         329  
185 16     16   74 use Bio::Seq::SeqFactory;
  16         15  
  16         511  
186 16     16   53 use Bio::Annotation::Collection;
  16         17  
  16         264  
187 16     16   4642 use Bio::Annotation::Comment;
  16         24  
  16         321  
188 16     16   4630 use Bio::Annotation::Reference;
  16         29  
  16         376  
189 16     16   70 use Bio::Annotation::DBLink;
  16         13  
  16         267  
190              
191 16     16   49 use base qw(Bio::SeqIO);
  16         17  
  16         106724  
192              
193             # Note that a qualifier that exceeds one line (i.e. a long label) will
194             # automatically be quoted regardless:
195              
196             our $FTQUAL_LINE_LENGTH = 60;
197              
198             our %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
199             anticodon citation
200             codon codon_start
201             cons_splice direction
202             evidence label
203             mod_base number
204             rpt_type rpt_unit
205             transl_except transl_table
206             usedin
207             );
208              
209             our %DBSOURCE = map {$_ => 1} qw(
210             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
211             TIGR GO InterPro Pfam PROSITE SGD GermOnline
212             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
213             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
214             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
215             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
216             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
217             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
218             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
219             PhotoList Gramene WormBase WormPep Genew ZFIN
220             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
221             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
222             DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
223             Project);
224              
225             our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
226             mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT);
227              
228             our %VALID_ALPHABET = (
229             'bp' => 'dna',
230             'aa' => 'protein',
231             'rc' => '' # rc = release candidate; file has no sequences
232             );
233              
234             sub _initialize {
235 115     115   280 my($self, @args) = @_;
236              
237 115         477 $self->SUPER::_initialize(@args);
238             # hash for functions for decoding keys.
239 115         439 $self->{'_func_ftunit_hash'} = {};
240 115         451 $self->_show_dna(1); # sets this to one by default. People can change it
241 115 50       423 if ( not defined $self->sequence_factory ) {
242 115         510 $self->sequence_factory
243             (Bio::Seq::SeqFactory->new(-verbose => $self->verbose,
244             -type => 'Bio::Seq::RichSeq'));
245             }
246             }
247              
248             =head2 next_seq
249              
250             Title : next_seq
251             Usage : $seq = $stream->next_seq()
252             Function: returns the next sequence in the stream
253             Returns : Bio::Seq object
254             Args :
255              
256             =cut
257              
258             sub next_seq {
259 117     117 1 6491 my ($self, @args) = @_;
260 117         302 my %args = @args;
261 117         332 my $builder = $self->sequence_builder;
262 117         183 my $seq;
263             my %params;
264              
265             RECORDSTART:
266 117         134 while (1) {
267 119         141 my $buffer;
268 119         127 my ( @acc, @features );
269 0         0 my ( $display_id, $annotation );
270 0         0 my $species;
271              
272             # initialize; we may come here because of starting over
273 119         220 @features = ();
274 119         158 $annotation = undef;
275 119         450 @acc = ();
276 119         152 $species = undef;
277 119         454 %params = ( -verbose => $self->verbose ); # reset hash
278 119         576 local ($/) = "\n";
279 119         497 while ( defined( $buffer = $self->_readline ) ) {
280 189 100       742 last if index( $buffer, 'LOCUS ' ) == 0;
281             }
282 119 100       327 return unless defined $buffer; # end of file
283 113 50       663 $buffer =~ /^LOCUS\s+(\S.*)$/o
284             or $self->throw( "GenBank stream with bad LOCUS line. "
285             . "Not GenBank in my book. Got '$buffer'");
286 113         621 my @tokens = split( ' ', $1 );
287              
288             # this is important to have the id for display in e.g. FTHelper,
289             # otherwise you won't know which entry caused an error
290 113         200 $display_id = shift @tokens;
291 113         261 $params{'-display_id'} = $display_id;
292              
293             # may still be useful if we don't want the seq
294 113         169 my $seqlength = shift @tokens;
295 113 50       282 if ( exists $VALID_ALPHABET{$seqlength} ) {
296             # moved one token too far. No locus name?
297 0         0 $self->warn( "Bad LOCUS name? Changing [$params{'-display_id'}] "
298             . "to 'unknown' and length to '$display_id'"
299             );
300 0         0 $params{'-display_id'} = 'unknown';
301 0         0 $params{'-length'} = $display_id;
302              
303             # add token back...
304 0         0 unshift @tokens, $seqlength;
305             }
306             else {
307 113         234 $params{'-length'} = $seqlength;
308             }
309              
310             # the alphabet of the entry
311             # shouldn't assign alphabet unless one
312             # is specifically designated (such as for rc files)
313 113         238 my $alphabet = lc( shift @tokens );
314             $params{'-alphabet'} =
315             ( exists $VALID_ALPHABET{$alphabet} )
316 113 50       383 ? $VALID_ALPHABET{$alphabet}
317             : $self->warn("Unknown alphabet: $alphabet");
318              
319             # for aa there is usually no 'molecule' (mRNA etc)
320 113 100       301 if ( $params{'-alphabet'} eq 'protein' ) {
321 10         25 $params{'-molecule'} = 'PRT';
322             }
323             else {
324 103         198 $params{'-molecule'} = shift(@tokens);
325             }
326              
327             # take care of lower case issues
328 113 100 66     761 if ( $params{'-molecule'} eq 'dna' or $params{'-molecule'} eq 'rna' ) {
329 4         10 $params{'-molecule'} = uc $params{'-molecule'};
330             }
331             $self->debug( "Unrecognized molecule type: " . $params{'-molecule'} )
332 113 50       496 if not exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
333              
334 113         200 my $circ = shift @tokens;
335 113 100       270 if ( $circ eq 'circular' ) {
336 8         21 $params{'-is_circular'} = 1;
337 8         24 $params{'-division'} = shift @tokens;
338             }
339             else {
340             # 'linear' or 'circular' may actually be omitted altogether
341 105 100       408 $params{'-division'} =
342             ( CORE::length($circ) == 3 ) ? $circ : shift @tokens;
343             }
344 113         284 my $date = join( ' ', @tokens ); # we lump together the rest
345              
346             # this is per request bug #1513
347             # we can handle:
348             # 9-10-2003
349             # 9-10-03
350             # 09-10-2003
351             # 09-10-03
352 113 100       882 if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
353 107 50       252 if ( length($date) < 11 ) {
354             # improperly formatted date
355             # But we'll be nice and fix it for them
356 0         0 my ( $d, $m, $y ) = ( $2, $3, $4 );
357 0 0       0 if ( length($d) == 1 ) {
358 0         0 $d = "0$d";
359             }
360              
361             # guess the century here
362 0 0       0 if ( length($y) == 2 ) {
363 0 0       0 if ( $y > 60 ) { # arbitrarily guess that '60' means 1960
364 0         0 $y = "19$y";
365             }
366             else {
367 0         0 $y = "20$y";
368             }
369 0         0 $self->warn( "Date was malformed, guessing the "
370             . "century for $date to be $y\n"
371             );
372             }
373 0         0 $params{'-dates'} = [ join( '-', $d, $m, $y ) ];
374             }
375             else {
376 107         320 $params{'-dates'} = [$date];
377             }
378             }
379              
380             # set them all at once
381 113         667 $builder->add_slot_value(%params);
382 113         315 %params = ();
383              
384             # parse the rest if desired, otherwise start over
385 113 100       312 if ( not $builder->want_object ) {
386 2         7 $builder->make_object;
387 2         11 next RECORDSTART;
388             }
389              
390             # set up annotation depending on what the builder wants
391 111 100       318 if ( $builder->want_slot('annotation') ) {
392 106         778 $annotation = Bio::Annotation::Collection->new;
393             }
394              
395 111         314 $buffer = $self->_readline;
396 111         385 while ( defined( my $line = $buffer ) ) {
397             # Description line(s)
398 739 100       1952 if ($line =~ /^DEFINITION\s+(\S.*\S)/) {
399 110         318 my @desc = ($1);
400 110         307 while ( defined( $line = $self->_readline ) ) {
401 172 100       573 if ($line =~ /^\s+(.*)/) {
402 62         130 push( @desc, $1 );
403 62         149 next;
404             }
405 110         142 last;
406             }
407 110         605 $builder->add_slot_value( -desc => join( ' ', @desc ) );
408              
409             # we'll continue right here because DEFINITION
410             # always comes at the top of the entry
411 110         212 $buffer = $line;
412             }
413              
414             # accession number (there can be multiple accessions)
415 739 100       8817 if ($line =~ /^ACCESSION\s+(\S.*\S)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
416 110         511 push( @acc, split( /\s+/, $1 ) );
417 110         500 while ( defined( $line = $self->_readline ) ) {
418 113 100       401 if ($line =~ /^\s+(.*)/) {
419 3         15 push( @acc, split( /\s+/, $1 ) );
420 3         10 next;
421             }
422 110         135 last;
423             }
424 110         180 $buffer = $line;
425 110         262 next;
426             }
427              
428             # PID
429             elsif ($line =~ /^PID\s+(\S+)/) {
430 1         3 $params{'-pid'} = $1;
431             }
432              
433             # Version number
434             elsif ($line =~ /^VERSION\s+(\S.+)$/) {
435 91         384 my ( $acc, $gi ) = split( ' ', $1 );
436 91 100       413 if ( $acc =~ /^\w+\.(\d+)/ ) {
437 87         231 $params{'-version'} = $1;
438 87         193 $params{'-seq_version'} = $1;
439             }
440 91 100 66     531 if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
441 89         301 $params{'-primary_id'} = substr( $gi, 3 );
442             }
443             }
444              
445             # Keywords
446             elsif ($line =~ /^KEYWORDS\s+(\S.*)/) {
447 107         474 my @kw = split( /\s*\;\s*/, $1 );
448 107         294 while ( defined( $line = $self->_readline ) ) {
449 138         241 chomp $line;
450 138 100       398 if ($line =~ /^\s+(.*)/) {
451 31         218 push( @kw, split( /\s*\;\s*/, $1 ) );
452 31         58 next;
453             }
454 107         350 last;
455             }
456              
457 107 50       524 @kw && $kw[-1] =~ s/\.$//;
458 107         251 $params{'-keywords'} = \@kw;
459 107         152 $buffer = $line;
460 107         309 next;
461             }
462              
463             # Organism name and phylogenetic information
464             elsif ($line =~ /^SOURCE\s+\S/) {
465 106 100       334 if ( $builder->want_slot('species') ) {
466 101         360 $species = $self->_read_GenBank_Species( \$buffer );
467 101         720 $builder->add_slot_value( -species => $species );
468             }
469             else {
470 5         10 while ( defined( $buffer = $self->_readline ) ) {
471 18 100       35 last if substr( $buffer, 0, 1 ) ne ' ';
472             }
473             }
474 106         493 next;
475             }
476              
477             # References
478             elsif ($line =~ /^REFERENCE\s+\S/) {
479 109 100       276 if ($annotation) {
480 100         440 my @refs = $self->_read_GenBank_References( \$buffer );
481 100         226 foreach my $ref (@refs) {
482 417         989 $annotation->add_Annotation( 'reference', $ref );
483             }
484             }
485             else {
486 9         14 while ( defined( $buffer = $self->_readline ) ) {
487 47 100       92 last if substr( $buffer, 0, 1 ) ne ' ';
488             }
489             }
490 109         337 next;
491             }
492              
493             # Project
494             elsif ($line =~ /^PROJECT\s+(\S.*)/) {
495 2 50       7 if ($annotation) {
496 2         22 my $project =
497             Bio::Annotation::SimpleValue->new( -value => $1 );
498 2         10 $annotation->add_Annotation( 'project', $project );
499             }
500             }
501              
502             # Comments may be plain text or Structured Comments.
503             # Structured Comments are made up of tag/value pairs and have beginning
504             # and end delimiters like ##*-Data-START## and ##*-Data-END##
505             elsif ($line =~ /^COMMENT\s+(\S.*)/) {
506 60 100       123 if ($annotation) {
507 58         159 my $comment = $1;
508 58         180 while ( defined( $line = $self->_readline ) ) {
509 481 100       1032 last if ($line =~ /^\S/);
510 423         960 $comment .= $line;
511             }
512 58         635 $comment =~ s/ +/ /g;
513             # Structured Comment, do not remove returns in the tabular section
514 58 100       246 if ( my ( $text, $table )= $comment
515             =~ /([^#]*)(##\S+Data-START##.+?##\S+Data-END##)/is
516             ) {
517 2 100       13 $text =~ s/\n/ /g if $text;
518 2         11 $table =~ s/START##/START##\n/;
519 2         16 $table =~ s/^\s+//gm;
520 2         10 $comment = $text . "\n" . $table;
521             }
522             # Plain text, remove returns
523             else {
524 56         317 $comment =~ s/\n/ /g;
525             }
526 58         614 $annotation->add_Annotation(
527             'comment',
528             Bio::Annotation::Comment->new(
529             -text => $comment,
530             -tagname => 'comment'
531             )
532             );
533 58         343 $buffer = $line;
534             }
535             else {
536 2         6 while ( defined( $buffer = $self->_readline ) ) {
537 7 100       16 last if substr( $buffer, 0, 1 ) ne ' ';
538             }
539             }
540 60         145 next;
541             }
542              
543             # Corresponding Genbank nucleotide id, Genpept only
544             elsif ($line =~ /^DB(?:SOURCE|LINK)\s+(\S.+)/) {
545 20 50       59 if ($annotation) {
546 20         49 my $dbsource = $1;
547 20         64 while ( defined( $line = $self->_readline ) ) {
548 85 100       196 last if ($line =~ /^\S/);
549 65         138 $dbsource .= $line;
550             }
551              
552             # deal with UniProKB dbsources
553 20 100       99 if ( $dbsource =~
554             s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n//
555             ) {
556 5         51 $annotation->add_Annotation(
557             'dblink',
558             Bio::Annotation::DBLink->new(
559             -primary_id => $2,
560             -database => $1,
561             -tagname => 'dblink'
562             )
563             );
564 5 50       47 if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
565 5         43 $annotation->add_Annotation(
566             'swissprot_dates',
567             Bio::Annotation::SimpleValue->new(
568             -tagname => 'date_created',
569             -value => $1
570             )
571             );
572             }
573 5         63 while ( $dbsource =~
574             s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
575             ) {
576 5         25 $annotation->add_Annotation(
577             'swissprot_dates',
578             Bio::Annotation::SimpleValue->new(
579             -tagname => 'date_updated',
580             -value => $2
581             )
582             );
583             }
584 5         39 $dbsource =~ s/\n/ /g;
585 5 50       59 if ( $dbsource =~
    0          
586             s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/
587             ) {
588             # will use $i to determine even or odd
589             # for swissprot the accessions are paired
590 5         9 my $i = 0;
591 5         44 for my $dbsrc ( split( /,\s+/, $1 ) ) {
592 49 50 66     244 if ( $dbsrc =~ /(\S+)\.(\d+)/
593             or $dbsrc =~ /(\S+)/
594             ) {
595 49         108 my ( $id, $version ) = ( $1, $2 );
596 49 100       85 $version = '' unless defined $version;
597 49 100       140 my $db = ( $id =~ /^\d\S{3}/ ) ? 'PDB'
    100          
598             : ( $i++ % 2 ) ? 'GenPept'
599             : 'GenBank';
600              
601 49         184 $annotation->add_Annotation(
602             'dblink',
603             Bio::Annotation::DBLink->new(
604             -primary_id => $id,
605             -version => $version,
606             -database => $db,
607             -tagname => 'dblink'
608             )
609             );
610             }
611             }
612             }
613             elsif ( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
614             # download screwed up and ncbi didn't put acc in for gi numbers
615 0         0 my $i = 0;
616 0         0 for my $id ( split( /\,\s+/, $1 ) ) {
617 0         0 my ( $acc, $db );
618 0 0       0 if ( $id =~ /gi:\s+(\d+)/ ) {
    0          
619 0         0 $acc = $1;
620 0 0       0 $db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
621             }
622             elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
623 0         0 $acc = $1;
624 0         0 $db = 'PDB';
625             }
626             else {
627 0         0 $acc = $id;
628 0         0 $db = '';
629             }
630 0         0 $annotation->add_Annotation(
631             'dblink',
632             Bio::Annotation::DBLink->new(
633             -primary_id => $acc,
634             -database => $db,
635             -tagname => 'dblink'
636             )
637             );
638             }
639             }
640             else {
641 0         0 $self->debug("Cannot match $dbsource\n");
642             }
643 5 50       91 if ( $dbsource =~ s/xrefs\s+
644             \(non\-sequence\s+databases\):\s+
645             ((?:\S+,\s+)+\S+)//x
646             ) {
647 5         69 for my $id ( split( /\,\s+/, $1 ) ) {
648 86         73 my $db;
649              
650             # this is because GenBank dropped the spaces!!!
651             # I'm sure we're not going to get this right
652             ##if ( $id =~ s/^://i ) {
653             ## $db = $1;
654             ##}
655 86         170 $db = substr( $id, 0, index( $id, ':' ) );
656 86 100       179 if ( not exists $DBSOURCE{$db} ) {
657 16         18 $db = ''; # do we want 'GenBank' here?
658             }
659 86         109 $id = substr( $id, index( $id, ':' ) + 1 );
660 86         266 $annotation->add_Annotation(
661             'dblink',
662             Bio::Annotation::DBLink->new(
663             -primary_id => $id,
664             -database => $db,
665             -tagname => 'dblink'
666             )
667             );
668             }
669             }
670             }
671             else {
672 15 100       121 if ( $dbsource =~
    100          
    50          
673             /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/
674             ) {
675 4         15 my ( $db, $id, $version ) = ( $1, $2, $3 );
676 4   100     35 $annotation->add_Annotation(
677             'dblink',
678             Bio::Annotation::DBLink->new(
679             -primary_id => $id,
680             -version => $version,
681             -database => $db || 'GenBank',
682             -tagname => 'dblink'
683             )
684             );
685             }
686             elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
687 1         6 my ( $db, $id ) = ( $1, $2 );
688 1   50     13 $annotation->add_Annotation(
689             'dblink',
690             Bio::Annotation::DBLink->new(
691             -primary_id => $id,
692             -database => $db || 'GenBank',
693             -tagname => 'dblink'
694             )
695             );
696             }
697             elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
698 10         16 my ( $db, $version );
699 10         21 my @ids = ();
700 10 100       38 if ( $2 eq ':' ) {
701 9         19 $db = $1;
702              
703             # Genbank 192 release notes say this: "The second
704             # field can consist of multiple comma-separated
705             # identifiers, if a sequence record has multiple
706             # DBLINK cross-references of a given type."
707             # For example: DBLINK Project:100,200,300"
708 9         31 @ids = split( /,/, $3 );
709             }
710             else {
711 1         4 ( $db, $version ) = ( 'GenBank', $3 );
712 1         3 $ids[0] = $1;
713             }
714              
715 10         25 foreach my $id (@ids) {
716 12         117 $annotation->add_Annotation(
717             'dblink',
718             Bio::Annotation::DBLink->new(
719             -primary_id => $id,
720             -version => $version,
721             -database => $db,
722             -tagname => 'dblink'
723             )
724             );
725             }
726             }
727             else {
728 0         0 $self->warn(
729             "Unrecognized DBSOURCE data: $dbsource\n");
730             }
731             }
732 20         48 $buffer = $line;
733             }
734             else {
735 0         0 while ( defined( $buffer = $self->_readline ) ) {
736 0 0       0 last if substr( $buffer, 0, 1 ) ne ' ';
737             }
738             }
739 20         58 next;
740             }
741              
742             # Exit at start of Feature table, or start of sequence
743 227 100       1003 if ($line =~ /^(FEATURES|ORIGIN)/) {
744 111         137 my $trap;
745             }
746 227 100       950 last if ($line =~ /^(FEATURES|ORIGIN)/);
747              
748             # Get next line and loop again
749 116         325 $buffer = $self->_readline;
750             }
751 111 50       252 return unless defined $buffer;
752              
753             # add them all at once for efficiency
754 111         892 $builder->add_slot_value(
755             -accession_number => shift(@acc),
756             -secondary_accessions => \@acc,
757             %params
758             );
759 111 100       462 $builder->add_slot_value( -annotation => $annotation ) if $annotation;
760 111         273 %params = (); # reset before possible re-use to avoid setting twice
761              
762             # start over if we don't want to continue with this entry
763 111 50       357 if ( not $builder->want_object ) {
764 0         0 $builder->make_object;
765 0         0 next RECORDSTART;
766             }
767              
768             # some "minimal" formats may not necessarily have a feature table
769 111 100 66     347 if ( $builder->want_slot('features')
      100        
770             and defined $buffer
771             and $buffer =~ /^FEATURES/o
772             ) {
773             # need to read the first line of the feature table
774 104         363 $buffer = $self->_readline;
775              
776             # DO NOT read lines in the while condition -- this is done
777             # as a side effect in _read_FTHelper_GenBank!
778              
779             # part of new circular spec:
780             # commented out for now until kinks worked out
781             #my $sourceEnd = 0;
782             #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
783              
784 104         284 while ( defined $buffer ) {
785             # check immediately -- not at the end of the loop
786             # note: GenPept entries obviously do not have a BASE line
787 8384 100       57132 last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
788              
789             # slurp in one feature at a time -- at return, the start of
790             # the next feature will have been read already, so we need
791             # to pass a reference, and the called method must set this
792             # to the last line read before returning
793 8280         17235 my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );
794              
795             # implement new circular spec: features that cross the origin are now
796             # seamless instead of being 2 separate joined features
797             # commented out until kinks get worked out
798             #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
799             #&& $sourceEnd == $2 && $3 == 1) {
800             #my $start = $1;
801             #my $end = $2 + $4;
802             #$ftunit->{'loc'} = "$start..$end";
803             #}
804              
805             # fix suggested by James Diggans
806              
807 8280 50       11652 if ( not defined $ftunit ) {
808             # GRRRR. We have fallen over. Try to recover
809             $self->warn( "Unexpected error in feature table for "
810 0         0 . $params{'-display_id'}
811             . " Skipping feature, attempting to recover" );
812              
813 0 0 0     0 unless ( $buffer =~ /^\s{5,5}\S+/o
814             or $buffer =~ /^\S+/o
815             ) {
816 0         0 $buffer = $self->_readline;
817             }
818 0         0 next; # back to reading FTHelpers
819             }
820              
821             # process ftunit
822 8280         19705 my $feat =
823             $ftunit->_generic_seqfeature( $self->location_factory,
824             $display_id );
825              
826             # add taxon_id from source if available
827 8280 100 100     21029 if ( $species
      100        
      66        
      66        
828             and $feat->primary_tag eq 'source'
829             and $feat->has_tag('db_xref')
830             and ( not $species->ncbi_taxid
831             or ( $species->ncbi_taxid
832             and $species->ncbi_taxid =~ /^list/ ) )
833             ) {
834 95         298 foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
835 96 100       366 if ( index( $tagval, "taxon:" ) == 0 ) {
836 95         339 $species->ncbi_taxid( substr( $tagval, 6 ) );
837 95         227 last;
838             }
839             }
840             }
841              
842             # add feature to list of features
843 8280         26341 push( @features, $feat );
844             }
845 104         626 $builder->add_slot_value( -features => \@features );
846             }
847              
848 111 50       402 if ( defined $buffer ) {
849             # CONTIG lines: TODO, this needs to be cleaned up
850 111 100       1334 if ($buffer =~/^CONTIG\s+(.*)/o) {
    100          
    100          
851 7         23 my $ctg = $1;
852 7         31 while ( defined( $buffer = $self->_readline ) ) {
853 29 100       167 last if $buffer =~ m{^ORIGIN|//}o;
854 22         57 $buffer =~ s/\s+(.*)/$1/;
855 22         44 $ctg .= $buffer;
856             }
857 7 50       27 if ($ctg) {
858 7         74 $annotation->add_Annotation(
859             Bio::Annotation::SimpleValue->new(
860             -tagname => 'contig',
861             -value => $ctg
862             )
863             );
864             }
865             }
866             elsif ($buffer =~ /^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines
867 1         12 while ( $buffer =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines
868 3         8 chomp $buffer;
869 3         35 $annotation->add_Annotation(
870             Bio::Annotation::SimpleValue->new(
871             -value => $buffer,
872             -tagname => lc $1
873             )
874             );
875 3         14 $buffer = $self->_readline;
876             }
877             }
878             elsif ( $buffer !~ m{^ORIGIN|//}o ) { # advance to the sequence, if any
879 53         196 while ( defined( $buffer = $self->_readline ) ) {
880 148 100       557 last if $buffer =~ m{^(ORIGIN|//)};
881             }
882             }
883             }
884 111 50       445 if ( not $builder->want_object ) {
885 0         0 $builder->make_object; # implicit end-of-object
886 0         0 next RECORDSTART;
887             }
888 111 100 33     356 if ( $builder->want_slot('seq') ) {
    50          
889             # the fact that we want a sequence does not necessarily mean that
890             # there also is a sequence ...
891 101 100 100     839 if ( defined $buffer and $buffer =~ s/^ORIGIN\s+// ) {
892 93 100 66     516 if ( $annotation and length($buffer) > 0 ) {
893 1         10 $annotation->add_Annotation(
894             'origin',
895             Bio::Annotation::SimpleValue->new(
896             -tagname => 'origin',
897             -value => $buffer
898             )
899             );
900             }
901 93         153 my $seqc = '';
902 93         294 while ( defined( $buffer = $self->_readline ) ) {
903 32735 100       42408 last if $buffer =~ m{^//};
904 32643         29546 $buffer = uc $buffer;
905 32643         206020 $buffer =~ s/[^A-Za-z]//g;
906 32643         55632 $seqc .= $buffer;
907             }
908              
909 93         443 $builder->add_slot_value( -seq => $seqc );
910             }
911             }
912             elsif ( defined($buffer) and ( substr( $buffer, 0, 2 ) ne '//' ) ) {
913             # advance to the end of the record
914 10         20 while ( defined( $buffer = $self->_readline ) ) {
915 172 100       338 last if substr( $buffer, 0, 2 ) eq '//';
916             }
917             }
918              
919             # Unlikely, but maybe the sequence is so weird that we don't want it
920             # anymore. We don't want to return undef if the stream's not exhausted
921             # yet.
922 111         392 $seq = $builder->make_object;
923 111 50       311 next RECORDSTART unless $seq;
924 111         2687 last RECORDSTART;
925             } # end while RECORDSTART
926              
927 111         533 return $seq;
928             }
929              
930             =head2 write_seq
931              
932             Title : write_seq
933             Usage : $stream->write_seq($seq)
934             Function: writes the $seq object (must be seq) to the stream
935             Returns : 1 for success and 0 for error
936             Args : array of 1 to n Bio::SeqI objects
937              
938             =cut
939              
940             sub write_seq {
941 25     25 1 83 my ($self,@seqs) = @_;
942              
943 25         69 foreach my $seq ( @seqs ) {
944 25 50       105 $self->throw("Attempting to write with no seq!") unless defined $seq;
945              
946 25 50 33     205 if ( not ref $seq or not $seq->isa('Bio::SeqI') ) {
947 0         0 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
948             }
949              
950 25         123 my $str = $seq->seq;
951 25         124 my $len = $seq->length;
952 25         99 my $alpha = $seq->alphabet;
953              
954 25         41 my ($div, $mol);
955 25 100 66     235 if ( not $seq->can('division')
956             or not defined($div = $seq->division)
957             ) {
958 2         4 $div = 'UNK';
959             }
960 25 100 66     190 if ( not $seq->can('molecule')
961             or not defined ($mol = $seq->molecule)
962             ) {
963 2   100     10 $mol = $alpha || 'DNA';
964             }
965              
966 25 100       92 my $circular = ($seq->is_circular) ? 'circular' : 'linear ';
967              
968 25         164 local($^W) = 0; # supressing warnings about uninitialized fields.
969              
970 25         39 my $temp_line;
971 25 50       87 if ( $self->_id_generation_func ) {
972 0         0 $temp_line = &{$self->_id_generation_func}($seq);
  0         0  
973             }
974             else {
975 25         44 my $date = '';
976 25 100       132 if ( $seq->can('get_dates') ) {
977 23         71 ($date) = $seq->get_dates;
978             }
979              
980 25 50       100 $self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]")
981             if $seq->display_id =~ /\s/;
982              
983 25 100       127 my @data = ( lc($alpha) eq 'protein' ) ? ('aa', '', '') : ('bp', '', $mol);
984 25         99 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n",
985             'LOCUS', $seq->id, $len,
986             @data, $circular, $div, $date);
987             }
988              
989 25         244 $self->_print($temp_line);
990 25         112 $self->_write_line_GenBank_regex("DEFINITION ", " ",
991             $seq->desc, "\\s\+\|\$",80);
992              
993             # if there, write the accession line
994              
995 25 50       90 if ( $self->_ac_generation_func ) {
996 0         0 $temp_line = &{$self->_ac_generation_func}($seq);
  0         0  
997 0         0 $self->_print("ACCESSION $temp_line\n");
998             }
999             else {
1000 25         61 my @acc = ();
1001 25         111 push @acc, $seq->accession_number;
1002 25 100       131 if ( $seq->isa('Bio::Seq::RichSeqI') ) {
1003 23         158 push @acc, $seq->get_secondary_accessions;
1004             }
1005 25         136 $self->_print("ACCESSION ", join(" ", @acc), "\n");
1006             # otherwise - cannot print
1007             }
1008              
1009             # if PID defined, print it
1010 25 50 66     170 if ($seq->isa('Bio::Seq::RichSeqI') and $seq->pid) {
1011 0         0 $self->_print("PID ", $seq->pid, "\n");
1012             }
1013              
1014             # if there, write the version line
1015 25 50 100     86 if ( defined $self->_sv_generation_func ) {
    100          
1016 0         0 $temp_line = &{$self->_sv_generation_func}($seq);
  0         0  
1017 0 0       0 if ( $temp_line ) {
1018 0         0 $self->_print("VERSION $temp_line\n");
1019             }
1020             }
1021             elsif ($seq->isa('Bio::Seq::RichSeqI') and defined($seq->seq_version)) {
1022 18         68 my $id = $seq->primary_id; # this may be a GI number
1023 18 50 33     213 my $data = (defined $id and $id =~ /^\d+$/) ? " GI:$id" : "";
1024 18         58 $self->_print("VERSION ",
1025             $seq->accession_number, ".",
1026             $seq->seq_version, $data, "\n");
1027             }
1028              
1029             # if there, write the PROJECT line
1030 25         78 for my $proj ( $seq->annotation->get_Annotations('project') ) {
1031 2         11 $self->_print("PROJECT ".$proj->value."\n");
1032             }
1033              
1034             # if there, write the DBSOURCE line
1035 25         65 foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
1036 2         11 my ($db, $id) = ($ref->database, $ref->primary_id);
1037 2 100       9 my $prefix = $db eq 'Project' ? 'DBLINK' : 'DBSOURCE';
1038 2 100       13 my $text = $db eq 'GenBank' ? ''
    50          
1039             : $db eq 'Project' ? "$db:$id"
1040             : "$db accession $id";
1041 2         13 $self->_print(sprintf ("%-11s %s\n", $prefix, $text));
1042             }
1043              
1044             # if there, write the keywords line
1045 25 50       82 if ( defined $self->_kw_generation_func ) {
    100          
1046 0         0 $temp_line = &{$self->_kw_generation_func}($seq);
  0         0  
1047 0         0 $self->_print("KEYWORDS $temp_line\n");
1048             }
1049             elsif ( $seq->can('keywords') ) {
1050 23         66 my $kw = $seq->keywords;
1051 23 50       96 $kw .= '.' if ( $kw !~ /\.$/ );
1052 23         81 $self->_print("KEYWORDS $kw\n");
1053             }
1054              
1055             # SEGMENT if it exists
1056 25         79 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
1057 0         0 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
1058             $ref->value));
1059             }
1060              
1061             # Organism lines
1062 25 100       85 if (my $spec = $seq->species) {
1063 22 50       260 my ($on, $sn, $cn) = ($spec->can('organelle') ? $spec->organelle : '',
1064             $spec->scientific_name,
1065             $spec->common_name);
1066 22         39 my @classification;
1067 22 50       115 if ($spec->isa('Bio::Species')) {
1068 22         78 @classification = $spec->classification;
1069 22         46 shift @classification;
1070             }
1071             else {
1072             # Bio::Taxon should have a DB handle of some type attached, so
1073             # derive the classification from that
1074 0         0 my $node = $spec;
1075 0         0 while ($node) {
1076 0   0     0 $node = $node->ancestor || last;
1077 0         0 unshift @classification, $node->node_name;
1078             #$node eq $root && last;
1079             }
1080 0         0 @classification = reverse @classification;
1081             }
1082 22 50       83 my $abname = $spec->name('abbreviated') ? # from genbank file
1083             $spec->name('abbreviated')->[0] : $sn;
1084 22 100       74 my $sl = $on ? "$on " : '';
1085 22 100       74 $sl .= $cn ? "$abname ($cn)" : $abname;
1086              
1087 22         85 $self->_write_line_GenBank_regex("SOURCE ", ' 'x12, $sl, "\\s\+\|\$", 80);
1088 22         73 $self->_print(" ORGANISM ", $spec->scientific_name, "\n");
1089 22         106 my $OC = join('; ', reverse @classification) . '.';
1090 22         94 $self->_write_line_GenBank_regex(' 'x12,' 'x12, $OC, "\\s\+\|\$", 80);
1091             }
1092              
1093             # Reference lines
1094 25         61 my $count = 1;
1095 25         79 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
1096 47         80 $temp_line = "REFERENCE $count";
1097 47 50       149 if ($ref->start) {
    0          
1098 47 100       140 $temp_line .= sprintf (" (%s %d to %d)",
1099             ($seq->alphabet() eq "protein" ?
1100             "residues" : "bases"),
1101             $ref->start, $ref->end);
1102             }
1103             elsif ($ref->gb_reference) {
1104 0         0 $temp_line .= sprintf (" (%s)", $ref->gb_reference);
1105             }
1106 47         164 $self->_print("$temp_line\n");
1107 47         122 $self->_write_line_GenBank_regex(" AUTHORS ", ' 'x12,
1108             $ref->authors, "\\s\+\|\$", 80);
1109 47 100       153 $self->_write_line_GenBank_regex(" CONSRTM ", ' 'x12,
1110             $ref->consortium, "\\s\+\|\$", 80) if $ref->consortium;
1111 47         142 $self->_write_line_GenBank_regex(" TITLE ", ' 'x12,
1112             $ref->title, "\\s\+\|\$", 80);
1113 47         134 $self->_write_line_GenBank_regex(" JOURNAL ", ' 'x12,
1114             $ref->location, "\\s\+\|\$", 80);
1115 47 100       120 if ( $ref->medline) {
1116 5         18 $self->_write_line_GenBank_regex(" MEDLINE ", ' 'x12,
1117             $ref->medline, "\\s\+\|\$", 80);
1118             # I am assuming that pubmed entries only exist when there
1119             # are also MEDLINE entries due to the indentation
1120             }
1121             # This could be a wrong assumption
1122 47 100       128 if ( $ref->pubmed ) {
1123 13         84 $self->_write_line_GenBank_regex(" PUBMED ", ' 'x12,
1124             $ref->pubmed, "\\s\+\|\$", 80);
1125             }
1126             # put remark at the end
1127 47 50       158 if ($ref->comment) {
1128 0         0 $self->_write_line_GenBank_regex(" REMARK ", ' 'x12,
1129             $ref->comment, "\\s\+\|\$", 80);
1130             }
1131 47         84 $count++;
1132             }
1133              
1134             # Comment lines
1135 25         155 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
1136 13         39 $self->_write_line_GenBank_regex("COMMENT ", ' 'x12,
1137             $comment->text, "\\s\+\|\$", 80);
1138             }
1139              
1140             # FEATURES section
1141 25         74 $self->_print("FEATURES Location/Qualifiers\n");
1142              
1143 25 50       70 if ( defined $self->_post_sort ) {
1144             # we need to read things into an array. Process. Sort them. Print 'em
1145 0         0 my $post_sort_func = $self->_post_sort;
1146 0         0 my @fth;
1147              
1148 0         0 foreach my $sf ( $seq->top_SeqFeatures ) {
1149 0         0 push @fth, Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1150             }
1151              
1152 0         0 @fth = sort { &$post_sort_func($a, $b) } @fth;
  0         0  
1153              
1154 0         0 foreach my $fth ( @fth ) {
1155 0         0 $self->_print_GenBank_FTHelper($fth);
1156             }
1157             }
1158             else {
1159             # not post sorted. And so we can print as we get them.
1160             # lower memory load...
1161 25         110 foreach my $sf ( $seq->top_SeqFeatures ) {
1162 430         1016 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1163 430         444 foreach my $fth ( @fth ) {
1164 430 50       1088 if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1165 0         0 $sf->throw("Cannot process FTHelper... $fth");
1166             }
1167 430         841 $self->_print_GenBank_FTHelper($fth);
1168             }
1169             }
1170             }
1171              
1172             # deal with WGS; WGS_SCAFLD present only if WGS is also present
1173 25 50       180 if ($seq->annotation->get_Annotations('wgs')) {
1174 0         0 foreach my $wgs (map {$seq->annotation->get_Annotations($_)}
  0         0  
1175             qw(wgs wgs_scaffold)
1176             ) {
1177 0         0 $self->_print(sprintf ("%-11s %s\n",
1178             uc($wgs->tagname),
1179             $wgs->value));
1180             }
1181 0         0 $self->_show_dna(0);
1182             }
1183 25 100       80 if ($seq->annotation->get_Annotations('contig')) {
1184 1         4 my $ct = 0;
1185 1         2 my $cline;
1186 1         4 foreach my $contig ($seq->annotation->get_Annotations('contig')) {
1187 1 50       5 unless ($ct) {
1188 1         6 $cline = uc($contig->tagname) . " " . $contig->value . "\n";
1189             }
1190             else {
1191 0         0 $cline = " " . $contig->value . "\n";
1192             }
1193 1         5 $self->_print($cline);
1194 1         2 $ct++;
1195             }
1196             }
1197 25 100       93 if ( $seq->length == 0 ) {
1198 1         2 $self->_show_dna(0);
1199             }
1200              
1201 25 100       83 if ( $self->_show_dna == 0 ) {
1202 1         2 $self->_print("\n//\n");
1203 1         19 return;
1204             }
1205              
1206             # finished printing features.
1207              
1208 24         382 $str =~ tr/A-Z/a-z/;
1209              
1210 24         84 my ($o) = $seq->annotation->get_Annotations('origin');
1211 24 50       160 $self->_print(sprintf("%-12s%s\n",
1212             'ORIGIN', $o ? $o->value : ''));
1213             # print out the sequence
1214 24         45 my $nuc = 60; # Number of nucleotides per line
1215 24         84 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
1216 24         38 my $out_pat = 'A11' x 6; # Pattern for packing a line
1217 24         37 my $length = length $str;
1218              
1219             # Calculate the number of nucleotides which fit on whole lines
1220 24         102 my $whole = int($length / $nuc) * $nuc;
1221              
1222             # Print the whole lines
1223 24         32 my $i;
1224 24         99 for ($i = 0; $i < $whole; $i += $nuc) {
1225 1953         6419 my $blocks = pack $out_pat,
1226             unpack $whole_pat,
1227             substr($str, $i, $nuc);
1228 1953         2642 chop $blocks;
1229 1953         5236 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
1230             }
1231              
1232             # Print the last line
1233 24 100       88 if (my $last = substr($str, $i)) {
1234 19         33 my $last_len = length($last);
1235 19         111 my $last_pat = 'a10' x int($last_len / 10)
1236             . 'a' . $last_len % 10;
1237 19         93 my $blocks = pack $out_pat,
1238             unpack($last_pat, $last);
1239 19         116 $blocks =~ s/ +$//;
1240 19         176 $self->_print(sprintf("%9d $blocks\n",
1241             $length - $last_len + 1));
1242             }
1243              
1244 24         65 $self->_print("//\n");
1245              
1246 24 50 33     81 $self->flush if $self->_flush_on_write && defined $self->_fh;
1247 24         190 return 1;
1248             }
1249             }
1250              
1251             =head2 _print_GenBank_FTHelper
1252              
1253             Title : _print_GenBank_FTHelper
1254             Usage :
1255             Function:
1256             Example :
1257             Returns :
1258             Args :
1259              
1260             =cut
1261              
1262             sub _print_GenBank_FTHelper {
1263 430     430   383 my ( $self, $fth ) = @_;
1264              
1265 430 50 33     1582 if ( not ref $fth or not $fth->isa('Bio::SeqIO::FTHelper') ) {
1266 0         0 $fth->warn(
1267             "$fth is not a FTHelper class. Attempting to print but there could be issues"
1268             );
1269             }
1270              
1271 430 50       680 my $spacer = ( length $fth->key >= 15 ) ? ' ' : '';
1272 430         783 $self->_write_line_GenBank_regex(
1273             sprintf( " %-16s%s", $fth->key, $spacer ),
1274             " " x 21, $fth->loc, "\,\|\$", 80 );
1275              
1276 430         561 foreach my $tag ( keys %{ $fth->field } ) {
  430         829  
1277             # Account for hash structure in Annotation::DBLink, not the expected array
1278 997 50 66     2208 if ( $tag eq 'db_xref' and grep /HASH/, @{ $fth->field->{$tag} }) {
  140         278  
1279 0         0 for my $ref ( @{ $fth->field->{$tag} } ) {
  0         0  
1280 0         0 my $db = $ref->{'database'};
1281 0         0 my $id = $ref->{'primary_id'};
1282 0         0 $self->_write_line_GenBank_regex
1283             ( " " x 21, " " x 21,
1284             "/$tag=\"$db:$id\"", "\.\|\$", 80 );
1285             }
1286             }
1287             # The usual case, where all values are found in an array
1288             else {
1289 997         841 foreach my $value ( @{ $fth->field->{$tag} } ) {
  997         1548  
1290 1085         1379 $value =~ s/\"/\"\"/g;
1291 1085 50 100     3357 if ( $value eq "_no_value" ) {
    100          
1292 0         0 $self->_write_line_GenBank_regex
1293             ( " " x 21, " " x 21,
1294             "/$tag", "\.\|\$", 80 );
1295             }
1296              
1297             # There are almost 3x more quoted qualifier values and they
1298             # are more common too so we take quoted ones first.
1299             # Long qualifiers, that will be line wrapped, are always quoted
1300             elsif ( not $FTQUAL_NO_QUOTE{$tag}
1301             or length("/$tag=$value") >= $FTQUAL_LINE_LENGTH
1302             ) {
1303 976 100       2213 my ($pat) = ( $value =~ /\s/ ? '\s|$' : '.|$' );
1304 976         2831 $self->_write_line_GenBank_regex
1305             ( " " x 21, " " x 21,
1306             "/$tag=\"$value\"", $pat, 80 );
1307             }
1308             else {
1309 109         357 $self->_write_line_GenBank_regex
1310             ( " " x 21, " " x 21,
1311             "/$tag=$value", "\.\|\$", 80 );
1312             }
1313             }
1314             }
1315             }
1316             }
1317              
1318             =head2 _read_GenBank_References
1319              
1320             Title : _read_GenBank_References
1321             Usage :
1322             Function: Reads references from GenBank format. Internal function really
1323             Returns :
1324             Args :
1325              
1326             =cut
1327              
1328             sub _read_GenBank_References {
1329 100     100   159 my ($self, $buffer) = @_;
1330 100         134 my (@refs);
1331             my $ref;
1332              
1333             # assumme things are starting with RN
1334 100 50       504 if ( $$buffer !~ /^REFERENCE/ ) {
1335 0         0 warn("Not parsing line '$$buffer' which maybe important");
1336             }
1337              
1338 100         159 my $line = $$buffer;
1339              
1340 100         127 my (@title,@loc,@authors,@consort,@com,@medline,@pubmed);
1341              
1342             REFLOOP:
1343 100   66     416 while( defined($line) or defined($line = $self->_readline) ) {
1344 1379 100       3196 if ($line =~ /^\s{2}AUTHORS\s+(.*)/o) {
1345 402         1197 push @authors, $1;
1346 402         780 while ( defined($line = $self->_readline) ) {
1347 1040 100       2410 if ($line =~ /^\s{9,}(.*)/o) {
1348 638         975 push @authors, $1;
1349 638         1066 next;
1350             }
1351 402         398 last;
1352             }
1353 402         1622 $ref->authors(join(' ', @authors));
1354             }
1355              
1356 1379 100       2132 if ($line =~ /^\s{2}CONSRTM\s+(.*)/o) {
1357 35         298 push @consort, $1;
1358 35         86 while ( defined($line = $self->_readline) ) {
1359 36 100       108 if ($line =~ /^\s{9,}(.*)/o) {
1360 1         4 push @consort, $1;
1361 1         3 next;
1362             }
1363 35         45 last;
1364             }
1365 35         157 $ref->consortium(join(' ', @consort));
1366             }
1367              
1368 1379 100       2622 if ($line =~ /^\s{2}TITLE\s+(.*)/o) {
1369 408         729 push @title, $1;
1370 408         781 while ( defined($line = $self->_readline) ) {
1371 693 100       1756 if ($line =~ /^\s{9,}(.*)/o) {
1372 285         477 push @title, $1;
1373 285         550 next;
1374             }
1375 408         478 last;
1376             }
1377 408         1651 $ref->title(join(' ', @title));
1378             }
1379              
1380 1379 100       2685 if ($line =~ /^\s{2}JOURNAL\s+(.*)/o) {
1381 416         751 push @loc, $1;
1382 416         761 while ( defined($line = $self->_readline) ) {
1383             # we only match when there are at least 4 spaces
1384             # there is probably a better way to match this
1385             # as it assumes that the describing tag is short enough
1386 565 100       1349 if ($line =~ /^\s{9,}(.*)/o) {
1387 149         526 push @loc, $1;
1388 149         279 next;
1389             }
1390 416         392 last;
1391             }
1392 416         1500 $ref->location(join(' ', @loc));
1393 416         528 redo REFLOOP;
1394             }
1395              
1396 963 100       1476 if ($line =~ /^\s{2}REMARK\s+(.*)/o) {
1397 59         102 push @com, $1;
1398 59         103 while ( defined($line = $self->_readline) ) {
1399 83 100       190 if ($line =~ /^\s{9,}(.*)/o) {
1400 24         46 push @com, $1;
1401 24         44 next;
1402             }
1403 59         56 last;
1404             }
1405 59         188 $ref->comment(join(' ', @com));
1406 59         61 redo REFLOOP;
1407             }
1408              
1409 904 100       1547 if ( $line =~ /^\s{2}MEDLINE\s+(.*)/ ) {
1410 130         293 push @medline, $1;
1411 130         277 while ( defined($line = $self->_readline) ) {
1412 130 50       368 if ($line =~ /^\s{9,}(.*)/) {
1413 0         0 push @medline, $1;
1414 0         0 next;
1415             }
1416 130         147 last;
1417             }
1418 130         392 $ref->medline(join(' ', @medline));
1419 130         150 redo REFLOOP;
1420             }
1421              
1422 774 100       1568 if ( $line =~ /^\s{3}PUBMED\s+(.*)/ ) {
1423 237         531 push @pubmed, $1;
1424 237         502 while ( defined($line = $self->_readline) ) {
1425 237 50       582 if ($line =~ /^\s{9,}(.*)/) {
1426 0         0 push @pubmed, $1;
1427 0         0 next;
1428             }
1429 237         234 last;
1430             }
1431 237         644 $ref->pubmed(join(' ', @pubmed));
1432 237         245 redo REFLOOP;
1433             }
1434              
1435 537 100       1196 if ( $line =~ /^REFERENCE/o ) {
1436             # store current reference
1437 417 100       1058 $self->_add_ref_to_array(\@refs,$ref) if defined $ref;
1438              
1439             # reset
1440 417         661 @authors = ();
1441 417         457 @title = ();
1442 417         487 @loc = ();
1443 417         399 @com = ();
1444 417         378 @pubmed = ();
1445 417         347 @medline = ();
1446              
1447             # create the new reference object
1448 417         1892 $ref = Bio::Annotation::Reference->new(-tagname => 'reference');
1449              
1450             # check whether start and end base is given
1451 417 100       1913 if ($line =~ /^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
    100          
1452 286         720 $ref->start($1);
1453 286         550 $ref->end($2);
1454             }
1455             elsif ($line =~ /^REFERENCE\s+\d+\s+\((.*)\)/) {
1456 14         48 $ref->gb_reference($1);
1457             }
1458             }
1459              
1460 537 100       2761 last if ($line =~ /^(FEATURES)|(COMMENT)/o);
1461              
1462 437         1757 $line = undef; # Empty $line to trigger read of next line
1463             }
1464              
1465             # store last reference
1466 100 50       402 $self->_add_ref_to_array(\@refs, $ref) if defined $ref;
1467              
1468 100         141 $$buffer = $line;
1469              
1470             #print "\nnumber of references found: ", $#refs+1,"\n";
1471              
1472 100         444 return @refs;
1473             }
1474              
1475             =head2 _add_ref_to_array
1476              
1477             Title: _add_ref_to_array
1478             Usage:
1479             Function: Adds a Reference object to an array of Reference objects, takes
1480             care of possible cleanups to be done (currently, only author and title
1481             will be chopped of trailing semicolons).
1482             Args: A reference to an array of Reference objects and
1483             the Reference object to be added
1484             Returns: nothing
1485              
1486             =cut
1487              
1488             sub _add_ref_to_array {
1489 417     417   485 my ($self, $refs, $ref) = @_;
1490              
1491             # first, polish author and title by removing possible trailing semicolons
1492 417         702 my $au = $ref->authors;
1493 417         728 my $title = $ref->title;
1494 417 100       899 $au =~ s/;\s*$//g if $au;
1495 417 100       752 $title =~ s/;\s*$//g if $title;
1496 417         685 $ref->authors($au);
1497 417         722 $ref->title($title);
1498             # the rest should be clean already, so go ahead and add it
1499 417         336 push @{$refs}, $ref;
  417         704  
1500             }
1501              
1502             =head2 _read_GenBank_Species
1503              
1504             Title : _read_GenBank_Species
1505             Usage :
1506             Function: Reads the GenBank Organism species and classification
1507             lines. Able to deal with unconvential Organism naming
1508             formats, and varietas in plants
1509             Example : ORGANISM unknown marine gamma proteobacterium NOR5
1510             $genus = undef
1511             $species = unknown marine gamma proteobacterium NOR5
1512              
1513             ORGANISM Drosophila sp. 'white tip scutellum'
1514             $genus = Drosophila
1515             $species = sp. 'white tip scutellum'
1516             (yes, this really is a species and that is its name)
1517             $subspecies = undef
1518              
1519             ORGANISM Ajellomyces capsulatus var. farciminosus
1520             $genus = Ajellomyces
1521             $species = capsulatus
1522             $subspecies = var. farciminosus
1523              
1524             ORGANISM Hepatitis delta virus
1525             $genus = undef (though this virus has a genus in its lineage, we
1526             cannot know that without a database lookup)
1527             $species = Hepatitis delta virus
1528              
1529             Returns : A Bio::Species object
1530             Args : A reference to the current line buffer
1531              
1532             =cut
1533              
1534             sub _read_GenBank_Species {
1535 101     101   126 my ($self, $buffer) = @_;
1536              
1537 101         426 my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
1538             'Unspecified', 'Unknown', 'None', 'unclassified',
1539             'unidentified organism', 'not supplied');
1540             # dictionary of synonyms for taxid 32644
1541 101         209 my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified');
1542             # all above can be part of valid species name
1543              
1544 101         156 my $line = $$buffer;
1545              
1546 101         140 my( $sub_species, $species, $genus, $sci_name, $common,
1547             $class_lines, $source_flag, $abbr_name, $organelle, $sl );
1548 101         199 my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
  303         636  
1549              
1550             # upon first entering the loop, we must not read a new line -- the SOURCE
1551             # line is already in the buffer (HL 05/10/2000)
1552 101         145 my ($ann, $tag, $data);
1553 101   66     362 while (defined($line) or defined($line = $self->_readline)) {
1554             # de-HTMLify (links that may be encountered here don't contain
1555             # escaped '>', so a simple-minded approach suffices)
1556 517         635 $line =~ s{<[^>]+>}{}g;
1557 517 100       1760 if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
1558 303   50     1071 ($tag, $data) = ($1, $2 || '');
1559 303 100 66     1240 last if ($tag and not exists $source{$tag});
1560             }
1561             else {
1562 214 50       385 return unless $tag;
1563 214         620 ($data = $line) =~ s{^\s+}{};
1564 214         268 chomp $data;
1565 214 100 100     1368 $tag = 'CLASSIFICATION' if ( $tag ne 'CLASSIFICATION'
      100        
1566             and $tag eq 'ORGANISM'
1567             # Don't match "str." or "var." (fix NC_021815),
1568             # and don't match ".1" (fix NC_021902)
1569             and $line =~ m{(?
1570             }
1571 416 100       1175 (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
1572 416         1221 $line = undef;
1573             }
1574              
1575 101         378 ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
1576              
1577 101         200 $$buffer = $line;
1578              
1579 101 50       222 $sci_name or return;
1580              
1581             # parse out organelle, common name, abbreviated name if present;
1582             # this should catch everything, but falls back to
1583             # entire SOURCE line just in case
1584 101 50       1103 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?
1585             \s*(.*?)
1586             \s*(?: \( (.*?) \) )?\.?
1587             $
1588             }xms
1589             ) {
1590 101         339 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
1591             }
1592             else {
1593 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
1594             }
1595              
1596             # Convert data in classification lines into classification array.
1597             # only split on ';' or '.' so that classification that is 2 or more words will
1598             # still get matched, use map() to remove trailing/leading/intervening spaces
1599 101         819 my @class = map { $_ =~ s/^\s+//;
  955         1424  
1600 955         949 $_ =~ s/\s+$//;
1601 955         877 $_ =~ s/\s{2,}/ /g;
1602 955         1166 $_; }
1603             split /(?
1604              
1605             # do we have a genus?
1606 101 50       508 my $possible_genus = quotemeta($class[-1])
1607             . ($class[-2] ? "|" . quotemeta($class[-2]) : '');
1608 101 100       3704 if ($sci_name =~ /^($possible_genus)/) {
1609 88         401 $genus = $1;
1610 88         1125 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1611             }
1612             else {
1613 13         28 $species = $sci_name;
1614             }
1615              
1616             # is this organism of rank species or is it lower?
1617             # (we don't catch everything lower than species, but it doesn't matter -
1618             # this is just so we abide by previous behaviour whilst not calling a
1619             # species a subspecies)
1620 101 100 66     1430 if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
1621 5         18 ($species, $sub_species) = ($1, $2);
1622             }
1623              
1624             # Don't make a species object if it's empty or "Unknown" or "None"
1625             # return unless $genus and $genus !~ /^(Unknown|None)$/oi;
1626             # Don't make a species object if it belongs to taxid 32644
1627             # my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
1628 101         174 my $unkn = grep { $_ eq $sl } @unkn_names;
  1010         1131  
1629 101 50 33     649 return unless (defined $species or defined $genus) and $unkn == 0;
      33        
1630              
1631             # Bio::Species array needs array in Species -> Kingdom direction
1632 101         170 push @class, $sci_name;
1633 101         213 @class = reverse @class;
1634              
1635 101         709 my $make = Bio::Species->new;
1636 101         324 $make->scientific_name($sci_name);
1637 101 50       516 $make->classification(@class) if @class > 0;
1638 101 100       372 $make->common_name( $common ) if $common;
1639 101 50       465 $make->name('abbreviated', $abbr_name) if $abbr_name;
1640 101 100       257 $make->organelle($organelle) if $organelle;
1641             #$make->sub_species( $sub_species ) if $sub_species;
1642 101         1111 return $make;
1643             }
1644              
1645             =head2 _read_FTHelper_GenBank
1646              
1647             Title : _read_FTHelper_GenBank
1648             Usage : _read_FTHelper_GenBank($buffer)
1649             Function: reads the next FT key line
1650             Example :
1651             Returns : Bio::SeqIO::FTHelper object
1652             Args : filehandle and reference to a scalar
1653              
1654             =cut
1655              
1656             sub _read_FTHelper_GenBank {
1657 8280     8280   7559 my ($self, $buffer) = @_;
1658              
1659 8280         6221 my ($key, # The key of the feature
1660             $loc # The location line from the feature
1661             );
1662 8280         8458 my @qual = (); # An array of lines making up the qualifiers
1663              
1664 8280 50       50487 if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
1665 8280         15136 $key = $1;
1666 8280         10121 $loc = $2;
1667             # Read all the lines up to the next feature
1668 8280         17686 while ( defined(my $line = $self->_readline) ) {
1669 88592 100       386490 if ($line =~ /^(\s+)(.+?)\s*$/o) {
1670             # Lines inside features are preceded by 21 spaces
1671             # A new feature is preceded by 5 spaces
1672 88489 100       123207 if (length($1) > 6) {
1673             # Add to qualifiers if we're in the qualifiers, or if it's
1674             # the first qualifier
1675 80312 100 100     163820 if (@qual or (index($2,'/') == 0)) {
1676 68121         157126 push @qual, $2;
1677             }
1678             # We're still in the location line, so append to location
1679             else {
1680 12191         29294 $loc .= $2;
1681             }
1682             }
1683             else {
1684             # We've reached the start of the next feature
1685             # Put the first line of the next feature into the buffer
1686 8177         7737 $$buffer = $line;
1687 8177         11783 last;
1688             }
1689             }
1690             else {
1691             # We're at the end of the feature table
1692             # Put the first line of the next feature into the buffer
1693 103         188 $$buffer = $line;
1694 103         178 last;
1695             }
1696             }
1697             }
1698             else {
1699             # No feature key
1700 0         0 $self->debug("no feature key!\n");
1701             # change suggested by JDiggans to avoid infinite loop-
1702             # see bugreport 1062.
1703             # reset buffer to prevent infinite loop
1704 0         0 $$buffer = $self->_readline;
1705 0         0 return;
1706             }
1707              
1708             # Make the new FTHelper object
1709 8280         18970 my $out = Bio::SeqIO::FTHelper->new;
1710 8280         16946 $out->verbose($self->verbose);
1711 8280         13153 $out->key($key);
1712 8280         12068 $out->loc($loc);
1713              
1714             # Now parse and add any qualifiers. (@qual is kept
1715             # intact to provide informative error messages.)
1716             QUAL:
1717 8280         16304 for (my $i = 0; $i < @qual; $i++) {
1718 55807         45528 my $data = $qual[$i];
1719 55807 50       196061 my ( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.+))?})
1720             or $self->warn( "cannot see new qualifier in feature $key: "
1721             . $data);
1722 55807 50       69452 $qualifier = '' if not defined $qualifier;
1723              
1724 55807 100       56119 if (defined $value) {
1725             # Do we have a quoted value?
1726 54903 100       69734 if (substr($value, 0, 1) eq '"') {
    100          
1727             # Keep adding to value until we find the trailing quote
1728             # and the quotes are balanced
1729 52083   66     191605 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
1730 12312 50       17233 if ($i >= $#qual) {
1731 0         0 $self->warn( "Unbalanced quote in:\n"
1732             . join("\n", @qual)
1733             . "No further qualifiers will "
1734             . "be added for this feature");
1735 0         0 last QUAL;
1736             }
1737             # modifying a for-loop variable inside of the loop
1738             # is not the best programming style ...
1739 12312         9333 $i++;
1740 12312         9545 my $next = $qual[$i];
1741              
1742             # add to value with a space unless the value appears
1743             # to be a sequence (translation for example)
1744             # if (($value.$next) =~ /[^A-Za-z\"\-]/o) {
1745             # changed to explicitly look for translation tag - cjf 06/8/29
1746 12312 100       17638 if ($qualifier !~ /^translation$/i ) {
1747 9870         10364 $value .= " ";
1748             }
1749 12312         48784 $value .= $next;
1750             }
1751             # Trim leading and trailing quotes
1752 52083         233636 $value =~ s/^"|"$//g;
1753             # Undouble internal quotes
1754 52083         51590 $value =~ s/""/\"/g;
1755             }
1756             elsif ( $value =~ /^\(/ ) { # values quoted by ()s
1757             # Keep adding to value until we find the trailing bracket
1758             # and the ()s are balanced
1759 27         42 my $left = ($value =~ tr/\(/\(/); # count left parens
1760 27         34 my $right = ($value =~ tr/\)/\)/); # count right parens
1761              
1762 27         56 while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right"
1763 2 50       7 if ( $i >= $#qual) {
1764 0         0 $self->warn( "Unbalanced parens in:\n"
1765             . join("\n", @qual)
1766             . "\nNo further qualifiers will "
1767             . "be added for this feature");
1768 0         0 last QUAL;
1769             }
1770 2         4 $i++;
1771 2         3 my $next = $qual[$i];
1772 2         4 $value .= $next;
1773 2         4 $left += ($next =~ tr/\(/\(/);
1774 2         6 $right += ($next =~ tr/\)/\)/);
1775             }
1776             }
1777             }
1778             else {
1779 904         981 $value = '_no_value';
1780             }
1781             # Store the qualifier
1782 55807   100     76350 $out->field->{$qualifier} ||= [];
1783 55807         39024 push @{$out->field->{$qualifier}}, $value;
  55807         62263  
1784             }
1785 8280         16612 return $out;
1786             }
1787              
1788             =head2 _write_line_GenBank
1789              
1790             Title : _write_line_GenBank
1791             Usage :
1792             Function: internal function
1793             Example :
1794             Returns :
1795             Args :
1796              
1797             =cut
1798              
1799             sub _write_line_GenBank {
1800 0     0   0 my ($self, $pre1, $pre2, $line, $length) = @_;
1801              
1802 0 0       0 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1803 0         0 my $subl = $length - length $pre2;
1804 0         0 my $linel = length $line;
1805 0         0 my $i;
1806              
1807 0         0 my $subr = substr($line,0,$length - length $pre1);
1808              
1809 0         0 $self->_print("$pre1$subr\n");
1810 0         0 for($i = ($length - length $pre1); $i < $linel; $i += $subl) {
1811 0         0 $subr = substr($line, $i, $subl);
1812 0         0 $self->_print("$pre2$subr\n");
1813             }
1814             }
1815              
1816             =head2 _write_line_GenBank_regex
1817              
1818             Title : _write_line_GenBank_regex
1819             Usage :
1820             Function: internal function for writing lines of specified
1821             length, with different first and the next line
1822             left hand headers and split at specific points in the
1823             text
1824             Example :
1825             Returns : nothing
1826             Args : file handle,
1827             first header,
1828             second header,
1829             text-line,
1830             regex for line breaks,
1831             total line length
1832              
1833             =cut
1834              
1835             sub _write_line_GenBank_regex {
1836 1765     1765   2516 my ($self, $pre1, $pre2, $line, $regex, $length) = @_;
1837              
1838             #print STDOUT "Going to print with $line!\n";
1839              
1840 1765 50       2721 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1841              
1842 1765         2306 my $subl = $length - (length $pre1) - 2;
1843 1765         1843 my @lines = ();
1844              
1845             CHUNK:
1846 1765         2549 while ($line) {
1847 2129         3036 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1848 2129 50       26154 if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1849 2129         4839 my $l = $1 . $2;
1850 2129         2839 $line = substr($line, length $l);
1851             # be strict about not padding spaces according to
1852             # genbank format
1853 2129         3932 $l =~ s/\s+$//;
1854 2129 50       3569 next CHUNK if ($l eq '');
1855 2129         2654 push @lines, $l;
1856 2129         5497 next CHUNK;
1857             }
1858             }
1859             # if we get here none of the patterns matched $subl or less chars
1860 0         0 $self->warn( "trouble dissecting \"$line\"\n into chunks "
1861             . "of $subl chars or less - this tag won't print right");
1862             # insert a space char to prevent infinite loops
1863 0         0 $line = substr($line, 0, $subl) . " " . substr($line, $subl);
1864             }
1865 1765         1737 my $s = shift @lines;
1866 1765 100       6388 $self->_print("$pre1$s\n") if $s;
1867 1765         5411 foreach my $s ( @lines ) {
1868 369         706 $self->_print("$pre2$s\n");
1869             }
1870             }
1871              
1872             =head2 _post_sort
1873              
1874             Title : _post_sort
1875             Usage : $obj->_post_sort($newval)
1876             Function:
1877             Returns : value of _post_sort
1878             Args : newvalue (optional)
1879              
1880             =cut
1881              
1882             sub _post_sort {
1883 25     25   35 my ($obj,$value) = @_;
1884 25 50       70 if ( defined $value) {
1885 0         0 $obj->{'_post_sort'} = $value;
1886             }
1887 25         83 return $obj->{'_post_sort'};
1888             }
1889              
1890             =head2 _show_dna
1891              
1892             Title : _show_dna
1893             Usage : $obj->_show_dna($newval)
1894             Function:
1895             Returns : value of _show_dna
1896             Args : newvalue (optional)
1897              
1898             =cut
1899              
1900             sub _show_dna {
1901 141     141   206 my ($obj,$value) = @_;
1902 141 100       356 if ( defined $value) {
1903 116         244 $obj->{'_show_dna'} = $value;
1904             }
1905 141         279 return $obj->{'_show_dna'};
1906             }
1907              
1908             =head2 _id_generation_func
1909              
1910             Title : _id_generation_func
1911             Usage : $obj->_id_generation_func($newval)
1912             Function:
1913             Returns : value of _id_generation_func
1914             Args : newvalue (optional)
1915              
1916             =cut
1917              
1918             sub _id_generation_func {
1919 25     25   43 my ($obj,$value) = @_;
1920 25 50       77 if ( defined $value ) {
1921 0         0 $obj->{'_id_generation_func'} = $value;
1922             }
1923 25         79 return $obj->{'_id_generation_func'};
1924             }
1925              
1926             =head2 _ac_generation_func
1927              
1928             Title : _ac_generation_func
1929             Usage : $obj->_ac_generation_func($newval)
1930             Function:
1931             Returns : value of _ac_generation_func
1932             Args : newvalue (optional)
1933              
1934             =cut
1935              
1936             sub _ac_generation_func {
1937 25     25   45 my ($obj,$value) = @_;
1938 25 50       207 if ( defined $value ) {
1939 0         0 $obj->{'_ac_generation_func'} = $value;
1940             }
1941 25         83 return $obj->{'_ac_generation_func'};
1942             }
1943              
1944             =head2 _sv_generation_func
1945              
1946             Title : _sv_generation_func
1947             Usage : $obj->_sv_generation_func($newval)
1948             Function:
1949             Returns : value of _sv_generation_func
1950             Args : newvalue (optional)
1951              
1952             =cut
1953              
1954             sub _sv_generation_func {
1955 25     25   40 my ($obj,$value) = @_;
1956 25 50       85 if ( defined $value ) {
1957 0         0 $obj->{'_sv_generation_func'} = $value;
1958             }
1959 25         288 return $obj->{'_sv_generation_func'};
1960             }
1961              
1962             =head2 _kw_generation_func
1963              
1964             Title : _kw_generation_func
1965             Usage : $obj->_kw_generation_func($newval)
1966             Function:
1967             Returns : value of _kw_generation_func
1968             Args : newvalue (optional)
1969              
1970             =cut
1971              
1972             sub _kw_generation_func {
1973 25     25   36 my ($obj,$value) = @_;
1974 25 50       72 if ( defined $value ) {
1975 0         0 $obj->{'_kw_generation_func'} = $value;
1976             }
1977 25         169 return $obj->{'_kw_generation_func'};
1978             }
1979              
1980             1;