File Coverage

Bio/SeqIO/genbank.pm
Criterion Covered Total %
statement 612 723 84.6
branch 308 404 76.2
condition 74 111 66.6
subroutine 25 26 96.1
pod 2 2 100.0
total 1021 1266 80.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::genbank
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Bioperl project bioperl-l(at)bioperl.org
7             #
8             # Copyright Elia Stupka and contributors see AUTHORS section
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::genbank - GenBank sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but
21             rather go through the SeqIO handler:
22              
23             $stream = Bio::SeqIO->new(-file => $filename,
24             -format => 'GenBank');
25              
26             while ( my $seq = $stream->next_seq ) {
27             # do something with $seq
28             }
29              
30              
31             =head1 DESCRIPTION
32              
33             This object can transform Bio::Seq objects to and from GenBank flat
34             file databases.
35              
36             There is some flexibility here about how to write GenBank output
37             that is not fully documented.
38              
39             =head2 Optional functions
40              
41             =over 3
42              
43             =item _show_dna()
44              
45             (output only) shows the dna or not
46              
47             =item _post_sort()
48              
49             (output only) provides a sorting func which is applied to the FTHelpers
50             before printing
51              
52             =item _id_generation_func()
53              
54             This is function which is called as
55              
56             print "ID ", $func($seq), "\n";
57              
58             To generate the ID line. If it is not there, it generates a sensible ID
59             line using a number of tools.
60              
61             If you want to output annotations in Genbank format they need to be
62             stored in a Bio::Annotation::Collection object which is accessible
63             through the Bio::SeqI interface method L.
64              
65             The following are the names of the keys which are pulled from a
66             L object:
67              
68             reference - Should contain Bio::Annotation::Reference objects
69             comment - Should contain Bio::Annotation::Comment objects
70             dblink - Should contain a Bio::Annotation::DBLink object
71             segment - Should contain a Bio::Annotation::SimpleValue object
72             origin - Should contain a Bio::Annotation::SimpleValue object
73             wgs - Should contain a Bio::Annotation::SimpleValue object
74              
75             =back
76              
77             =head1 Where does the data go?
78              
79             Data parsed in Bio::SeqIO::genbank is stored in a variety of data
80             fields in the sequence object that is returned. Here is a partial list
81             of fields.
82              
83             Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
84             the top level object which defines a function called NAME() which
85             stores this information.
86              
87             Items listed as Annotation 'NAME' tell you the data is stored the
88             associated Bio::AnnotationCollectionI object which is associated with
89             Bio::Seq objects. If it is explicitly requested that no annotations
90             should be stored when parsing a record of course they will not be
91             available when you try and get them. If you are having this problem
92             look at the type of SeqBuilder that is being used to contruct your
93             sequence object.
94              
95             Comments Annotation 'comment'
96             References Annotation 'reference'
97             Segment Annotation 'segment'
98             Origin Annotation 'origin'
99             Dbsource Annotation 'dblink'
100              
101             Accessions PrimarySeq accession_number()
102             Secondary accessions RichSeq get_secondary_accessions()
103             GI number PrimarySeq primary_id()
104             LOCUS PrimarySeq display_id()
105             Keywords RichSeq get_keywords()
106             Dates RichSeq get_dates()
107             Molecule RichSeq molecule()
108             Seq Version RichSeq seq_version()
109             PID RichSeq pid()
110             Division RichSeq division()
111             Features Seq get_SeqFeatures()
112             Alphabet PrimarySeq alphabet()
113             Definition PrimarySeq description() or desc()
114             Version PrimarySeq version()
115              
116             Sequence PrimarySeq seq()
117              
118             There is more information in the Feature-Annotation HOWTO about each
119             field and how it is mapped to the Sequence object
120             L.
121              
122             =head1 FEEDBACK
123              
124             =head2 Mailing Lists
125              
126             User feedback is an integral part of the evolution of this and other
127             Bioperl modules. Send your comments and suggestions preferably to one
128             of the Bioperl mailing lists. Your participation is much appreciated.
129              
130             bioperl-l@bioperl.org - General discussion
131             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
132              
133             =head2 Support
134              
135             Please direct usage questions or support issues to the mailing list:
136              
137             I
138              
139             rather than to the module maintainer directly. Many experienced and
140             reponsive experts will be able look at the problem and quickly
141             address it. Please include a thorough description of the problem
142             with code and data examples if at all possible.
143              
144             =head2 Reporting Bugs
145              
146             Report bugs to the Bioperl bug tracking system to help us keep track
147             the bugs and their resolution. Bug reports can be submitted via the web:
148              
149             https://github.com/bioperl/bioperl-live/issues
150              
151             =head1 AUTHOR - Bioperl Project
152              
153             bioperl-l at bioperl.org
154              
155             Original author Elia Stupka, elia -at- tigem.it
156              
157             =head1 CONTRIBUTORS
158              
159             Ewan Birney birney at ebi.ac.uk
160             Jason Stajich jason at bioperl.org
161             Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
162             Lincoln Stein lstein at cshl.org
163             Heikki Lehvaslaiho, heikki at ebi.ac.uk
164             Hilmar Lapp, hlapp at gmx.net
165             Donald G. Jackson, donald.jackson at bms.com
166             James Wasmuth, james.wasmuth at ed.ac.uk
167             Brian Osborne, bosborne at alum.mit.edu
168             Chris Fields, cjfields at bioperl dot org
169              
170             =head1 APPENDIX
171              
172             The rest of the documentation details each of the object
173             methods. Internal methods are usually preceded with a _
174              
175             =cut
176              
177             # Let the code begin...
178              
179             package Bio::SeqIO::genbank;
180 16     16   788 use strict;
  16         20  
  16         415  
181              
182 16     16   4085 use Bio::SeqIO::FTHelper;
  16         33  
  16         333  
183 16     16   66 use Bio::SeqFeature::Generic;
  16         21  
  16         229  
184 16     16   4785 use Bio::Species;
  16         27  
  16         351  
185 16     16   67 use Bio::Seq::SeqFactory;
  16         20  
  16         288  
186 16     16   50 use Bio::Annotation::Collection;
  16         19  
  16         568  
187 16     16   4802 use Bio::Annotation::Comment;
  16         24  
  16         346  
188 16     16   4821 use Bio::Annotation::Reference;
  16         27  
  16         354  
189 16     16   66 use Bio::Annotation::DBLink;
  16         17  
  16         276  
190              
191 16     16   48 use base qw(Bio::SeqIO);
  16         16  
  16         116667  
192              
193             # Note that a qualifier that exceeds one line (i.e. a long label) will
194             # automatically be quoted regardless:
195              
196             our $FTQUAL_LINE_LENGTH = 60;
197              
198             our %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
199             anticodon citation
200             codon codon_start
201             cons_splice direction
202             evidence label
203             mod_base number
204             rpt_type rpt_unit
205             transl_except transl_table
206             usedin
207             );
208              
209             our %DBSOURCE = map {$_ => 1} qw(
210             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
211             TIGR GO InterPro Pfam PROSITE SGD GermOnline
212             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
213             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
214             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
215             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
216             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
217             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
218             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
219             PhotoList Gramene WormBase WormPep Genew ZFIN
220             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
221             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
222             DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
223             Project);
224              
225             our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
226             mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT);
227              
228             our %VALID_ALPHABET = (
229             'bp' => 'dna',
230             'aa' => 'protein',
231             'rc' => '' # rc = release candidate; file has no sequences
232             );
233              
234             sub _initialize {
235 115     115   271 my($self, @args) = @_;
236              
237 115         450 $self->SUPER::_initialize(@args);
238             # hash for functions for decoding keys.
239 115         399 $self->{'_func_ftunit_hash'} = {};
240 115         363 $self->_show_dna(1); # sets this to one by default. People can change it
241 115 50       386 if ( not defined $self->sequence_factory ) {
242 115         415 $self->sequence_factory
243             (Bio::Seq::SeqFactory->new(-verbose => $self->verbose,
244             -type => 'Bio::Seq::RichSeq'));
245             }
246             }
247              
248             =head2 next_seq
249              
250             Title : next_seq
251             Usage : $seq = $stream->next_seq()
252             Function: returns the next sequence in the stream
253             Returns : Bio::Seq object
254             Args :
255              
256             =cut
257              
258             sub next_seq {
259 117     117 1 3665 my ($self, @args) = @_;
260 117         219 my %args = @args;
261 117         342 my $builder = $self->sequence_builder;
262 117         148 my $seq;
263             my %params;
264              
265             RECORDSTART:
266 117         147 while (1) {
267 119         132 my $buffer;
268 119         147 my ( @acc, @features );
269 0         0 my ( $display_id, $annotation );
270 0         0 my $species;
271              
272             # initialize; we may come here because of starting over
273 119         176 @features = ();
274 119         161 $annotation = undef;
275 119         146 @acc = ();
276 119         137 $species = undef;
277 119         373 %params = ( -verbose => $self->verbose ); # reset hash
278 119         517 local ($/) = "\n";
279 119         421 while ( defined( $buffer = $self->_readline ) ) {
280 189 100       994 last if index( $buffer, 'LOCUS ' ) == 0;
281             }
282 119 100       300 return unless defined $buffer; # end of file
283 113 50       630 $buffer =~ /^LOCUS\s+(\S.*)$/o
284             or $self->throw( "GenBank stream with bad LOCUS line. "
285             . "Not GenBank in my book. Got '$buffer'");
286 113         555 my @tokens = split( ' ', $1 );
287              
288             # this is important to have the id for display in e.g. FTHelper,
289             # otherwise you won't know which entry caused an error
290 113         178 $display_id = shift @tokens;
291 113         235 $params{'-display_id'} = $display_id;
292              
293             # may still be useful if we don't want the seq
294 113         176 my $seqlength = shift @tokens;
295 113 50       296 if ( exists $VALID_ALPHABET{$seqlength} ) {
296             # moved one token too far. No locus name?
297 0         0 $self->warn( "Bad LOCUS name? Changing [$params{'-display_id'}] "
298             . "to 'unknown' and length to '$display_id'"
299             );
300 0         0 $params{'-display_id'} = 'unknown';
301 0         0 $params{'-length'} = $display_id;
302              
303             # add token back...
304 0         0 unshift @tokens, $seqlength;
305             }
306             else {
307 113         189 $params{'-length'} = $seqlength;
308             }
309              
310             # the alphabet of the entry
311             # shouldn't assign alphabet unless one
312             # is specifically designated (such as for rc files)
313 113         250 my $alphabet = lc( shift @tokens );
314             $params{'-alphabet'} =
315             ( exists $VALID_ALPHABET{$alphabet} )
316 113 50       346 ? $VALID_ALPHABET{$alphabet}
317             : $self->warn("Unknown alphabet: $alphabet");
318              
319             # for aa there is usually no 'molecule' (mRNA etc)
320 113 100       290 if ( $params{'-alphabet'} eq 'protein' ) {
321 10         18 $params{'-molecule'} = 'PRT';
322             }
323             else {
324 103         177 $params{'-molecule'} = shift(@tokens);
325             }
326              
327             # take care of lower case issues
328 113 100 66     595 if ( $params{'-molecule'} eq 'dna' or $params{'-molecule'} eq 'rna' ) {
329 4         9 $params{'-molecule'} = uc $params{'-molecule'};
330             }
331             $self->debug( "Unrecognized molecule type: " . $params{'-molecule'} )
332 113 50       357 if not exists( $VALID_MOLTYPE{ $params{'-molecule'} } );
333              
334 113         153 my $circ = shift @tokens;
335 113 100       252 if ( $circ eq 'circular' ) {
336 8         19 $params{'-is_circular'} = 1;
337 8         24 $params{'-division'} = shift @tokens;
338             }
339             else {
340             # 'linear' or 'circular' may actually be omitted altogether
341 105 100       294 $params{'-division'} =
342             ( CORE::length($circ) == 3 ) ? $circ : shift @tokens;
343             }
344 113         267 my $date = join( ' ', @tokens ); # we lump together the rest
345              
346             # this is per request bug #1513
347             # we can handle:
348             # 9-10-2003
349             # 9-10-03
350             # 09-10-2003
351             # 09-10-03
352 113 100       772 if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
353 107 50       254 if ( length($date) < 11 ) {
354             # improperly formatted date
355             # But we'll be nice and fix it for them
356 0         0 my ( $d, $m, $y ) = ( $2, $3, $4 );
357 0 0       0 if ( length($d) == 1 ) {
358 0         0 $d = "0$d";
359             }
360              
361             # guess the century here
362 0 0       0 if ( length($y) == 2 ) {
363 0 0       0 if ( $y > 60 ) { # arbitrarily guess that '60' means 1960
364 0         0 $y = "19$y";
365             }
366             else {
367 0         0 $y = "20$y";
368             }
369 0         0 $self->warn( "Date was malformed, guessing the "
370             . "century for $date to be $y\n"
371             );
372             }
373 0         0 $params{'-dates'} = [ join( '-', $d, $m, $y ) ];
374             }
375             else {
376 107         266 $params{'-dates'} = [$date];
377             }
378             }
379              
380             # set them all at once
381 113         606 $builder->add_slot_value(%params);
382 113         292 %params = ();
383              
384             # parse the rest if desired, otherwise start over
385 113 100       304 if ( not $builder->want_object ) {
386 2         5 $builder->make_object;
387 2         7 next RECORDSTART;
388             }
389              
390             # set up annotation depending on what the builder wants
391 111 100       278 if ( $builder->want_slot('annotation') ) {
392 106         650 $annotation = Bio::Annotation::Collection->new;
393             }
394              
395 111         284 $buffer = $self->_readline;
396 111         328 while ( defined( my $line = $buffer ) ) {
397             # Description line(s)
398 739 100       1646 if ($line =~ /^DEFINITION\s+(\S.*\S)/) {
399 110         308 my @desc = ($1);
400 110         270 while ( defined( $line = $self->_readline ) ) {
401 172 100       509 if ($line =~ /^\s+(.*)/) {
402 62         164 push( @desc, $1 );
403 62         139 next;
404             }
405 110         144 last;
406             }
407 110         529 $builder->add_slot_value( -desc => join( ' ', @desc ) );
408              
409             # we'll continue right here because DEFINITION
410             # always comes at the top of the entry
411 110         182 $buffer = $line;
412             }
413              
414             # accession number (there can be multiple accessions)
415 739 100       4671 if ($line =~ /^ACCESSION\s+(\S.*\S)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
416 110         455 push( @acc, split( /\s+/, $1 ) );
417 110         291 while ( defined( $line = $self->_readline ) ) {
418 113 100       377 if ($line =~ /^\s+(.*)/) {
419 3         15 push( @acc, split( /\s+/, $1 ) );
420 3         8 next;
421             }
422 110         136 last;
423             }
424 110         136 $buffer = $line;
425 110         226 next;
426             }
427              
428             # PID
429             elsif ($line =~ /^PID\s+(\S+)/) {
430 1         4 $params{'-pid'} = $1;
431             }
432              
433             # Version number
434             elsif ($line =~ /^VERSION\s+(\S.+)$/) {
435 91         297 my ( $acc, $gi ) = split( ' ', $1 );
436 91 100       388 if ( $acc =~ /^\w+\.(\d+)/ ) {
437 87         228 $params{'-version'} = $1;
438 87         156 $params{'-seq_version'} = $1;
439             }
440 91 100 66     491 if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) {
441 89         307 $params{'-primary_id'} = substr( $gi, 3 );
442             }
443             }
444              
445             # Keywords
446             elsif ($line =~ /^KEYWORDS\s+(\S.*)/) {
447 107         410 my @kw = split( /\s*\;\s*/, $1 );
448 107         277 while ( defined( $line = $self->_readline ) ) {
449 138         203 chomp $line;
450 138 100       386 if ($line =~ /^\s+(.*)/) {
451 31         207 push( @kw, split( /\s*\;\s*/, $1 ) );
452 31         69 next;
453             }
454 107         146 last;
455             }
456              
457 107 50       522 @kw && $kw[-1] =~ s/\.$//;
458 107         230 $params{'-keywords'} = \@kw;
459 107         143 $buffer = $line;
460 107         234 next;
461             }
462              
463             # Organism name and phylogenetic information
464             elsif ($line =~ /^SOURCE\s+\S/) {
465 106 100       289 if ( $builder->want_slot('species') ) {
466 101         328 $species = $self->_read_GenBank_Species( \$buffer );
467 101         581 $builder->add_slot_value( -species => $species );
468             }
469             else {
470 5         8 while ( defined( $buffer = $self->_readline ) ) {
471 18 100       35 last if substr( $buffer, 0, 1 ) ne ' ';
472             }
473             }
474 106         432 next;
475             }
476              
477             # References
478             elsif ($line =~ /^REFERENCE\s+\S/) {
479 109 100       257 if ($annotation) {
480 100         415 my @refs = $self->_read_GenBank_References( \$buffer );
481 100         225 foreach my $ref (@refs) {
482 417         852 $annotation->add_Annotation( 'reference', $ref );
483             }
484             }
485             else {
486 9         15 while ( defined( $buffer = $self->_readline ) ) {
487 47 100       98 last if substr( $buffer, 0, 1 ) ne ' ';
488             }
489             }
490 109         336 next;
491             }
492              
493             # Project
494             elsif ($line =~ /^PROJECT\s+(\S.*)/) {
495 2 50       5 if ($annotation) {
496 2         11 my $project =
497             Bio::Annotation::SimpleValue->new( -value => $1 );
498 2         6 $annotation->add_Annotation( 'project', $project );
499             }
500             }
501              
502             # Comments may be plain text or Structured Comments.
503             # Structured Comments are made up of tag/value pairs and have beginning
504             # and end delimiters like ##*-Data-START## and ##*-Data-END##
505             elsif ($line =~ /^COMMENT\s+(\S.*)/) {
506 60 100       135 if ($annotation) {
507 58         154 my $comment = $1;
508 58         160 while ( defined( $line = $self->_readline ) ) {
509 481 100       857 last if ($line =~ /^\S/);
510 423         787 $comment .= $line;
511             }
512 58         559 $comment =~ s/ +/ /g;
513             # Structured Comment, do not remove returns in the tabular section
514 58 100       194 if ( my ( $text, $table )= $comment
515             =~ /([^#]*)(##\S+Data-START##.+?##\S+Data-END##)/is
516             ) {
517 2 100       8 $text =~ s/\n/ /g if $text;
518 2         7 $table =~ s/START##/START##\n/;
519 2         9 $table =~ s/^\s+//gm;
520 2         5 $comment = $text . "\n" . $table;
521             }
522             # Plain text, remove returns
523             else {
524 56         319 $comment =~ s/\n/ /g;
525             }
526 58         560 $annotation->add_Annotation(
527             'comment',
528             Bio::Annotation::Comment->new(
529             -text => $comment,
530             -tagname => 'comment'
531             )
532             );
533 58         313 $buffer = $line;
534             }
535             else {
536 2         4 while ( defined( $buffer = $self->_readline ) ) {
537 7 100       16 last if substr( $buffer, 0, 1 ) ne ' ';
538             }
539             }
540 60         132 next;
541             }
542              
543             # Corresponding Genbank nucleotide id, Genpept only
544             elsif ($line =~ /^DB(?:SOURCE|LINK)\s+(\S.+)/) {
545 20 50       45 if ($annotation) {
546 20         41 my $dbsource = $1;
547 20         51 while ( defined( $line = $self->_readline ) ) {
548 85 100       175 last if ($line =~ /^\S/);
549 65         141 $dbsource .= $line;
550             }
551              
552             # deal with UniProKB dbsources
553 20 100       88 if ( $dbsource =~
554             s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n//
555             ) {
556 5         49 $annotation->add_Annotation(
557             'dblink',
558             Bio::Annotation::DBLink->new(
559             -primary_id => $2,
560             -database => $1,
561             -tagname => 'dblink'
562             )
563             );
564 5 50       52 if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) {
565 5         47 $annotation->add_Annotation(
566             'swissprot_dates',
567             Bio::Annotation::SimpleValue->new(
568             -tagname => 'date_created',
569             -value => $1
570             )
571             );
572             }
573 5         71 while ( $dbsource =~
574             s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
575             ) {
576 5         27 $annotation->add_Annotation(
577             'swissprot_dates',
578             Bio::Annotation::SimpleValue->new(
579             -tagname => 'date_updated',
580             -value => $2
581             )
582             );
583             }
584 5         39 $dbsource =~ s/\n/ /g;
585 5 50       73 if ( $dbsource =~
    0          
586             s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/
587             ) {
588             # will use $i to determine even or odd
589             # for swissprot the accessions are paired
590 5         11 my $i = 0;
591 5         48 for my $dbsrc ( split( /,\s+/, $1 ) ) {
592 49 50 66     264 if ( $dbsrc =~ /(\S+)\.(\d+)/
593             or $dbsrc =~ /(\S+)/
594             ) {
595 49         105 my ( $id, $version ) = ( $1, $2 );
596 49 100       78 $version = '' unless defined $version;
597 49 100       142 my $db = ( $id =~ /^\d\S{3}/ ) ? 'PDB'
    100          
598             : ( $i++ % 2 ) ? 'GenPept'
599             : 'GenBank';
600              
601 49         181 $annotation->add_Annotation(
602             'dblink',
603             Bio::Annotation::DBLink->new(
604             -primary_id => $id,
605             -version => $version,
606             -database => $db,
607             -tagname => 'dblink'
608             )
609             );
610             }
611             }
612             }
613             elsif ( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
614             # download screwed up and ncbi didn't put acc in for gi numbers
615 0         0 my $i = 0;
616 0         0 for my $id ( split( /\,\s+/, $1 ) ) {
617 0         0 my ( $acc, $db );
618 0 0       0 if ( $id =~ /gi:\s+(\d+)/ ) {
    0          
619 0         0 $acc = $1;
620 0 0       0 $db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank';
621             }
622             elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) {
623 0         0 $acc = $1;
624 0         0 $db = 'PDB';
625             }
626             else {
627 0         0 $acc = $id;
628 0         0 $db = '';
629             }
630 0         0 $annotation->add_Annotation(
631             'dblink',
632             Bio::Annotation::DBLink->new(
633             -primary_id => $acc,
634             -database => $db,
635             -tagname => 'dblink'
636             )
637             );
638             }
639             }
640             else {
641 0         0 $self->debug("Cannot match $dbsource\n");
642             }
643 5 50       85 if ( $dbsource =~ s/xrefs\s+
644             \(non\-sequence\s+databases\):\s+
645             ((?:\S+,\s+)+\S+)//x
646             ) {
647 5         64 for my $id ( split( /\,\s+/, $1 ) ) {
648 86         70 my $db;
649              
650             # this is because GenBank dropped the spaces!!!
651             # I'm sure we're not going to get this right
652             ##if ( $id =~ s/^://i ) {
653             ## $db = $1;
654             ##}
655 86         146 $db = substr( $id, 0, index( $id, ':' ) );
656 86 100       179 if ( not exists $DBSOURCE{$db} ) {
657 16         17 $db = ''; # do we want 'GenBank' here?
658             }
659 86         106 $id = substr( $id, index( $id, ':' ) + 1 );
660 86         265 $annotation->add_Annotation(
661             'dblink',
662             Bio::Annotation::DBLink->new(
663             -primary_id => $id,
664             -database => $db,
665             -tagname => 'dblink'
666             )
667             );
668             }
669             }
670             }
671             else {
672 15 100       86 if ( $dbsource =~
    100          
    50          
673             /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/
674             ) {
675 4         12 my ( $db, $id, $version ) = ( $1, $2, $3 );
676 4   100     33 $annotation->add_Annotation(
677             'dblink',
678             Bio::Annotation::DBLink->new(
679             -primary_id => $id,
680             -version => $version,
681             -database => $db || 'GenBank',
682             -tagname => 'dblink'
683             )
684             );
685             }
686             elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) {
687 1         3 my ( $db, $id ) = ( $1, $2 );
688 1   50     7 $annotation->add_Annotation(
689             'dblink',
690             Bio::Annotation::DBLink->new(
691             -primary_id => $id,
692             -database => $db || 'GenBank',
693             -tagname => 'dblink'
694             )
695             );
696             }
697             elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) {
698 10         10 my ( $db, $version );
699 10         16 my @ids = ();
700 10 100       26 if ( $2 eq ':' ) {
701 9         13 $db = $1;
702              
703             # Genbank 192 release notes say this: "The second
704             # field can consist of multiple comma-separated
705             # identifiers, if a sequence record has multiple
706             # DBLINK cross-references of a given type."
707             # For example: DBLINK Project:100,200,300"
708 9         20 @ids = split( /,/, $3 );
709             }
710             else {
711 1         3 ( $db, $version ) = ( 'GenBank', $3 );
712 1         2 $ids[0] = $1;
713             }
714              
715 10         17 foreach my $id (@ids) {
716 12         75 $annotation->add_Annotation(
717             'dblink',
718             Bio::Annotation::DBLink->new(
719             -primary_id => $id,
720             -version => $version,
721             -database => $db,
722             -tagname => 'dblink'
723             )
724             );
725             }
726             }
727             else {
728 0         0 $self->warn(
729             "Unrecognized DBSOURCE data: $dbsource\n");
730             }
731             }
732 20         46 $buffer = $line;
733             }
734             else {
735 0         0 while ( defined( $buffer = $self->_readline ) ) {
736 0 0       0 last if substr( $buffer, 0, 1 ) ne ' ';
737             }
738             }
739 20         49 next;
740             }
741              
742             # Exit at start of Feature table, or start of sequence
743 227 100       823 if ($line =~ /^(FEATURES|ORIGIN)/) {
744 111         172 my $trap;
745             }
746 227 100       667 last if ($line =~ /^(FEATURES|ORIGIN)/);
747              
748             # Get next line and loop again
749 116         248 $buffer = $self->_readline;
750             }
751 111 50       260 return unless defined $buffer;
752              
753             # add them all at once for efficiency
754 111         823 $builder->add_slot_value(
755             -accession_number => shift(@acc),
756             -secondary_accessions => \@acc,
757             %params
758             );
759 111 100       517 $builder->add_slot_value( -annotation => $annotation ) if $annotation;
760 111         260 %params = (); # reset before possible re-use to avoid setting twice
761              
762             # start over if we don't want to continue with this entry
763 111 50       294 if ( not $builder->want_object ) {
764 0         0 $builder->make_object;
765 0         0 next RECORDSTART;
766             }
767              
768             # some "minimal" formats may not necessarily have a feature table
769 111 100 66     380 if ( $builder->want_slot('features')
      100        
770             and defined $buffer
771             and $buffer =~ /^FEATURES/o
772             ) {
773             # need to read the first line of the feature table
774 104         285 $buffer = $self->_readline;
775              
776             # DO NOT read lines in the while condition -- this is done
777             # as a side effect in _read_FTHelper_GenBank!
778              
779             # part of new circular spec:
780             # commented out for now until kinks worked out
781             #my $sourceEnd = 0;
782             #$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/);
783              
784 104         240 while ( defined $buffer ) {
785             # check immediately -- not at the end of the loop
786             # note: GenPept entries obviously do not have a BASE line
787 8384 100       53783 last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o );
788              
789             # slurp in one feature at a time -- at return, the start of
790             # the next feature will have been read already, so we need
791             # to pass a reference, and the called method must set this
792             # to the last line read before returning
793 8280         14865 my $ftunit = $self->_read_FTHelper_GenBank( \$buffer );
794              
795             # implement new circular spec: features that cross the origin are now
796             # seamless instead of being 2 separate joined features
797             # commented out until kinks get worked out
798             #if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/
799             #&& $sourceEnd == $2 && $3 == 1) {
800             #my $start = $1;
801             #my $end = $2 + $4;
802             #$ftunit->{'loc'} = "$start..$end";
803             #}
804              
805             # fix suggested by James Diggans
806              
807 8280 50       11410 if ( not defined $ftunit ) {
808             # GRRRR. We have fallen over. Try to recover
809             $self->warn( "Unexpected error in feature table for "
810 0         0 . $params{'-display_id'}
811             . " Skipping feature, attempting to recover" );
812              
813 0 0 0     0 unless ( $buffer =~ /^\s{5,5}\S+/o
814             or $buffer =~ /^\S+/o
815             ) {
816 0         0 $buffer = $self->_readline;
817             }
818 0         0 next; # back to reading FTHelpers
819             }
820              
821             # process ftunit
822 8280         17701 my $feat =
823             $ftunit->_generic_seqfeature( $self->location_factory,
824             $display_id );
825              
826             # add taxon_id from source if available
827 8280 100 100     20166 if ( $species
      100        
      66        
      66        
828             and $feat->primary_tag eq 'source'
829             and $feat->has_tag('db_xref')
830             and ( not $species->ncbi_taxid
831             or ( $species->ncbi_taxid
832             and $species->ncbi_taxid =~ /^list/ ) )
833             ) {
834 95         269 foreach my $tagval ( $feat->get_tag_values('db_xref') ) {
835 96 100       335 if ( index( $tagval, "taxon:" ) == 0 ) {
836 95         346 $species->ncbi_taxid( substr( $tagval, 6 ) );
837 95         220 last;
838             }
839             }
840             }
841              
842             # add feature to list of features
843 8280         24343 push( @features, $feat );
844             }
845 104         564 $builder->add_slot_value( -features => \@features );
846             }
847              
848 111 50       304 if ( defined $buffer ) {
849             # CONTIG lines: TODO, this needs to be cleaned up
850 111 100       1175 if ($buffer =~/^CONTIG\s+(.*)/o) {
    100          
    100          
851 7         18 my $ctg = $1;
852 7         26 while ( defined( $buffer = $self->_readline ) ) {
853 29 100       162 last if $buffer =~ m{^ORIGIN|//}o;
854 22         76 $buffer =~ s/\s+(.*)/$1/;
855 22         42 $ctg .= $buffer;
856             }
857 7 50       22 if ($ctg) {
858 7         58 $annotation->add_Annotation(
859             Bio::Annotation::SimpleValue->new(
860             -tagname => 'contig',
861             -value => $ctg
862             )
863             );
864             }
865             }
866             elsif ($buffer =~ /^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines
867 1         6 while ( $buffer =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines
868 3         4 chomp $buffer;
869 3         17 $annotation->add_Annotation(
870             Bio::Annotation::SimpleValue->new(
871             -value => $buffer,
872             -tagname => lc $1
873             )
874             );
875 3         7 $buffer = $self->_readline;
876             }
877             }
878             elsif ( $buffer !~ m{^ORIGIN|//}o ) { # advance to the sequence, if any
879 53         185 while ( defined( $buffer = $self->_readline ) ) {
880 148 100       541 last if $buffer =~ m{^(ORIGIN|//)};
881             }
882             }
883             }
884 111 50       307 if ( not $builder->want_object ) {
885 0         0 $builder->make_object; # implicit end-of-object
886 0         0 next RECORDSTART;
887             }
888 111 100 33     338 if ( $builder->want_slot('seq') ) {
    50          
889             # the fact that we want a sequence does not necessarily mean that
890             # there also is a sequence ...
891 101 100 100     800 if ( defined $buffer and $buffer =~ s/^ORIGIN\s+// ) {
892 93 100 66     472 if ( $annotation and length($buffer) > 0 ) {
893 1         9 $annotation->add_Annotation(
894             'origin',
895             Bio::Annotation::SimpleValue->new(
896             -tagname => 'origin',
897             -value => $buffer
898             )
899             );
900             }
901 93         155 my $seqc = '';
902 93         300 while ( defined( $buffer = $self->_readline ) ) {
903 32735 100       41037 last if $buffer =~ m{^//};
904 32643         27707 $buffer = uc $buffer;
905 32643         193411 $buffer =~ s/[^A-Za-z]//g;
906 32643         52834 $seqc .= $buffer;
907             }
908              
909 93         367 $builder->add_slot_value( -seq => $seqc );
910             }
911             }
912             elsif ( defined($buffer) and ( substr( $buffer, 0, 2 ) ne '//' ) ) {
913             # advance to the end of the record
914 10         21 while ( defined( $buffer = $self->_readline ) ) {
915 172 100       335 last if substr( $buffer, 0, 2 ) eq '//';
916             }
917             }
918              
919             # Unlikely, but maybe the sequence is so weird that we don't want it
920             # anymore. We don't want to return undef if the stream's not exhausted
921             # yet.
922 111         385 $seq = $builder->make_object;
923 111 50       301 next RECORDSTART unless $seq;
924 111         2039 last RECORDSTART;
925             } # end while RECORDSTART
926              
927 111         504 return $seq;
928             }
929              
930             =head2 write_seq
931              
932             Title : write_seq
933             Usage : $stream->write_seq($seq)
934             Function: writes the $seq object (must be seq) to the stream
935             Returns : 1 for success and 0 for error
936             Args : array of 1 to n Bio::SeqI objects
937              
938             =cut
939              
940             sub write_seq {
941 25     25 1 78 my ($self,@seqs) = @_;
942              
943 25         75 foreach my $seq ( @seqs ) {
944 25 50       97 $self->throw("Attempting to write with no seq!") unless defined $seq;
945              
946 25 50 33     216 if ( not ref $seq or not $seq->isa('Bio::SeqI') ) {
947 0         0 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
948             }
949              
950 25         130 my $str = $seq->seq;
951 25         113 my $len = $seq->length;
952 25         103 my $alpha = $seq->alphabet;
953              
954 25         43 my ($div, $mol);
955 25 100 66     261 if ( not $seq->can('division')
956             or not defined($div = $seq->division)
957             ) {
958 2         5 $div = 'UNK';
959             }
960 25 100 66     197 if ( not $seq->can('molecule')
961             or not defined ($mol = $seq->molecule)
962             ) {
963 2   100     11 $mol = $alpha || 'DNA';
964             }
965              
966 25 100       162 my $circular = ($seq->is_circular) ? 'circular' : 'linear ';
967              
968 25         159 local($^W) = 0; # supressing warnings about uninitialized fields.
969              
970 25         43 my $temp_line;
971 25 50       86 if ( $self->_id_generation_func ) {
972 0         0 $temp_line = &{$self->_id_generation_func}($seq);
  0         0  
973             }
974             else {
975 25         41 my $date = '';
976 25 100       135 if ( $seq->can('get_dates') ) {
977 23         77 ($date) = $seq->get_dates;
978             }
979              
980 25 50       143 $self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]")
981             if $seq->display_id =~ /\s/;
982              
983 25 100       140 my @data = ( lc($alpha) eq 'protein' ) ? ('aa', '', '') : ('bp', '', $mol);
984 25         102 $temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n",
985             'LOCUS', $seq->id, $len,
986             @data, $circular, $div, $date);
987             }
988              
989 25         164 $self->_print($temp_line);
990 25         110 $self->_write_line_GenBank_regex("DEFINITION ", " ",
991             $seq->desc, "\\s\+\|\$",80);
992              
993             # if there, write the accession line
994              
995 25 50       86 if ( $self->_ac_generation_func ) {
996 0         0 $temp_line = &{$self->_ac_generation_func}($seq);
  0         0  
997 0         0 $self->_print("ACCESSION $temp_line\n");
998             }
999             else {
1000 25         53 my @acc = ();
1001 25         101 push @acc, $seq->accession_number;
1002 25 100       147 if ( $seq->isa('Bio::Seq::RichSeqI') ) {
1003 23         95 push @acc, $seq->get_secondary_accessions;
1004             }
1005 25         141 $self->_print("ACCESSION ", join(" ", @acc), "\n");
1006             # otherwise - cannot print
1007             }
1008              
1009             # if PID defined, print it
1010 25 50 66     199 if ($seq->isa('Bio::Seq::RichSeqI') and $seq->pid) {
1011 0         0 $self->_print("PID ", $seq->pid, "\n");
1012             }
1013              
1014             # if there, write the version line
1015 25 50 100     87 if ( defined $self->_sv_generation_func ) {
    100          
1016 0         0 $temp_line = &{$self->_sv_generation_func}($seq);
  0         0  
1017 0 0       0 if ( $temp_line ) {
1018 0         0 $self->_print("VERSION $temp_line\n");
1019             }
1020             }
1021             elsif ($seq->isa('Bio::Seq::RichSeqI') and defined($seq->seq_version)) {
1022 18         67 my $id = $seq->primary_id; # this may be a GI number
1023 18 50 33     208 my $data = (defined $id and $id =~ /^\d+$/) ? " GI:$id" : "";
1024 18         65 $self->_print("VERSION ",
1025             $seq->accession_number, ".",
1026             $seq->seq_version, $data, "\n");
1027             }
1028              
1029             # if there, write the PROJECT line
1030 25         93 for my $proj ( $seq->annotation->get_Annotations('project') ) {
1031 2         10 $self->_print("PROJECT ".$proj->value."\n");
1032             }
1033              
1034             # if there, write the DBSOURCE line
1035 25         73 foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
1036 2         6 my ($db, $id) = ($ref->database, $ref->primary_id);
1037 2 100       4 my $prefix = $db eq 'Project' ? 'DBLINK' : 'DBSOURCE';
1038 2 100       9 my $text = $db eq 'GenBank' ? ''
    50          
1039             : $db eq 'Project' ? "$db:$id"
1040             : "$db accession $id";
1041 2         7 $self->_print(sprintf ("%-11s %s\n", $prefix, $text));
1042             }
1043              
1044             # if there, write the keywords line
1045 25 50       100 if ( defined $self->_kw_generation_func ) {
    100          
1046 0         0 $temp_line = &{$self->_kw_generation_func}($seq);
  0         0  
1047 0         0 $self->_print("KEYWORDS $temp_line\n");
1048             }
1049             elsif ( $seq->can('keywords') ) {
1050 23         77 my $kw = $seq->keywords;
1051 23 50       108 $kw .= '.' if ( $kw !~ /\.$/ );
1052 23         97 $self->_print("KEYWORDS $kw\n");
1053             }
1054              
1055             # SEGMENT if it exists
1056 25         94 foreach my $ref ( $seq->annotation->get_Annotations('segment') ) {
1057 0         0 $self->_print(sprintf ("%-11s %s\n",'SEGMENT',
1058             $ref->value));
1059             }
1060              
1061             # Organism lines
1062 25 100       83 if (my $spec = $seq->species) {
1063 22 50       230 my ($on, $sn, $cn) = ($spec->can('organelle') ? $spec->organelle : '',
1064             $spec->scientific_name,
1065             $spec->common_name);
1066 22         50 my @classification;
1067 22 50       121 if ($spec->isa('Bio::Species')) {
1068 22         83 @classification = $spec->classification;
1069 22         54 shift @classification;
1070             }
1071             else {
1072             # Bio::Taxon should have a DB handle of some type attached, so
1073             # derive the classification from that
1074 0         0 my $node = $spec;
1075 0         0 while ($node) {
1076 0   0     0 $node = $node->ancestor || last;
1077 0         0 unshift @classification, $node->node_name;
1078             #$node eq $root && last;
1079             }
1080 0         0 @classification = reverse @classification;
1081             }
1082 22 50       86 my $abname = $spec->name('abbreviated') ? # from genbank file
1083             $spec->name('abbreviated')->[0] : $sn;
1084 22 100       72 my $sl = $on ? "$on " : '';
1085 22 100       86 $sl .= $cn ? "$abname ($cn)" : $abname;
1086              
1087 22         82 $self->_write_line_GenBank_regex("SOURCE ", ' 'x12, $sl, "\\s\+\|\$", 80);
1088 22         76 $self->_print(" ORGANISM ", $spec->scientific_name, "\n");
1089 22         129 my $OC = join('; ', reverse @classification) . '.';
1090 22         83 $self->_write_line_GenBank_regex(' 'x12,' 'x12, $OC, "\\s\+\|\$", 80);
1091             }
1092              
1093             # Reference lines
1094 25         48 my $count = 1;
1095 25         99 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
1096 47         88 $temp_line = "REFERENCE $count";
1097 47 50       148 if ($ref->start) {
    0          
1098 47 100       139 $temp_line .= sprintf (" (%s %d to %d)",
1099             ($seq->alphabet() eq "protein" ?
1100             "residues" : "bases"),
1101             $ref->start, $ref->end);
1102             }
1103             elsif ($ref->gb_reference) {
1104 0         0 $temp_line .= sprintf (" (%s)", $ref->gb_reference);
1105             }
1106 47         177 $self->_print("$temp_line\n");
1107 47         159 $self->_write_line_GenBank_regex(" AUTHORS ", ' 'x12,
1108             $ref->authors, "\\s\+\|\$", 80);
1109 47 100       145 $self->_write_line_GenBank_regex(" CONSRTM ", ' 'x12,
1110             $ref->consortium, "\\s\+\|\$", 80) if $ref->consortium;
1111 47         134 $self->_write_line_GenBank_regex(" TITLE ", ' 'x12,
1112             $ref->title, "\\s\+\|\$", 80);
1113 47         143 $self->_write_line_GenBank_regex(" JOURNAL ", ' 'x12,
1114             $ref->location, "\\s\+\|\$", 80);
1115 47 100       142 if ( $ref->medline) {
1116 5         27 $self->_write_line_GenBank_regex(" MEDLINE ", ' 'x12,
1117             $ref->medline, "\\s\+\|\$", 80);
1118             # I am assuming that pubmed entries only exist when there
1119             # are also MEDLINE entries due to the indentation
1120             }
1121             # This could be a wrong assumption
1122 47 100       118 if ( $ref->pubmed ) {
1123 13         43 $self->_write_line_GenBank_regex(" PUBMED ", ' 'x12,
1124             $ref->pubmed, "\\s\+\|\$", 80);
1125             }
1126             # put remark at the end
1127 47 50       170 if ($ref->comment) {
1128 0         0 $self->_write_line_GenBank_regex(" REMARK ", ' 'x12,
1129             $ref->comment, "\\s\+\|\$", 80);
1130             }
1131 47         96 $count++;
1132             }
1133              
1134             # Comment lines
1135 25         94 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
1136 13         44 $self->_write_line_GenBank_regex("COMMENT ", ' 'x12,
1137             $comment->text, "\\s\+\|\$", 80);
1138             }
1139              
1140             # FEATURES section
1141 25         81 $self->_print("FEATURES Location/Qualifiers\n");
1142              
1143 25 50       66 if ( defined $self->_post_sort ) {
1144             # we need to read things into an array. Process. Sort them. Print 'em
1145 0         0 my $post_sort_func = $self->_post_sort;
1146 0         0 my @fth;
1147              
1148 0         0 foreach my $sf ( $seq->top_SeqFeatures ) {
1149 0         0 push @fth, Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1150             }
1151              
1152 0         0 @fth = sort { &$post_sort_func($a, $b) } @fth;
  0         0  
1153              
1154 0         0 foreach my $fth ( @fth ) {
1155 0         0 $self->_print_GenBank_FTHelper($fth);
1156             }
1157             }
1158             else {
1159             # not post sorted. And so we can print as we get them.
1160             # lower memory load...
1161 25         119 foreach my $sf ( $seq->top_SeqFeatures ) {
1162 430         1108 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq);
1163 430         487 foreach my $fth ( @fth ) {
1164 430 50       1006 if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
1165 0         0 $sf->throw("Cannot process FTHelper... $fth");
1166             }
1167 430         994 $self->_print_GenBank_FTHelper($fth);
1168             }
1169             }
1170             }
1171              
1172             # deal with WGS; WGS_SCAFLD present only if WGS is also present
1173 25 50       208 if ($seq->annotation->get_Annotations('wgs')) {
1174 0         0 foreach my $wgs (map {$seq->annotation->get_Annotations($_)}
  0         0  
1175             qw(wgs wgs_scaffold)
1176             ) {
1177 0         0 $self->_print(sprintf ("%-11s %s\n",
1178             uc($wgs->tagname),
1179             $wgs->value));
1180             }
1181 0         0 $self->_show_dna(0);
1182             }
1183 25 100       90 if ($seq->annotation->get_Annotations('contig')) {
1184 1         2 my $ct = 0;
1185 1         2 my $cline;
1186 1         2 foreach my $contig ($seq->annotation->get_Annotations('contig')) {
1187 1 50       4 unless ($ct) {
1188 1         4 $cline = uc($contig->tagname) . " " . $contig->value . "\n";
1189             }
1190             else {
1191 0         0 $cline = " " . $contig->value . "\n";
1192             }
1193 1         3 $self->_print($cline);
1194 1         2 $ct++;
1195             }
1196             }
1197 25 100       99 if ( $seq->length == 0 ) {
1198 1         4 $self->_show_dna(0);
1199             }
1200              
1201 25 100       94 if ( $self->_show_dna == 0 ) {
1202 1         4 $self->_print("\n//\n");
1203 1         6 return;
1204             }
1205              
1206             # finished printing features.
1207              
1208 24         472 $str =~ tr/A-Z/a-z/;
1209              
1210 24         89 my ($o) = $seq->annotation->get_Annotations('origin');
1211 24 50       177 $self->_print(sprintf("%-12s%s\n",
1212             'ORIGIN', $o ? $o->value : ''));
1213             # print out the sequence
1214 24         38 my $nuc = 60; # Number of nucleotides per line
1215 24         43 my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
1216 24         38 my $out_pat = 'A11' x 6; # Pattern for packing a line
1217 24         35 my $length = length $str;
1218              
1219             # Calculate the number of nucleotides which fit on whole lines
1220 24         103 my $whole = int($length / $nuc) * $nuc;
1221              
1222             # Print the whole lines
1223 24         31 my $i;
1224 24         135 for ($i = 0; $i < $whole; $i += $nuc) {
1225 1953         7032 my $blocks = pack $out_pat,
1226             unpack $whole_pat,
1227             substr($str, $i, $nuc);
1228 1953         2338 chop $blocks;
1229 1953         6014 $self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59));
1230             }
1231              
1232             # Print the last line
1233 24 100       82 if (my $last = substr($str, $i)) {
1234 19         30 my $last_len = length($last);
1235 19         93 my $last_pat = 'a10' x int($last_len / 10)
1236             . 'a' . $last_len % 10;
1237 19         98 my $blocks = pack $out_pat,
1238             unpack($last_pat, $last);
1239 19         128 $blocks =~ s/ +$//;
1240 19         110 $self->_print(sprintf("%9d $blocks\n",
1241             $length - $last_len + 1));
1242             }
1243              
1244 24         66 $self->_print("//\n");
1245              
1246 24 50 33     87 $self->flush if $self->_flush_on_write && defined $self->_fh;
1247 24         190 return 1;
1248             }
1249             }
1250              
1251             =head2 _print_GenBank_FTHelper
1252              
1253             Title : _print_GenBank_FTHelper
1254             Usage :
1255             Function:
1256             Example :
1257             Returns :
1258             Args :
1259              
1260             =cut
1261              
1262             sub _print_GenBank_FTHelper {
1263 430     430   413 my ( $self, $fth ) = @_;
1264              
1265 430 50 33     1614 if ( not ref $fth or not $fth->isa('Bio::SeqIO::FTHelper') ) {
1266 0         0 $fth->warn(
1267             "$fth is not a FTHelper class. Attempting to print but there could be issues"
1268             );
1269             }
1270              
1271 430 50       819 my $spacer = ( length $fth->key >= 15 ) ? ' ' : '';
1272 430         769 $self->_write_line_GenBank_regex(
1273             sprintf( " %-16s%s", $fth->key, $spacer ),
1274             " " x 21, $fth->loc, "\,\|\$", 80 );
1275              
1276 430         554 foreach my $tag ( keys %{ $fth->field } ) {
  430         977  
1277             # Account for hash structure in Annotation::DBLink, not the expected array
1278 997 50 66     2200 if ( $tag eq 'db_xref' and grep /HASH/, @{ $fth->field->{$tag} }) {
  140         425  
1279 0         0 for my $ref ( @{ $fth->field->{$tag} } ) {
  0         0  
1280 0         0 my $db = $ref->{'database'};
1281 0         0 my $id = $ref->{'primary_id'};
1282 0         0 $self->_write_line_GenBank_regex
1283             ( " " x 21, " " x 21,
1284             "/$tag=\"$db:$id\"", "\.\|\$", 80 );
1285             }
1286             }
1287             # The usual case, where all values are found in an array
1288             else {
1289 997         782 foreach my $value ( @{ $fth->field->{$tag} } ) {
  997         1696  
1290 1085         1406 $value =~ s/\"/\"\"/g;
1291 1085 50 100     3490 if ( $value eq "_no_value" ) {
    100          
1292 0         0 $self->_write_line_GenBank_regex
1293             ( " " x 21, " " x 21,
1294             "/$tag", "\.\|\$", 80 );
1295             }
1296              
1297             # There are almost 3x more quoted qualifier values and they
1298             # are more common too so we take quoted ones first.
1299             # Long qualifiers, that will be line wrapped, are always quoted
1300             elsif ( not $FTQUAL_NO_QUOTE{$tag}
1301             or length("/$tag=$value") >= $FTQUAL_LINE_LENGTH
1302             ) {
1303 976 100       2541 my ($pat) = ( $value =~ /\s/ ? '\s|$' : '.|$' );
1304 976         2967 $self->_write_line_GenBank_regex
1305             ( " " x 21, " " x 21,
1306             "/$tag=\"$value\"", $pat, 80 );
1307             }
1308             else {
1309 109         364 $self->_write_line_GenBank_regex
1310             ( " " x 21, " " x 21,
1311             "/$tag=$value", "\.\|\$", 80 );
1312             }
1313             }
1314             }
1315             }
1316             }
1317              
1318             =head2 _read_GenBank_References
1319              
1320             Title : _read_GenBank_References
1321             Usage :
1322             Function: Reads references from GenBank format. Internal function really
1323             Returns :
1324             Args :
1325              
1326             =cut
1327              
1328             sub _read_GenBank_References {
1329 100     100   173 my ($self, $buffer) = @_;
1330 100         112 my (@refs);
1331             my $ref;
1332              
1333             # assumme things are starting with RN
1334 100 50       372 if ( $$buffer !~ /^REFERENCE/ ) {
1335 0         0 warn("Not parsing line '$$buffer' which maybe important");
1336             }
1337              
1338 100         153 my $line = $$buffer;
1339              
1340 100         147 my (@title,@loc,@authors,@consort,@com,@medline,@pubmed);
1341              
1342             REFLOOP:
1343 100   66     322 while( defined($line) or defined($line = $self->_readline) ) {
1344 1379 100       3088 if ($line =~ /^\s{2}AUTHORS\s+(.*)/o) {
1345 402         824 push @authors, $1;
1346 402         713 while ( defined($line = $self->_readline) ) {
1347 1040 100       2317 if ($line =~ /^\s{9,}(.*)/o) {
1348 638         926 push @authors, $1;
1349 638         935 next;
1350             }
1351 402         411 last;
1352             }
1353 402         1465 $ref->authors(join(' ', @authors));
1354             }
1355              
1356 1379 100       2070 if ($line =~ /^\s{2}CONSRTM\s+(.*)/o) {
1357 35         333 push @consort, $1;
1358 35         83 while ( defined($line = $self->_readline) ) {
1359 36 100       93 if ($line =~ /^\s{9,}(.*)/o) {
1360 1         2 push @consort, $1;
1361 1         3 next;
1362             }
1363 35         37 last;
1364             }
1365 35         112 $ref->consortium(join(' ', @consort));
1366             }
1367              
1368 1379 100       2463 if ($line =~ /^\s{2}TITLE\s+(.*)/o) {
1369 408         656 push @title, $1;
1370 408         979 while ( defined($line = $self->_readline) ) {
1371 693 100       1573 if ($line =~ /^\s{9,}(.*)/o) {
1372 285         466 push @title, $1;
1373 285         452 next;
1374             }
1375 408         398 last;
1376             }
1377 408         1412 $ref->title(join(' ', @title));
1378             }
1379              
1380 1379 100       2502 if ($line =~ /^\s{2}JOURNAL\s+(.*)/o) {
1381 416         654 push @loc, $1;
1382 416         741 while ( defined($line = $self->_readline) ) {
1383             # we only match when there are at least 4 spaces
1384             # there is probably a better way to match this
1385             # as it assumes that the describing tag is short enough
1386 565 100       1280 if ($line =~ /^\s{9,}(.*)/o) {
1387 149         437 push @loc, $1;
1388 149         274 next;
1389             }
1390 416         410 last;
1391             }
1392 416         1251 $ref->location(join(' ', @loc));
1393 416         500 redo REFLOOP;
1394             }
1395              
1396 963 100       1317 if ($line =~ /^\s{2}REMARK\s+(.*)/o) {
1397 59         105 push @com, $1;
1398 59         130 while ( defined($line = $self->_readline) ) {
1399 83 100       217 if ($line =~ /^\s{9,}(.*)/o) {
1400 24         40 push @com, $1;
1401 24         47 next;
1402             }
1403 59         53 last;
1404             }
1405 59         216 $ref->comment(join(' ', @com));
1406 59         63 redo REFLOOP;
1407             }
1408              
1409 904 100       1411 if ( $line =~ /^\s{2}MEDLINE\s+(.*)/ ) {
1410 130         230 push @medline, $1;
1411 130         271 while ( defined($line = $self->_readline) ) {
1412 130 50       309 if ($line =~ /^\s{9,}(.*)/) {
1413 0         0 push @medline, $1;
1414 0         0 next;
1415             }
1416 130         135 last;
1417             }
1418 130         329 $ref->medline(join(' ', @medline));
1419 130         142 redo REFLOOP;
1420             }
1421              
1422 774 100       1351 if ( $line =~ /^\s{3}PUBMED\s+(.*)/ ) {
1423 237         377 push @pubmed, $1;
1424 237         432 while ( defined($line = $self->_readline) ) {
1425 237 50       510 if ($line =~ /^\s{9,}(.*)/) {
1426 0         0 push @pubmed, $1;
1427 0         0 next;
1428             }
1429 237         199 last;
1430             }
1431 237         650 $ref->pubmed(join(' ', @pubmed));
1432 237         227 redo REFLOOP;
1433             }
1434              
1435 537 100       1085 if ( $line =~ /^REFERENCE/o ) {
1436             # store current reference
1437 417 100       1008 $self->_add_ref_to_array(\@refs,$ref) if defined $ref;
1438              
1439             # reset
1440 417         610 @authors = ();
1441 417         416 @title = ();
1442 417         340 @loc = ();
1443 417         353 @com = ();
1444 417         337 @pubmed = ();
1445 417         336 @medline = ();
1446              
1447             # create the new reference object
1448 417         1634 $ref = Bio::Annotation::Reference->new(-tagname => 'reference');
1449              
1450             # check whether start and end base is given
1451 417 100       1721 if ($line =~ /^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
    100          
1452 286         720 $ref->start($1);
1453 286         544 $ref->end($2);
1454             }
1455             elsif ($line =~ /^REFERENCE\s+\d+\s+\((.*)\)/) {
1456 14         42 $ref->gb_reference($1);
1457             }
1458             }
1459              
1460 537 100       2264 last if ($line =~ /^(FEATURES)|(COMMENT)/o);
1461              
1462 437         1560 $line = undef; # Empty $line to trigger read of next line
1463             }
1464              
1465             # store last reference
1466 100 50       434 $self->_add_ref_to_array(\@refs, $ref) if defined $ref;
1467              
1468 100         151 $$buffer = $line;
1469              
1470             #print "\nnumber of references found: ", $#refs+1,"\n";
1471              
1472 100         392 return @refs;
1473             }
1474              
1475             =head2 _add_ref_to_array
1476              
1477             Title: _add_ref_to_array
1478             Usage:
1479             Function: Adds a Reference object to an array of Reference objects, takes
1480             care of possible cleanups to be done (currently, only author and title
1481             will be chopped of trailing semicolons).
1482             Args: A reference to an array of Reference objects and
1483             the Reference object to be added
1484             Returns: nothing
1485              
1486             =cut
1487              
1488             sub _add_ref_to_array {
1489 417     417   459 my ($self, $refs, $ref) = @_;
1490              
1491             # first, polish author and title by removing possible trailing semicolons
1492 417         729 my $au = $ref->authors;
1493 417         673 my $title = $ref->title;
1494 417 100       845 $au =~ s/;\s*$//g if $au;
1495 417 100       730 $title =~ s/;\s*$//g if $title;
1496 417         642 $ref->authors($au);
1497 417         582 $ref->title($title);
1498             # the rest should be clean already, so go ahead and add it
1499 417         322 push @{$refs}, $ref;
  417         735  
1500             }
1501              
1502             =head2 _read_GenBank_Species
1503              
1504             Title : _read_GenBank_Species
1505             Usage :
1506             Function: Reads the GenBank Organism species and classification
1507             lines. Able to deal with unconvential Organism naming
1508             formats, and varietas in plants
1509             Example : ORGANISM unknown marine gamma proteobacterium NOR5
1510             $genus = undef
1511             $species = unknown marine gamma proteobacterium NOR5
1512              
1513             ORGANISM Drosophila sp. 'white tip scutellum'
1514             $genus = Drosophila
1515             $species = sp. 'white tip scutellum'
1516             (yes, this really is a species and that is its name)
1517             $subspecies = undef
1518              
1519             ORGANISM Ajellomyces capsulatus var. farciminosus
1520             $genus = Ajellomyces
1521             $species = capsulatus
1522             $subspecies = var. farciminosus
1523              
1524             ORGANISM Hepatitis delta virus
1525             $genus = undef (though this virus has a genus in its lineage, we
1526             cannot know that without a database lookup)
1527             $species = Hepatitis delta virus
1528              
1529             Returns : A Bio::Species object
1530             Args : A reference to the current line buffer
1531              
1532             =cut
1533              
1534             sub _read_GenBank_Species {
1535 101     101   143 my ($self, $buffer) = @_;
1536              
1537 101         380 my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown',
1538             'Unspecified', 'Unknown', 'None', 'unclassified',
1539             'unidentified organism', 'not supplied');
1540             # dictionary of synonyms for taxid 32644
1541 101         224 my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified');
1542             # all above can be part of valid species name
1543              
1544 101         129 my $line = $$buffer;
1545              
1546 101         109 my( $sub_species, $species, $genus, $sci_name, $common,
1547             $class_lines, $source_flag, $abbr_name, $organelle, $sl );
1548 101         156 my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION);
  303         610  
1549              
1550             # upon first entering the loop, we must not read a new line -- the SOURCE
1551             # line is already in the buffer (HL 05/10/2000)
1552 101         119 my ($ann, $tag, $data);
1553 101   66     309 while (defined($line) or defined($line = $self->_readline)) {
1554             # de-HTMLify (links that may be encountered here don't contain
1555             # escaped '>', so a simple-minded approach suffices)
1556 517         583 $line =~ s{<[^>]+>}{}g;
1557 517 100       1632 if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
1558 303   50     1005 ($tag, $data) = ($1, $2 || '');
1559 303 100 66     1212 last if ($tag and not exists $source{$tag});
1560             }
1561             else {
1562 214 50       377 return unless $tag;
1563 214         590 ($data = $line) =~ s{^\s+}{};
1564 214         256 chomp $data;
1565 214 100 100     1099 $tag = 'CLASSIFICATION' if ( $tag ne 'CLASSIFICATION'
      100        
1566             and $tag eq 'ORGANISM'
1567             # Don't match "str." or "var." (fix NC_021815),
1568             # and don't match ".1" (fix NC_021902)
1569             and $line =~ m{(?
1570             }
1571 416 100       1083 (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
1572 416         1100 $line = undef;
1573             }
1574              
1575 101         298 ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
1576              
1577 101         160 $$buffer = $line;
1578              
1579 101 50       207 $sci_name or return;
1580              
1581             # parse out organelle, common name, abbreviated name if present;
1582             # this should catch everything, but falls back to
1583             # entire SOURCE line just in case
1584 101 50       939 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)?
1585             \s*(.*?)
1586             \s*(?: \( (.*?) \) )?\.?
1587             $
1588             }xms
1589             ) {
1590 101         297 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
1591             }
1592             else {
1593 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
1594             }
1595              
1596             # Convert data in classification lines into classification array.
1597             # only split on ';' or '.' so that classification that is 2 or more words will
1598             # still get matched, use map() to remove trailing/leading/intervening spaces
1599 101         728 my @class = map { $_ =~ s/^\s+//;
  955         1272  
1600 955         902 $_ =~ s/\s+$//;
1601 955         1056 $_ =~ s/\s{2,}/ /g;
1602 955         1021 $_; }
1603             split /(?
1604              
1605             # do we have a genus?
1606 101 50       514 my $possible_genus = quotemeta($class[-1])
1607             . ($class[-2] ? "|" . quotemeta($class[-2]) : '');
1608 101 100       3085 if ($sci_name =~ /^($possible_genus)/) {
1609 88         173 $genus = $1;
1610 88         952 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1611             }
1612             else {
1613 13         31 $species = $sci_name;
1614             }
1615              
1616             # is this organism of rank species or is it lower?
1617             # (we don't catch everything lower than species, but it doesn't matter -
1618             # this is just so we abide by previous behaviour whilst not calling a
1619             # species a subspecies)
1620 101 100 66     1282 if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) {
1621 5         19 ($species, $sub_species) = ($1, $2);
1622             }
1623              
1624             # Don't make a species object if it's empty or "Unknown" or "None"
1625             # return unless $genus and $genus !~ /^(Unknown|None)$/oi;
1626             # Don't make a species object if it belongs to taxid 32644
1627             # my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names;
1628 101         147 my $unkn = grep { $_ eq $sl } @unkn_names;
  1010         1009  
1629 101 50 33     596 return unless (defined $species or defined $genus) and $unkn == 0;
      33        
1630              
1631             # Bio::Species array needs array in Species -> Kingdom direction
1632 101         149 push @class, $sci_name;
1633 101         159 @class = reverse @class;
1634              
1635 101         641 my $make = Bio::Species->new;
1636 101         289 $make->scientific_name($sci_name);
1637 101 50       444 $make->classification(@class) if @class > 0;
1638 101 100       347 $make->common_name( $common ) if $common;
1639 101 50       456 $make->name('abbreviated', $abbr_name) if $abbr_name;
1640 101 100       235 $make->organelle($organelle) if $organelle;
1641             #$make->sub_species( $sub_species ) if $sub_species;
1642 101         880 return $make;
1643             }
1644              
1645             =head2 _read_FTHelper_GenBank
1646              
1647             Title : _read_FTHelper_GenBank
1648             Usage : _read_FTHelper_GenBank($buffer)
1649             Function: reads the next FT key line
1650             Example :
1651             Returns : Bio::SeqIO::FTHelper object
1652             Args : filehandle and reference to a scalar
1653              
1654             =cut
1655              
1656             sub _read_FTHelper_GenBank {
1657 8280     8280   7136 my ($self, $buffer) = @_;
1658              
1659 8280         5855 my ($key, # The key of the feature
1660             $loc # The location line from the feature
1661             );
1662 8280         8003 my @qual = (); # An array of lines making up the qualifiers
1663              
1664 8280 50       48101 if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
1665 8280         14443 $key = $1;
1666 8280         9676 $loc = $2;
1667             # Read all the lines up to the next feature
1668 8280         16963 while ( defined(my $line = $self->_readline) ) {
1669 88592 100       366661 if ($line =~ /^(\s+)(.+?)\s*$/o) {
1670             # Lines inside features are preceded by 21 spaces
1671             # A new feature is preceded by 5 spaces
1672 88489 100       121190 if (length($1) > 6) {
1673             # Add to qualifiers if we're in the qualifiers, or if it's
1674             # the first qualifier
1675 80312 100 100     161091 if (@qual or (index($2,'/') == 0)) {
1676 68121         151805 push @qual, $2;
1677             }
1678             # We're still in the location line, so append to location
1679             else {
1680 12191         29173 $loc .= $2;
1681             }
1682             }
1683             else {
1684             # We've reached the start of the next feature
1685             # Put the first line of the next feature into the buffer
1686 8177         7901 $$buffer = $line;
1687 8177         10803 last;
1688             }
1689             }
1690             else {
1691             # We're at the end of the feature table
1692             # Put the first line of the next feature into the buffer
1693 103         163 $$buffer = $line;
1694 103         161 last;
1695             }
1696             }
1697             }
1698             else {
1699             # No feature key
1700 0         0 $self->debug("no feature key!\n");
1701             # change suggested by JDiggans to avoid infinite loop-
1702             # see bugreport 1062.
1703             # reset buffer to prevent infinite loop
1704 0         0 $$buffer = $self->_readline;
1705 0         0 return;
1706             }
1707              
1708             # Make the new FTHelper object
1709 8280         18787 my $out = Bio::SeqIO::FTHelper->new;
1710 8280         14678 $out->verbose($self->verbose);
1711 8280         13900 $out->key($key);
1712 8280         12005 $out->loc($loc);
1713              
1714             # Now parse and add any qualifiers. (@qual is kept
1715             # intact to provide informative error messages.)
1716             QUAL:
1717 8280         16293 for (my $i = 0; $i < @qual; $i++) {
1718 55807         45287 my $data = $qual[$i];
1719 55807 50       182681 my ( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.+))?})
1720             or $self->warn( "cannot see new qualifier in feature $key: "
1721             . $data);
1722 55807 50       70297 $qualifier = '' if not defined $qualifier;
1723              
1724 55807 100       53913 if (defined $value) {
1725             # Do we have a quoted value?
1726 54903 100       69273 if (substr($value, 0, 1) eq '"') {
    100          
1727             # Keep adding to value until we find the trailing quote
1728             # and the quotes are balanced
1729 52083   66     177322 while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) {
1730 12312 50       17431 if ($i >= $#qual) {
1731 0         0 $self->warn( "Unbalanced quote in:\n"
1732             . join("\n", @qual)
1733             . "No further qualifiers will "
1734             . "be added for this feature");
1735 0         0 last QUAL;
1736             }
1737             # modifying a for-loop variable inside of the loop
1738             # is not the best programming style ...
1739 12312         9189 $i++;
1740 12312         10209 my $next = $qual[$i];
1741              
1742             # add to value with a space unless the value appears
1743             # to be a sequence (translation for example)
1744             # if (($value.$next) =~ /[^A-Za-z\"\-]/o) {
1745             # changed to explicitly look for translation tag - cjf 06/8/29
1746 12312 100       16404 if ($qualifier !~ /^translation$/i ) {
1747 9870         8807 $value .= " ";
1748             }
1749 12312         44696 $value .= $next;
1750             }
1751             # Trim leading and trailing quotes
1752 52083         217168 $value =~ s/^"|"$//g;
1753             # Undouble internal quotes
1754 52083         49799 $value =~ s/""/\"/g;
1755             }
1756             elsif ( $value =~ /^\(/ ) { # values quoted by ()s
1757             # Keep adding to value until we find the trailing bracket
1758             # and the ()s are balanced
1759 27         34 my $left = ($value =~ tr/\(/\(/); # count left parens
1760 27         24 my $right = ($value =~ tr/\)/\)/); # count right parens
1761              
1762 27         55 while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right"
1763 2 50       8 if ( $i >= $#qual) {
1764 0         0 $self->warn( "Unbalanced parens in:\n"
1765             . join("\n", @qual)
1766             . "\nNo further qualifiers will "
1767             . "be added for this feature");
1768 0         0 last QUAL;
1769             }
1770 2         4 $i++;
1771 2         3 my $next = $qual[$i];
1772 2         8 $value .= $next;
1773 2         2 $left += ($next =~ tr/\(/\(/);
1774 2         6 $right += ($next =~ tr/\)/\)/);
1775             }
1776             }
1777             }
1778             else {
1779 904         927 $value = '_no_value';
1780             }
1781             # Store the qualifier
1782 55807   100     75945 $out->field->{$qualifier} ||= [];
1783 55807         41583 push @{$out->field->{$qualifier}}, $value;
  55807         61250  
1784             }
1785 8280         15310 return $out;
1786             }
1787              
1788             =head2 _write_line_GenBank
1789              
1790             Title : _write_line_GenBank
1791             Usage :
1792             Function: internal function
1793             Example :
1794             Returns :
1795             Args :
1796              
1797             =cut
1798              
1799             sub _write_line_GenBank {
1800 0     0   0 my ($self, $pre1, $pre2, $line, $length) = @_;
1801              
1802 0 0       0 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1803 0         0 my $subl = $length - length $pre2;
1804 0         0 my $linel = length $line;
1805 0         0 my $i;
1806              
1807 0         0 my $subr = substr($line,0,$length - length $pre1);
1808              
1809 0         0 $self->_print("$pre1$subr\n");
1810 0         0 for($i = ($length - length $pre1); $i < $linel; $i += $subl) {
1811 0         0 $subr = substr($line, $i, $subl);
1812 0         0 $self->_print("$pre2$subr\n");
1813             }
1814             }
1815              
1816             =head2 _write_line_GenBank_regex
1817              
1818             Title : _write_line_GenBank_regex
1819             Usage :
1820             Function: internal function for writing lines of specified
1821             length, with different first and the next line
1822             left hand headers and split at specific points in the
1823             text
1824             Example :
1825             Returns : nothing
1826             Args : file handle,
1827             first header,
1828             second header,
1829             text-line,
1830             regex for line breaks,
1831             total line length
1832              
1833             =cut
1834              
1835             sub _write_line_GenBank_regex {
1836 1765     1765   2581 my ($self, $pre1, $pre2, $line, $regex, $length) = @_;
1837              
1838             #print STDOUT "Going to print with $line!\n";
1839              
1840 1765 50       2941 $length or $self->throw("Miscalled write_line_GenBank without length. Programming error!");
1841              
1842 1765         2369 my $subl = $length - (length $pre1) - 2;
1843 1765         1963 my @lines = ();
1844              
1845             CHUNK:
1846 1765         2690 while ($line) {
1847 2129         3102 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1848 2129 50       27878 if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) {
1849 2129         5165 my $l = $1 . $2;
1850 2129         2741 $line = substr($line, length $l);
1851             # be strict about not padding spaces according to
1852             # genbank format
1853 2129         4188 $l =~ s/\s+$//;
1854 2129 50       3402 next CHUNK if ($l eq '');
1855 2129         2470 push @lines, $l;
1856 2129         6660 next CHUNK;
1857             }
1858             }
1859             # if we get here none of the patterns matched $subl or less chars
1860 0         0 $self->warn( "trouble dissecting \"$line\"\n into chunks "
1861             . "of $subl chars or less - this tag won't print right");
1862             # insert a space char to prevent infinite loops
1863 0         0 $line = substr($line, 0, $subl) . " " . substr($line, $subl);
1864             }
1865 1765         1869 my $s = shift @lines;
1866 1765 100       7422 $self->_print("$pre1$s\n") if $s;
1867 1765         7645 foreach my $s ( @lines ) {
1868 369         771 $self->_print("$pre2$s\n");
1869             }
1870             }
1871              
1872             =head2 _post_sort
1873              
1874             Title : _post_sort
1875             Usage : $obj->_post_sort($newval)
1876             Function:
1877             Returns : value of _post_sort
1878             Args : newvalue (optional)
1879              
1880             =cut
1881              
1882             sub _post_sort {
1883 25     25   42 my ($obj,$value) = @_;
1884 25 50       70 if ( defined $value) {
1885 0         0 $obj->{'_post_sort'} = $value;
1886             }
1887 25         85 return $obj->{'_post_sort'};
1888             }
1889              
1890             =head2 _show_dna
1891              
1892             Title : _show_dna
1893             Usage : $obj->_show_dna($newval)
1894             Function:
1895             Returns : value of _show_dna
1896             Args : newvalue (optional)
1897              
1898             =cut
1899              
1900             sub _show_dna {
1901 141     141   196 my ($obj,$value) = @_;
1902 141 100       367 if ( defined $value) {
1903 116         217 $obj->{'_show_dna'} = $value;
1904             }
1905 141         268 return $obj->{'_show_dna'};
1906             }
1907              
1908             =head2 _id_generation_func
1909              
1910             Title : _id_generation_func
1911             Usage : $obj->_id_generation_func($newval)
1912             Function:
1913             Returns : value of _id_generation_func
1914             Args : newvalue (optional)
1915              
1916             =cut
1917              
1918             sub _id_generation_func {
1919 25     25   43 my ($obj,$value) = @_;
1920 25 50       71 if ( defined $value ) {
1921 0         0 $obj->{'_id_generation_func'} = $value;
1922             }
1923 25         82 return $obj->{'_id_generation_func'};
1924             }
1925              
1926             =head2 _ac_generation_func
1927              
1928             Title : _ac_generation_func
1929             Usage : $obj->_ac_generation_func($newval)
1930             Function:
1931             Returns : value of _ac_generation_func
1932             Args : newvalue (optional)
1933              
1934             =cut
1935              
1936             sub _ac_generation_func {
1937 25     25   45 my ($obj,$value) = @_;
1938 25 50       75 if ( defined $value ) {
1939 0         0 $obj->{'_ac_generation_func'} = $value;
1940             }
1941 25         82 return $obj->{'_ac_generation_func'};
1942             }
1943              
1944             =head2 _sv_generation_func
1945              
1946             Title : _sv_generation_func
1947             Usage : $obj->_sv_generation_func($newval)
1948             Function:
1949             Returns : value of _sv_generation_func
1950             Args : newvalue (optional)
1951              
1952             =cut
1953              
1954             sub _sv_generation_func {
1955 25     25   47 my ($obj,$value) = @_;
1956 25 50       79 if ( defined $value ) {
1957 0         0 $obj->{'_sv_generation_func'} = $value;
1958             }
1959 25         204 return $obj->{'_sv_generation_func'};
1960             }
1961              
1962             =head2 _kw_generation_func
1963              
1964             Title : _kw_generation_func
1965             Usage : $obj->_kw_generation_func($newval)
1966             Function:
1967             Returns : value of _kw_generation_func
1968             Args : newvalue (optional)
1969              
1970             =cut
1971              
1972             sub _kw_generation_func {
1973 25     25   53 my ($obj,$value) = @_;
1974 25 50       74 if ( defined $value ) {
1975 0         0 $obj->{'_kw_generation_func'} = $value;
1976             }
1977 25         194 return $obj->{'_kw_generation_func'};
1978             }
1979              
1980             1;