File Coverage

Bio/SeqIO/gbdriver.pm
Criterion Covered Total %
statement 87 125 69.6
branch 54 92 58.7
condition 26 46 56.5
subroutine 10 12 83.3
pod 3 4 75.0
total 180 279 64.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::gbdriver
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Bioperl project bioperl-l(at)bioperl.org
7             #
8             # Copyright Chris Fields and contributors see AUTHORS section
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::gbdriver - GenBank handler-based push parser
17              
18             =head1 SYNOPSIS
19              
20             #It is probably best not to use this object directly, but
21             #rather go through the SeqIO handler:
22              
23             $stream = Bio::SeqIO->new(-file => $filename,
24             -format => 'gbdriver');
25              
26             while ( my $seq = $stream->next_seq() ) {
27             # do something with $seq
28             }
29              
30             =head1 DESCRIPTION
31              
32             This object can transform Bio::Seq objects to and from GenBank flat file
33             databases. The key difference between this parser and the tried-and-true
34             Bio::SeqIO::genbank parser is this version separates the parsing and data
35             manipulation into a 'driver' method (next_seq) and separate object handlers
36             which deal with the data passed to it.
37              
38             =head2 The Driver
39              
40             The main purpose of the driver routine, in this case next_seq(), is to carve out
41             the data into meaningful chunks which are passed along to relevant handlers (see
42             below).
43              
44             Each chunk of data in the has a NAME tag attached to it, similar to that for XML
45             parsing. This designates the type of data passed (annotation type or seqfeature)
46             and the handler to be called for processing the data.
47              
48             For GenBank annotations, the data is divided up and passed along to handlers
49             according to whether the data is tagged with a field name (i.e. LOCUS) and
50             whether the field name represents 'primary' annotation (in this case, is present
51             at the beginning of the line, such as REFERENCE). If the field is primary, it is
52             assigned to the NAME tag. Field names which aren't primary (have at least 2
53             spaces before the name, like ORGANISM) are appended to the preceding primary
54             field name as additional tags.
55              
56             For feature table data each new feature name signals the beginning of a new
57             chunk of data. 'FEATURES' is attached to NAME, the feature key ('CDS', 'gene',
58             etc) is attached as the PRIMARY_ID, and the location is assigned to it's own tag
59             name (LOCATION). Feature qualifiers are added as additional keys, with multiple
60             keys included in an array.
61              
62             Once a particular event occurs (new primary tag, sequence, end of record), the
63             data is passed along to be processed by a handler or (if no handler is defined)
64             tossed away.
65              
66             Internally, the hash ref for a representative annotation (here a REFERENCE)
67             looks like this:
68              
69             $VAR1 = {
70             'JOURNAL' => 'Unpublished (2003)',
71             'TITLE' => 'The DNA sequence of Homo sapiens',
72             'NAME' => 'REFERENCE',
73             'REFERENCE' => '1 (bases 1 to 10001)',
74             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
75             };
76              
77             and a SeqFeature as this:
78              
79             $VAR1 = {
80             'db_xref' => [
81             'GeneID:127086',
82             'InterimID:127086'
83             ],
84             'LOCATION' => 'complement(3024..6641)',
85             'NAME' => 'FEATURES',
86             'FEATURE_KEY' => 'gene',
87             'gene' => 'LOC127086',
88             'note' => 'Derived by automated computational analysis using
89             gene prediction method: GNOMON.'
90             };
91              
92             Note that any driver implementation would suffice as long as it fulfilled the
93             requirements above.
94              
95             =head1 FEEDBACK
96              
97             =head2 Mailing Lists
98              
99             User feedback is an integral part of the evolution of this and other
100             Bioperl modules. Send your comments and suggestions preferably to one
101             of the Bioperl mailing lists. Your participation is much appreciated.
102              
103             bioperl-l@bioperl.org - General discussion
104             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
105              
106             =head2 Support
107              
108             Please direct usage questions or support issues to the mailing list:
109              
110             I
111              
112             rather than to the module maintainer directly. Many experienced and
113             reponsive experts will be able look at the problem and quickly
114             address it. Please include a thorough description of the problem
115             with code and data examples if at all possible.
116              
117             =head2 Reporting Bugs
118              
119             Report bugs to the Bioperl bug tracking system to help us keep track
120             the bugs and their resolution. Bug reports can be submitted via the web:
121              
122             https://github.com/bioperl/bioperl-live/issues
123              
124             =head1 AUTHOR - Bioperl Project
125              
126             bioperl-l at bioperl.org
127              
128             Original author Elia Stupka, elia -at- tigem.it
129              
130             =head1 CONTRIBUTORS
131              
132             Ewan Birney birney at ebi.ac.uk
133             Jason Stajich jason at bioperl.org
134             Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
135             Lincoln Stein lstein at cshl.org
136             Heikki Lehvaslaiho, heikki at ebi.ac.uk
137             Hilmar Lapp, hlapp at gmx.net
138             Donald G. Jackson, donald.jackson at bms.com
139             James Wasmuth, james.wasmuth at ed.ac.uk
140             Brian Osborne, bosborne at alum.mit.edu
141              
142             =head1 APPENDIX
143              
144             The rest of the documentation details each of the object
145             methods. Internal methods are usually preceded with a _
146              
147             =cut
148              
149             # POD is at the end of the module
150              
151             # Let the code begin...
152              
153             package Bio::SeqIO::gbdriver;
154 1     1   5 use strict;
  1         2  
  1         25  
155 1     1   3 use warnings;
  1         1  
  1         23  
156 1     1   4 use Data::Dumper;
  1         1  
  1         45  
157 1     1   718 use Bio::SeqIO::Handler::GenericRichSeqHandler;
  1         2  
  1         61  
158 1     1   7 use Bio::Seq::SeqFactory;
  1         2  
  1         21  
159              
160 1     1   4 use base qw(Bio::SeqIO);
  1         1  
  1         1147  
161              
162             # map all annotation keys to consistent INSDC-based tags for all handlers
163              
164             my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
165             anticodon citation
166             codon codon_start
167             cons_splice direction
168             evidence label
169             mod_base number
170             rpt_type rpt_unit
171             transl_except transl_table
172             usedin
173             );
174              
175              
176             # 1) change this to indicate what should be secondary, not primary, which allows
177             # unknown or new stuff to be passed to handler automatically; current behavior
178             # appends unknowns to previous data, which isn't good since it's subtly passing
179             # by important data
180             # 2) add mapping details about how to separate data using specific delimiters
181              
182              
183             # Features are the only ones postprocessed for now
184             # Uncomment relevant code in next_seq and add keys as needed...
185             my %POSTPROCESS_DATA = map {$_ => 1} qw (FEATURES);
186              
187             sub _initialize {
188 23     23   74 my($self,@args) = @_;
189              
190 23         104 $self->SUPER::_initialize(@args);
191 23         109 my $handler = $self->_rearrange([qw(HANDLER)],@args);
192             # hash for functions for decoding keys.
193 23 50       121 $handler ? $self->seqhandler($handler) :
194             $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
195             -format => 'genbank',
196             -verbose => $self->verbose,
197             -builder => $self->sequence_builder
198             ));
199 23 50       94 if( ! defined $self->sequence_factory ) {
200 23         61 $self->sequence_factory(Bio::Seq::SeqFactory->new
201             (-verbose => $self->verbose(),
202             -type => 'Bio::Seq::RichSeq'));
203             }
204             }
205              
206             =head2 next_seq
207              
208             Title : next_seq
209             Usage : $seq = $stream->next_seq()
210             Function: returns the next sequence in the stream
211             Returns : Bio::Seq object
212             Args :
213              
214             =cut
215              
216             # at this point there is minimal sequence validation,
217             # but the parser seems to hold up nicely so far...
218              
219             sub next_seq {
220 33     33 1 2774 my $self = shift;
221 33         152 local($/) = "\n";
222 33         63 my ($ann, $data, $annkey);
223 33         55 my $endrec = my $seenfeat = 0;
224 33         46 my $seqdata;
225             my $seenlocus;
226 33         92 my $hobj = $self->seqhandler;
227 33         74 my $handlers = $self->seqhandler->handler_methods;
228             #$self->debug(Dumper($handlers));
229             PARSER:
230 33         110 while (defined(my $line = $self->_readline)) {
231 4516 100       9252 next if $line =~ m{^\s*$};
232            
233             # have to catch this at the top of the loop, then exit SEQ loop on //
234             # The reason? The regex match for ann/feat keys also matches some lines
235             # in the sequence; no easy way around it since some feature keys may
236             # start with a number as well
237 4513 100 100     9870 if ($ann && $ann eq 'ORIGIN') {
238             SEQ:
239 25         62 while (defined($line)) {
240 3258 100       4354 last SEQ if index($line,'//') == 0;
241 3233         4739 $seqdata->{DATA} .= uc $line;
242 3233         3989 $line = $self->_readline;
243             }
244 25         607 $seqdata->{DATA} =~ tr{0-9 \n}{}d;
245             }
246 4513 100       6410 $endrec = 1 if (index($line,'//')==0);
247              
248 4513 100 100     13284 if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
249 1450         3259 ($ann, $data) = ($2, $3);
250 1450 100       1924 unless ($seenlocus) {
251 31 50       70 $self->throw("No LOCUS found. Not GenBank in my book!")
252             if ($ann ne 'LOCUS');
253 31         41 $seenlocus = 1;
254             }
255             # use the spacer to determine the annotation type
256 1450   100     3078 my $len = length($1 || '');
257            
258 1450 100 100     3260 $annkey = ($len == 0 || $len > 4) ? 'DATA' : $ann;
259            
260             # Push off the previously cached data to the handler
261             # whenever a new primary annotation or seqfeature is found
262             # Note use of $endrec for catching end of record
263 1450 100 100     3251 if (($annkey eq 'DATA') && $seqdata) {
264 914         1127 chomp $seqdata->{DATA};
265             # postprocessing for some data
266 914 100       1658 if ($seqdata->{NAME} eq 'FEATURES') {
267 547         896 $self->_process_features($seqdata)
268             }
269            
270             # using handlers directly, slightly faster
271             #my $method = (exists $handlers->{ $seqdata->{NAME} }) ?
272             # ($handlers->{$seqdata->{NAME}}) :
273             # (exists $handlers->{'_DEFAULT_'}) ?
274             # ($handlers->{'_DEFAULT_'}) :
275             # undef;
276             #($method) ? ($hobj->$method($seqdata) ) :
277             # $self->debug("No handler defined for ",$seqdata->{NAME},"\n");
278              
279             # using handler methods in the Handler object, more centralized
280             #$self->debug(Dumper($seqdata));
281 914         2146 $hobj->data_handler($seqdata);
282              
283             # bail here on //
284 914 100       1584 last PARSER if $endrec;
285             # reset for next round
286 884         1691 $seqdata = undef;
287             }
288              
289             $seqdata->{NAME} = ($len == 0) ? $ann : # primary ann
290             ($len > 4 ) ? 'FEATURES': # sf feature key
291 1420 100       2992 $seqdata->{NAME}; # all rest are sec. ann
    100          
292 1420 100       2061 if ($seqdata->{NAME} eq 'FEATURES') {
293 547         783 $seqdata->{FEATURE_KEY} = $ann;
294             }
295             # throw back to top if seq is found to avoid regex
296 1420 100       2090 next PARSER if $ann eq 'ORIGIN';
297            
298             } else {
299 3063         7519 ($data = $line) =~ s{^\s+}{};
300 3063         3708 chomp $data;
301             }
302 4458 100 66     10030 my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
303 4458 100       13846 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
304             }
305 33         140 return $hobj->build_sequence;
306             }
307              
308             sub next_chunk {
309 0     0 0 0 my $self = shift;
310 0         0 local($/) = "\n";
311 0         0 my ($ann, $data, $annkey);
312 0         0 my $endrec = my $seenfeat = 0;
313 0         0 my $seqdata;
314             my $seenlocus;
315 0         0 my $hobj = $self->seqhandler;
316             PARSER:
317 0         0 while (defined(my $line = $self->_readline)) {
318 0 0       0 next if $line =~ m{^\s*$};
319             # have to catch this at the top of the loop, then exit SEQ loop on //
320             # The reason? The regex match for ann/feat keys also matches some lines
321             # in the sequence; no easy way around it since some feature keys may
322             # start with a number as well
323 0 0 0     0 if ($ann && $ann eq 'ORIGIN') {
324             SEQ:
325 0         0 while (defined($line)) {
326 0 0       0 last SEQ if index($line,'//') == 0;
327 0         0 $seqdata->{DATA} .= uc $line;
328 0         0 $line = $self->_readline;
329             }
330 0         0 $seqdata->{DATA} =~ tr{0-9 \n}{}d;
331             }
332 0 0       0 $endrec = 1 if (index($line,'//')==0);
333              
334 0 0 0     0 if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
335 0         0 ($ann, $data) = ($2, $3);
336 0 0       0 unless ($seenlocus) {
337 0 0       0 $self->throw("No LOCUS found. Not GenBank in my book!")
338             if ($ann ne 'LOCUS');
339 0         0 $seenlocus = 1;
340             }
341             # use the spacer to determine the annotation type
342 0   0     0 my $len = length($1 || '');
343            
344 0 0 0     0 $annkey = ($len == 0 || $len > 4) ? 'DATA' : $ann;
345            
346             # Push off the previously cached data to the handler
347             # whenever a new primary annotation or seqfeature is found
348             # Note use of $endrec for catching end of record
349 0 0 0     0 if (($annkey eq 'DATA') && $seqdata) {
350 0         0 chomp $seqdata->{DATA};
351             # postprocessing for some data
352 0 0       0 if ($seqdata->{NAME} eq 'FEATURES') {
353 0         0 $self->_process_features($seqdata)
354             }
355             # using handler methods in the Handler object, more centralized
356 0         0 $hobj->data_handler($seqdata);
357             # bail here on //
358 0 0       0 last PARSER if $endrec;
359             # reset for next round
360 0         0 $seqdata = undef;
361             }
362              
363             $seqdata->{NAME} = ($len == 0) ? $ann : # primary ann
364             ($len > 4 ) ? 'FEATURES': # sf feature key
365 0 0       0 $seqdata->{NAME}; # all rest are sec. ann
    0          
366 0 0       0 if ($seqdata->{NAME} eq 'FEATURES') {
367 0         0 $seqdata->{FEATURE_KEY} = $ann;
368             }
369             # throw back to top if seq is found to avoid regex
370 0 0       0 next PARSER if $ann eq 'ORIGIN';
371             } else {
372 0         0 ($data = $line) =~ s{^\s+}{};
373 0         0 chomp $data;
374             }
375 0 0 0     0 my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
376 0 0       0 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
377             }
378             }
379              
380             =head2 write_seq
381              
382             Title : write_seq
383             Usage : $stream->write_seq($seq)
384             Function: writes the $seq object (must be seq) to the stream
385             Returns : 1 for success and 0 for error
386             Args : array of 1 to n Bio::SeqI objects
387              
388             =cut
389              
390             sub write_seq {
391 0     0 1 0 shift->throw("Use Bio::SeqIO::genbank for output");
392             # maybe make a Writer class as well????
393             }
394              
395             =head2 seqhandler
396              
397             Title : seqhandler
398             Usage : $stream->seqhandler($handler)
399             Function: Get/Set the Bio::Seq::HandlerBaseI object
400             Returns : Bio::Seq::HandlerBaseI
401             Args : Bio::Seq::HandlerBaseI
402              
403             =cut
404              
405             sub seqhandler {
406 89     89 1 144 my ($self, $handler) = @_;
407 89 100       153 if ($handler) {
408 23 50 33     132 $self->throw("Not a Bio::HandlerBaseI") unless
409             ref($handler) && $handler->isa("Bio::HandlerBaseI");
410 23         88 $self->{'_seqhandler'} = $handler;
411             }
412 89         181 return $self->{'_seqhandler'};
413             }
414              
415             #=head2 _process_features
416             #
417             # Title : _process_features
418             # Usage : $self->_process_features($seqdata)
419             # Function: Process feature data chunk into usable bits
420             # Returns :
421             # Args : data chunk
422             #
423             #=cut
424              
425             sub _process_features {
426 547     547   678 my ($self, $seqdata) = @_;
427 547         1943 my @ftlines = split m{\n}, $seqdata->{DATA};
428 547         922 delete $seqdata->{DATA};
429             # don't deal with balancing quotes for now; just get rid of them...
430             # Should we worry about checking whether these are balanced
431             # for round-tripping tests?
432 547         842 map { s{"}{}g } @ftlines;
  3136         5961  
433             # all sfs start with the location...
434 547         767 my $qual = 'LOCATION';
435 547         521 my $ct = 0;
436 547         814 for my $qualdata (@ftlines) {
437 3136 100       6961 if ($qualdata =~ m{^/([^=]+)=?(.+)?}) {
438 1724         3543 ($qual, $qualdata) = ($1, $2);
439 1724   100     2547 $qualdata ||= ''; # for those qualifiers that have no data, like 'pseudo'
440             $ct = (exists $seqdata->{$qual}) ?
441 1724 100       2413 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
  98 100       137  
442             : 0 ;
443             }
444 3136 100 100     6624 my $delim = ($qual eq 'translation' || exists $FTQUAL_NO_QUOTE{$qual}) ?
445             '' : ' ';
446             # if more than one, turn into an array ref and append
447 3136 100       3404 if ($ct == 0) {
448             (exists $seqdata->{$qual}) ? ($seqdata->{$qual}.= $delim.$qualdata || '') :
449 2962 100 100     7750 ($seqdata->{$qual} .= $qualdata || '');
      100        
450             } else {
451 174 100       243 if (!ref($seqdata->{$qual})) {
452 69         150 $seqdata->{$qual} = [$seqdata->{$qual}];
453             }
454             (exists $seqdata->{$qual}->[$ct]) ? (($seqdata->{$qual}->[$ct]) .= $delim.$qualdata) :
455 174 100       470 (($seqdata->{$qual}->[$ct]) .= $qualdata);
456             }
457             }
458             }
459              
460             1;
461              
462             __END__