File Coverage

Bio/SeqIO/gbdriver.pm
Criterion Covered Total %
statement 87 125 69.6
branch 54 92 58.7
condition 25 46 54.3
subroutine 10 12 83.3
pod 3 4 75.0
total 179 279 64.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::gbdriver
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Bioperl project bioperl-l(at)bioperl.org
7             #
8             # Copyright Chris Fields and contributors see AUTHORS section
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::gbdriver - GenBank handler-based push parser
17              
18             =head1 SYNOPSIS
19              
20             #It is probably best not to use this object directly, but
21             #rather go through the SeqIO handler:
22              
23             $stream = Bio::SeqIO->new(-file => $filename,
24             -format => 'gbdriver');
25              
26             while ( my $seq = $stream->next_seq() ) {
27             # do something with $seq
28             }
29              
30             =head1 DESCRIPTION
31              
32             This object can transform Bio::Seq objects to and from GenBank flat file
33             databases. The key difference between this parser and the tried-and-true
34             Bio::SeqIO::genbank parser is this version separates the parsing and data
35             manipulation into a 'driver' method (next_seq) and separate object handlers
36             which deal with the data passed to it.
37              
38             =head2 The Driver
39              
40             The main purpose of the driver routine, in this case next_seq(), is to carve out
41             the data into meaningful chunks which are passed along to relevant handlers (see
42             below).
43              
44             Each chunk of data in the has a NAME tag attached to it, similar to that for XML
45             parsing. This designates the type of data passed (annotation type or seqfeature)
46             and the handler to be called for processing the data.
47              
48             For GenBank annotations, the data is divided up and passed along to handlers
49             according to whether the data is tagged with a field name (i.e. LOCUS) and
50             whether the field name represents 'primary' annotation (in this case, is present
51             at the beginning of the line, such as REFERENCE). If the field is primary, it is
52             assigned to the NAME tag. Field names which aren't primary (have at least 2
53             spaces before the name, like ORGANISM) are appended to the preceding primary
54             field name as additional tags.
55              
56             For feature table data each new feature name signals the beginning of a new
57             chunk of data. 'FEATURES' is attached to NAME, the feature key ('CDS', 'gene',
58             etc) is attached as the PRIMARY_ID, and the location is assigned to it's own tag
59             name (LOCATION). Feature qualifiers are added as additional keys, with multiple
60             keys included in an array.
61              
62             Once a particular event occurs (new primary tag, sequence, end of record), the
63             data is passed along to be processed by a handler or (if no handler is defined)
64             tossed away.
65              
66             Internally, the hash ref for a representative annotation (here a REFERENCE)
67             looks like this:
68              
69             $VAR1 = {
70             'JOURNAL' => 'Unpublished (2003)',
71             'TITLE' => 'The DNA sequence of Homo sapiens',
72             'NAME' => 'REFERENCE',
73             'REFERENCE' => '1 (bases 1 to 10001)',
74             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
75             };
76              
77             and a SeqFeature as this:
78              
79             $VAR1 = {
80             'db_xref' => [
81             'GeneID:127086',
82             'InterimID:127086'
83             ],
84             'LOCATION' => 'complement(3024..6641)',
85             'NAME' => 'FEATURES',
86             'FEATURE_KEY' => 'gene',
87             'gene' => 'LOC127086',
88             'note' => 'Derived by automated computational analysis using
89             gene prediction method: GNOMON.'
90             };
91              
92             Note that any driver implementation would suffice as long as it fulfilled the
93             requirements above.
94              
95             =head1 FEEDBACK
96              
97             =head2 Mailing Lists
98              
99             User feedback is an integral part of the evolution of this and other
100             Bioperl modules. Send your comments and suggestions preferably to one
101             of the Bioperl mailing lists. Your participation is much appreciated.
102              
103             bioperl-l@bioperl.org - General discussion
104             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
105              
106             =head2 Support
107              
108             Please direct usage questions or support issues to the mailing list:
109              
110             I
111              
112             rather than to the module maintainer directly. Many experienced and
113             reponsive experts will be able look at the problem and quickly
114             address it. Please include a thorough description of the problem
115             with code and data examples if at all possible.
116              
117             =head2 Reporting Bugs
118              
119             Report bugs to the Bioperl bug tracking system to help us keep track
120             the bugs and their resolution. Bug reports can be submitted via the web:
121              
122             https://github.com/bioperl/bioperl-live/issues
123              
124             =head1 AUTHOR - Bioperl Project
125              
126             bioperl-l at bioperl.org
127              
128             Original author Elia Stupka, elia -at- tigem.it
129              
130             =head1 CONTRIBUTORS
131              
132             Ewan Birney birney at ebi.ac.uk
133             Jason Stajich jason at bioperl.org
134             Chris Mungall cjm at fruitfly.bdgp.berkeley.edu
135             Lincoln Stein lstein at cshl.org
136             Heikki Lehvaslaiho, heikki at ebi.ac.uk
137             Hilmar Lapp, hlapp at gmx.net
138             Donald G. Jackson, donald.jackson at bms.com
139             James Wasmuth, james.wasmuth at ed.ac.uk
140             Brian Osborne, bosborne at alum.mit.edu
141              
142             =head1 APPENDIX
143              
144             The rest of the documentation details each of the object
145             methods. Internal methods are usually preceded with a _
146              
147             =cut
148              
149             # POD is at the end of the module
150              
151             # Let the code begin...
152              
153             package Bio::SeqIO::gbdriver;
154 1     1   3 use strict;
  1         1  
  1         23  
155 1     1   3 use warnings;
  1         1  
  1         19  
156 1     1   3 use Data::Dumper;
  1         1  
  1         39  
157 1     1   770 use Bio::SeqIO::Handler::GenericRichSeqHandler;
  1         2  
  1         33  
158 1     1   5 use Bio::Seq::SeqFactory;
  1         1  
  1         18  
159              
160 1     1   3 use base qw(Bio::SeqIO);
  1         1  
  1         1088  
161              
162             # map all annotation keys to consistent INSDC-based tags for all handlers
163              
164             my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
165             anticodon citation
166             codon codon_start
167             cons_splice direction
168             evidence label
169             mod_base number
170             rpt_type rpt_unit
171             transl_except transl_table
172             usedin
173             );
174              
175              
176             # 1) change this to indicate what should be secondary, not primary, which allows
177             # unknown or new stuff to be passed to handler automatically; current behavior
178             # appends unknowns to previous data, which isn't good since it's subtly passing
179             # by important data
180             # 2) add mapping details about how to separate data using specific delimiters
181              
182              
183             # Features are the only ones postprocessed for now
184             # Uncomment relevant code in next_seq and add keys as needed...
185             my %POSTPROCESS_DATA = map {$_ => 1} qw (FEATURES);
186              
187             sub _initialize {
188 23     23   40 my($self,@args) = @_;
189              
190 23         65 $self->SUPER::_initialize(@args);
191 23         78 my $handler = $self->_rearrange([qw(HANDLER)],@args);
192             # hash for functions for decoding keys.
193 23 50       103 $handler ? $self->seqhandler($handler) :
194             $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
195             -format => 'genbank',
196             -verbose => $self->verbose,
197             -builder => $self->sequence_builder
198             ));
199 23 50       67 if( ! defined $self->sequence_factory ) {
200 23         52 $self->sequence_factory(Bio::Seq::SeqFactory->new
201             (-verbose => $self->verbose(),
202             -type => 'Bio::Seq::RichSeq'));
203             }
204             }
205              
206             =head2 next_seq
207              
208             Title : next_seq
209             Usage : $seq = $stream->next_seq()
210             Function: returns the next sequence in the stream
211             Returns : Bio::Seq object
212             Args :
213              
214             =cut
215              
216             # at this point there is minimal sequence validation,
217             # but the parser seems to hold up nicely so far...
218              
219             sub next_seq {
220 33     33 1 2272 my $self = shift;
221 33         124 local($/) = "\n";
222 33         46 my ($ann, $data, $annkey);
223 33         43 my $endrec = my $seenfeat = 0;
224 33         29 my $seqdata;
225             my $seenlocus;
226 33         58 my $hobj = $self->seqhandler;
227 33         50 my $handlers = $self->seqhandler->handler_methods;
228             #$self->debug(Dumper($handlers));
229             PARSER:
230 33         78 while (defined(my $line = $self->_readline)) {
231 4516 100       8389 next if $line =~ m{^\s*$};
232            
233             # have to catch this at the top of the loop, then exit SEQ loop on //
234             # The reason? The regex match for ann/feat keys also matches some lines
235             # in the sequence; no easy way around it since some feature keys may
236             # start with a number as well
237 4513 100 100     11314 if ($ann && $ann eq 'ORIGIN') {
238             SEQ:
239 25         48 while (defined($line)) {
240 3258 100       4145 last SEQ if index($line,'//') == 0;
241 3233         3940 $seqdata->{DATA} .= uc $line;
242 3233         3625 $line = $self->_readline;
243             }
244 25         549 $seqdata->{DATA} =~ tr{0-9 \n}{}d;
245             }
246 4513 100       6187 $endrec = 1 if (index($line,'//')==0);
247              
248 4513 100 100     14163 if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
249 1450         2656 ($ann, $data) = ($2, $3);
250 1450 100       1708 unless ($seenlocus) {
251 31 50       53 $self->throw("No LOCUS found. Not GenBank in my book!")
252             if ($ann ne 'LOCUS');
253 31         24 $seenlocus = 1;
254             }
255             # use the spacer to determine the annotation type
256 1450   100     3163 my $len = length($1 || '');
257            
258 1450 100 100     3774 $annkey = ($len == 0 || $len > 4) ? 'DATA' : $ann;
259            
260             # Push off the previously cached data to the handler
261             # whenever a new primary annotation or seqfeature is found
262             # Note use of $endrec for catching end of record
263 1450 100 100     3389 if (($annkey eq 'DATA') && $seqdata) {
264 914         938 chomp $seqdata->{DATA};
265             # postprocessing for some data
266 914 100       1194 if ($seqdata->{NAME} eq 'FEATURES') {
267 547         818 $self->_process_features($seqdata)
268             }
269            
270             # using handlers directly, slightly faster
271             #my $method = (exists $handlers->{ $seqdata->{NAME} }) ?
272             # ($handlers->{$seqdata->{NAME}}) :
273             # (exists $handlers->{'_DEFAULT_'}) ?
274             # ($handlers->{'_DEFAULT_'}) :
275             # undef;
276             #($method) ? ($hobj->$method($seqdata) ) :
277             # $self->debug("No handler defined for ",$seqdata->{NAME},"\n");
278              
279             # using handler methods in the Handler object, more centralized
280             #$self->debug(Dumper($seqdata));
281 914         1995 $hobj->data_handler($seqdata);
282              
283             # bail here on //
284 914 100       1450 last PARSER if $endrec;
285             # reset for next round
286 884         774 $seqdata = undef;
287             }
288              
289             $seqdata->{NAME} = ($len == 0) ? $ann : # primary ann
290             ($len > 4 ) ? 'FEATURES': # sf feature key
291 1420 100       3793 $seqdata->{NAME}; # all rest are sec. ann
    100          
292 1420 100       2057 if ($seqdata->{NAME} eq 'FEATURES') {
293 547         636 $seqdata->{FEATURE_KEY} = $ann;
294             }
295             # throw back to top if seq is found to avoid regex
296 1420 100       1895 next PARSER if $ann eq 'ORIGIN';
297            
298             } else {
299 3063         6038 ($data = $line) =~ s{^\s+}{};
300 3063         2871 chomp $data;
301             }
302 4458 100 66     12210 my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
303 4458 100       13369 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
304             }
305 33         93 return $hobj->build_sequence;
306             }
307              
308             sub next_chunk {
309 0     0 0 0 my $self = shift;
310 0         0 local($/) = "\n";
311 0         0 my ($ann, $data, $annkey);
312 0         0 my $endrec = my $seenfeat = 0;
313 0         0 my $seqdata;
314             my $seenlocus;
315 0         0 my $hobj = $self->seqhandler;
316             PARSER:
317 0         0 while (defined(my $line = $self->_readline)) {
318 0 0       0 next if $line =~ m{^\s*$};
319             # have to catch this at the top of the loop, then exit SEQ loop on //
320             # The reason? The regex match for ann/feat keys also matches some lines
321             # in the sequence; no easy way around it since some feature keys may
322             # start with a number as well
323 0 0 0     0 if ($ann && $ann eq 'ORIGIN') {
324             SEQ:
325 0         0 while (defined($line)) {
326 0 0       0 last SEQ if index($line,'//') == 0;
327 0         0 $seqdata->{DATA} .= uc $line;
328 0         0 $line = $self->_readline;
329             }
330 0         0 $seqdata->{DATA} =~ tr{0-9 \n}{}d;
331             }
332 0 0       0 $endrec = 1 if (index($line,'//')==0);
333              
334 0 0 0     0 if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
335 0         0 ($ann, $data) = ($2, $3);
336 0 0       0 unless ($seenlocus) {
337 0 0       0 $self->throw("No LOCUS found. Not GenBank in my book!")
338             if ($ann ne 'LOCUS');
339 0         0 $seenlocus = 1;
340             }
341             # use the spacer to determine the annotation type
342 0   0     0 my $len = length($1 || '');
343            
344 0 0 0     0 $annkey = ($len == 0 || $len > 4) ? 'DATA' : $ann;
345            
346             # Push off the previously cached data to the handler
347             # whenever a new primary annotation or seqfeature is found
348             # Note use of $endrec for catching end of record
349 0 0 0     0 if (($annkey eq 'DATA') && $seqdata) {
350 0         0 chomp $seqdata->{DATA};
351             # postprocessing for some data
352 0 0       0 if ($seqdata->{NAME} eq 'FEATURES') {
353 0         0 $self->_process_features($seqdata)
354             }
355             # using handler methods in the Handler object, more centralized
356 0         0 $hobj->data_handler($seqdata);
357             # bail here on //
358 0 0       0 last PARSER if $endrec;
359             # reset for next round
360 0         0 $seqdata = undef;
361             }
362              
363             $seqdata->{NAME} = ($len == 0) ? $ann : # primary ann
364             ($len > 4 ) ? 'FEATURES': # sf feature key
365 0 0       0 $seqdata->{NAME}; # all rest are sec. ann
    0          
366 0 0       0 if ($seqdata->{NAME} eq 'FEATURES') {
367 0         0 $seqdata->{FEATURE_KEY} = $ann;
368             }
369             # throw back to top if seq is found to avoid regex
370 0 0       0 next PARSER if $ann eq 'ORIGIN';
371             } else {
372 0         0 ($data = $line) =~ s{^\s+}{};
373 0         0 chomp $data;
374             }
375 0 0 0     0 my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
376 0 0       0 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
377             }
378             }
379              
380             =head2 write_seq
381              
382             Title : write_seq
383             Usage : $stream->write_seq($seq)
384             Function: writes the $seq object (must be seq) to the stream
385             Returns : 1 for success and 0 for error
386             Args : array of 1 to n Bio::SeqI objects
387              
388             =cut
389              
390             sub write_seq {
391 0     0 1 0 shift->throw("Use Bio::SeqIO::genbank for output");
392             # maybe make a Writer class as well????
393             }
394              
395             =head2 seqhandler
396              
397             Title : seqhandler
398             Usage : $stream->seqhandler($handler)
399             Function: Get/Set the Bio::Seq::HandlerBaseI object
400             Returns : Bio::Seq::HandlerBaseI
401             Args : Bio::Seq::HandlerBaseI
402              
403             =cut
404              
405             sub seqhandler {
406 89     89 1 65 my ($self, $handler) = @_;
407 89 100       158 if ($handler) {
408 23 50 33     129 $self->throw("Not a Bio::HandlerBaseI") unless
409             ref($handler) && $handler->isa("Bio::HandlerBaseI");
410 23         64 $self->{'_seqhandler'} = $handler;
411             }
412 89         142 return $self->{'_seqhandler'};
413             }
414              
415             #=head2 _process_features
416             #
417             # Title : _process_features
418             # Usage : $self->_process_features($seqdata)
419             # Function: Process feature data chunk into usable bits
420             # Returns :
421             # Args : data chunk
422             #
423             #=cut
424              
425             sub _process_features {
426 547     547   489 my ($self, $seqdata) = @_;
427 547         1757 my @ftlines = split m{\n}, $seqdata->{DATA};
428 547         688 delete $seqdata->{DATA};
429             # don't deal with balancing quotes for now; just get rid of them...
430             # Should we worry about checking whether these are balanced
431             # for round-tripping tests?
432 547         567 map { s{"}{}g } @ftlines;
  3136         4220  
433             # all sfs start with the location...
434 547         434 my $qual = 'LOCATION';
435 547         389 my $ct = 0;
436 547         656 for my $qualdata (@ftlines) {
437 3136 100       6360 if ($qualdata =~ m{^/([^=]+)=?(.+)?}) {
438 1724         3163 ($qual, $qualdata) = ($1, $2);
439 1724   100     2211 $qualdata ||= ''; # for those qualifiers that have no data, like 'pseudo'
440             $ct = (exists $seqdata->{$qual}) ?
441 1724 100       2136 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
  98 100       109  
442             : 0 ;
443             }
444 3136 100 66     5660 my $delim = ($qual eq 'translation' || exists $FTQUAL_NO_QUOTE{$qual}) ?
445             '' : ' ';
446             # if more than one, turn into an array ref and append
447 3136 100       2906 if ($ct == 0) {
448             (exists $seqdata->{$qual}) ? ($seqdata->{$qual}.= $delim.$qualdata || '') :
449 2962 100 100     7759 ($seqdata->{$qual} .= $qualdata || '');
      100        
450             } else {
451 174 100       243 if (!ref($seqdata->{$qual})) {
452 69         110 $seqdata->{$qual} = [$seqdata->{$qual}];
453             }
454             (exists $seqdata->{$qual}->[$ct]) ? (($seqdata->{$qual}->[$ct]) .= $delim.$qualdata) :
455 174 100       432 (($seqdata->{$qual}->[$ct]) .= $qualdata);
456             }
457             }
458             }
459              
460             1;
461              
462             __END__