File Coverage

Bio/SearchIO/blasttable.pm
Criterion Covered Total %
statement 150 174 86.2
branch 55 76 72.3
condition 22 36 61.1
subroutine 17 21 80.9
pod 11 12 91.6
total 255 319 79.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SearchIO::blasttable
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SearchIO::blasttable - Driver module for SearchIO for parsing NCBI -m 8/9 format
17              
18             =head1 SYNOPSIS
19              
20             # do not use this module directly
21             use Bio::SearchIO;
22             my $parser = Bio::SearchIO->new(-file => $file,
23             -format => 'blasttable');
24              
25             while( my $result = $parser->next_result ) {
26             }
27              
28             =head1 DESCRIPTION
29              
30             This module will support parsing NCBI -m 8 or -m 9 tabular output
31             and WU-BLAST -mformat 2 or -mformat 3 tabular output.
32              
33             =head1 FEEDBACK
34              
35             =head2 Mailing Lists
36              
37             User feedback is an integral part of the evolution of this and other
38             Bioperl modules. Send your comments and suggestions preferably to
39             the Bioperl mailing list. Your participation is much appreciated.
40              
41             bioperl-l@bioperl.org - General discussion
42             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43              
44             =head2 Support
45              
46             Please direct usage questions or support issues to the mailing list:
47              
48             I
49              
50             rather than to the module maintainer directly. Many experienced and
51             reponsive experts will be able look at the problem and quickly
52             address it. Please include a thorough description of the problem
53             with code and data examples if at all possible.
54              
55             =head2 Reporting Bugs
56              
57             Report bugs to the Bioperl bug tracking system to help us keep track
58             of the bugs and their resolution. Bug reports can be submitted via
59             the web:
60              
61             https://github.com/bioperl/bioperl-live/issues
62              
63             =head1 AUTHOR - Jason Stajich
64              
65             Email jason-at-bioperl-dot-org
66              
67             =head1 APPENDIX
68              
69             The rest of the documentation details each of the object methods.
70             Internal methods are usually preceded with a _
71              
72             =cut
73              
74             # Let the code begin...
75              
76              
77             package Bio::SearchIO::blasttable;
78 2     2   8 use vars qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
  2         2  
  2         145  
79 2     2   9 use strict;
  2         2  
  2         36  
80 2     2   563 use Bio::Search::Result::ResultFactory;
  2         3  
  2         45  
81 2     2   553 use Bio::Search::Hit::HitFactory;
  2         2  
  2         41  
82 2     2   566 use Bio::Search::HSP::HSPFactory;
  2         4  
  2         177  
83              
84             $DefaultProgramName = 'BLASTN';
85             $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
86              
87             # mapping of terms to Bioperl hash keys
88             %MODEMAP = (
89             'Result' => 'result',
90             'Hit' => 'hit',
91             'Hsp' => 'hsp'
92             );
93              
94             %MAPPING = (
95             'Hsp_bit-score' => 'HSP-bits',
96             'Hsp_score' => 'HSP-score',
97             'Hsp_evalue' => 'HSP-evalue',
98             'Hsp_query-from' => 'HSP-query_start',
99             'Hsp_query-to' => 'HSP-query_end',
100             'Hsp_hit-from' => 'HSP-hit_start',
101             'Hsp_hit-to' => 'HSP-hit_end',
102             'Hsp_positive' => 'HSP-conserved',
103             'Hsp_identity' => 'HSP-identical',
104             'Hsp_mismatches' => 'HSP-mismatches',
105             'Hsp_qgapblocks' => 'HSP-query_gapblocks',
106             'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
107             'Hsp_gaps' => 'HSP-hsp_gaps',
108             'Hsp_hitgaps' => 'HSP-hit_gaps',
109             'Hsp_querygaps' => 'HSP-query_gaps',
110             'Hsp_align-len' => 'HSP-hsp_length',
111             'Hsp_query-frame'=> 'HSP-query_frame',
112             'Hsp_hit-frame' => 'HSP-hit_frame',
113              
114             'Hit_id' => 'HIT-name',
115             'Hit_len' => 'HIT-length',
116             'Hit_accession' => 'HIT-accession',
117             'Hit_def' => 'HIT-description',
118             'Hit_signif' => 'HIT-significance',
119             'Hit_score' => 'HIT-score',
120             'Hit_bits' => 'HIT-bits',
121              
122             'Result_program' => 'RESULT-algorithm_name',
123             'Result_version' => 'RESULT-algorithm_version',
124             'Result_query-def'=> 'RESULT-query_name',
125             'Result_query-len'=> 'RESULT-query_length',
126             'Result_query-acc'=> 'RESULT-query_accession',
127             'Result_querydesc'=> 'RESULT-query_description',
128             'Result_db' => 'RESULT-database_name',
129             'Result_db-len' => 'RESULT-database_entries',
130             'Result_db-let' => 'RESULT-database_letters',
131             );
132              
133 2     2   12 use base qw(Bio::SearchIO);
  2         2  
  2         2928  
134              
135             =head2 new
136              
137             Title : new
138             Usage : my $obj = Bio::SearchIO::blasttable->new();
139             Function: Builds a new Bio::SearchIO::blasttable object
140             Returns : an instance of Bio::SearchIO::blasttable
141             Args :
142              
143              
144             =cut
145              
146             sub _initialize {
147 11     11   22 my ($self,@args) = @_;
148 11         32 $self->SUPER::_initialize(@args);
149              
150 11         35 my ($pname) = $self->_rearrange([qw(PROGRAM_NAME)],
151             @args);
152 11   66     57 $self->program_name($pname || $DefaultProgramName);
153 11         32 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
154 11         19 $self->_eventHandler->register_factory('hit', Bio::Search::Hit::HitFactory->new(-type => 'Bio::Search::Hit::GenericHit'));
155 11         15 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::GenericHSP'));
156             }
157              
158              
159             =head2 next_result
160              
161             Title : next_result
162             Usage : my $result = $parser->next_result
163             Function: Parse the next result from the data stream
164             Returns : L
165             Args : none
166              
167              
168             =cut
169              
170             sub next_result{
171 12     12 1 518 my ($self) = @_;
172 12         21 my ($lastquery,$lasthit);
173 12         48 local $/ = "\n";
174 12         14 local $_;
175 12         15 my ($alg, $ver);
176 12         36 while( defined ($_ = $self->_readline) ) {
177             # WU-BLAST -mformat 3 only
178 766 100       1988 if(m{^#\s((?:\S+?)?BLAST[NPX])\s(\d+\.\d+.+\d{4}\])}) {
    100          
179 1         3 ($alg, $ver) = ($1, $2);
180             # only one header for whole file with WU-BLAST
181             # so $alg and $ver won't get set properly for
182             # each result
183 1 50       6 $self->program_name($alg) if $alg;
184 1 50       6 $self->element({'Name' => 'Result_version',
185             'Data' => $ver}) if $ver;
186 1         3 next;
187             }
188             # -m 9 only
189             elsif(m{^#\s+((?:\S+?)?BLAST[NPX])\s+(.+)}) {
190 1         3 ($alg, $ver) = ($1, $2);
191 1         4 next;
192             }
193 764 100 100     2801 next if /^#/ || /^\s*$/;
194              
195 686         2130 my @fields = split;
196 686 50       997 next if @fields == 1;
197 686         564 my ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
198             $qstart,$qend,$hstart,$hend,$evalue,$bits);
199             # WU-BLAST-specific
200 0         0 my ($num_scores, $raw_score, $identities, $positives, $percent_pos,
201             $qgap_blocks,$qgaps, $sgap_blocks, $sgaps, $qframe,
202             $sframe);
203             # NCBI -m8 and -m9
204 686 100 33     1107 if (@fields == 12) {
    100          
    50          
205 8         12 ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
206             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
207             # NCBI -m8 and -m9, v 2.2.18+
208             } elsif (@fields == 13) {
209 675         1274 ($qname, $hname, $percent_id, $percent_pos, $hsp_len, $mismatches, $gapsm,
210             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
211             }
212             # WU-BLAST -mformat 2 and 3
213             elsif ((@fields == 22) or (@fields == 24)) {
214 3         11 ($qname,$hname,$evalue,$num_scores, $bits, $raw_score, $hsp_len,
215             $identities, $positives,$mismatches, $percent_id, $percent_pos,
216             $qgap_blocks, $qgaps, $sgap_blocks, $sgaps, $qframe, $qstart,
217             $qend, $sframe, $hstart,$hend,) = @fields;
218             # we need total gaps in the alignment
219 3         4 $gapsm=$qgaps+$sgaps;
220             }
221              
222 686 100 100     2129 if (@fields == 12 || @fields == 13) {
223             # need to determine total gaps in the alignment for NCBI output
224             # since NCBI reports number of gapopens and NOT total gaps
225 683         1202 my $qlen = abs($qstart - $qend) + 1;
226 683         504 my $querygaps = $hsp_len - $qlen;
227 683         558 my $hlen = abs($hstart - $hend) + 1;
228 683         476 my $hitgaps = $hsp_len - $hlen;
229 683         638 $gapsm = $querygaps + $hitgaps;
230             }
231              
232             # Remember Jim's code is 0 based
233 686 100 100     2679 if( defined $lastquery &&
    100          
    100          
234             $lastquery ne $qname ) {
235 4         13 $self->end_element({'Name' => 'Hit'});
236 4         16 $self->end_element({'Name' => 'Result'});
237 4         18 $self->_pushback($_);
238 4         11 return $self->end_document;
239             } elsif( ! defined $lastquery ) {
240 11         23 $self->{'_result_count'}++;
241 11         47 $self->start_element({'Name' => 'Result'});
242 11   66     49 $self->element({'Name' => 'Result_program',
243             'Data' => $alg || $self->program_name});
244 11 100       33 $self->element({'Name' => 'Result_version',
245             'Data' => $ver}) if $ver;
246 11         38 $self->element({'Name' => 'Result_query-def',
247             'Data' => $qname});
248 11         32 $self->start_element({'Name' => 'Hit'});
249 11         36 $self->element({'Name' => 'Hit_id',
250             'Data' => $hname});
251             # we'll store the 1st hsp bits as the hit bits
252 11         37 $self->element({'Name' => 'Hit_bits',
253             'Data' => $bits});
254             # we'll store the 1st hsp value as the hit evalue
255 11         36 $self->element({'Name' => 'Hit_signif',
256             'Data' => $evalue});
257            
258             } elsif( $lasthit ne $hname ) {
259 640 50       957 if( $self->in_element('hit') ) {
260 640         1272 $self->end_element({'Name' => 'Hit'});
261             }
262 640         1491 $self->start_element({'Name' => 'Hit'});
263 640         1496 $self->element({'Name' => 'Hit_id',
264             'Data' => $hname});
265             # we'll store the 1st hsp bits as the hit bits
266 640         1452 $self->element({'Name' => 'Hit_bits',
267             'Data' => $bits});
268             # we'll store the 1st hsp value as the hit evalue
269 640         1358 $self->element({'Name' => 'Hit_signif',
270             'Data' => $evalue});
271             }
272 682         1131 my $identical = $hsp_len - $mismatches - $gapsm;
273             # If $positives value is absent, try to recover it from $percent_pos,
274             # this is better than letting the program to assume "conserved == identical"
275 682 100 100     2417 if (not defined $positives and defined $percent_pos) {
276 671         2013 $positives = sprintf "%d", ($percent_pos * $hsp_len / 100);
277             }
278 682         1200 $self->start_element({'Name' => 'Hsp'});
279 682         1428 $self->element({'Name' => 'Hsp_evalue',
280             'Data' => $evalue});
281 682         1581 $self->element({'Name' => 'Hsp_bit-score',
282             'Data' => $bits});
283 682         1530 $self->element({'Name' => 'Hsp_identity',
284             'Data' => $identical});
285 682         1546 $self->element({'Name' => 'Hsp_positive',
286             'Data' => $positives});
287 682         1462 $self->element({'Name' => 'Hsp_gaps',
288             'Data' => $gapsm});
289 682         1696 $self->element({'Name' => 'Hsp_query-from',
290             'Data' => $qstart});
291 682         1550 $self->element({'Name' => 'Hsp_query-to',
292             'Data' => $qend});
293              
294 682         1499 $self->element({'Name' => 'Hsp_hit-from',
295             'Data' => $hstart });
296 682         1493 $self->element({'Name' => 'Hsp_hit-to',
297             'Data' => $hend });
298 682         1608 $self->element({'Name' => 'Hsp_align-len',
299             'Data' => $hsp_len});
300 682         1382 $self->end_element({'Name' => 'Hsp'});
301 682         854 $lastquery = $qname;
302 682         2393 $lasthit = $hname;
303             }
304             # fencepost
305 8 100 66     50 if( defined $lasthit && defined $lastquery ) {
306 7 50       19 if( $self->in_element('hit') ) {
307 7         20 $self->end_element({'Name' => 'Hit'});
308             }
309 7         26 $self->end_element({'Name' => 'Result'});
310 7         25 return $self->end_document;
311             }
312             }
313              
314             =head2 start_element
315              
316             Title : start_element
317             Usage : $eventgenerator->start_element
318             Function: Handles a start element event
319             Returns : none
320             Args : hashref with at least 2 keys 'Data' and 'Name'
321              
322              
323             =cut
324              
325             sub start_element{
326 10142     10142 1 6828 my ($self,$data) = @_;
327             # we currently don't care about attributes
328 10142         7753 my $nm = $data->{'Name'};
329 10142 100       13566 if( my $type = $MODEMAP{$nm} ) {
330 1344         1558 $self->_mode($type);
331 1344 50       1531 if( $self->_will_handle($type) ) {
332 1344         2296 my $func = sprintf("start_%s",lc $type);
333 1344         2073 $self->_eventHandler->$func($data->{'Attributes'});
334             }
335 1344         1436 unshift @{$self->{'_elements'}}, $type;
  1344         1842  
336             }
337 10142 100       14369 if($nm eq 'Result') {
338 11         16 $self->{'_values'} = {};
339 11         22 $self->{'_result'}= undef;
340 11         16 $self->{'_mode'} = '';
341             }
342              
343             }
344              
345             =head2 end_element
346              
347             Title : start_element
348             Usage : $eventgenerator->end_element
349             Function: Handles an end element event
350             Returns : none
351             Args : hashref with at least 2 keys 'Data' and 'Name'
352              
353              
354             =cut
355              
356             sub end_element {
357 10142     10142 1 7059 my ($self,$data) = @_;
358 10142         7511 my $nm = $data->{'Name'};
359 10142         6547 my $rc;
360             # Hsp are sort of weird, in that they end when another
361             # object begins so have to detect this in end_element for now
362            
363 10142 100       16035 if( my $type = $MODEMAP{$nm} ) {
    50          
364 1344 50       1786 if( $self->_will_handle($type) ) {
365 1344         2819 my $func = sprintf("end_%s",lc $type);
366             $rc = $self->_eventHandler->$func($self->{'_reporttype'},
367 1344         2267 $self->{'_values'});
368             }
369 1344         1130 shift @{$self->{'_elements'}};
  1344         1488  
370              
371             } elsif( $MAPPING{$nm} ) {
372 8798 50       9135 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
373 0         0 my $key = (keys %{$MAPPING{$nm}})[0];
  0         0  
374 0         0 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
375             } else {
376 8798         10801 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
377             }
378             } else {
379 0         0 $self->warn( "unknown nm $nm ignoring\n");
380             }
381 10142         7276 $self->{'_last_data'} = ''; # remove read data if we are at
382             # end of an element
383 10142 100       12325 $self->{'_result'} = $rc if( $nm eq 'Result' );
384 10142         9117 return $rc;
385              
386             }
387              
388             =head2 element
389              
390             Title : element
391             Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
392             Function: Convience method that calls start_element, characters, end_element
393             Returns : none
394             Args : Hash ref with the keys 'Name' and 'Data'
395              
396              
397             =cut
398              
399             sub element{
400 8798     8798 1 6632 my ($self,$data) = @_;
401 8798         8284 $self->start_element($data);
402 8798         8563 $self->characters($data);
403 8798         8976 $self->end_element($data);
404             }
405              
406              
407             =head2 characters
408              
409             Title : characters
410             Usage : $eventgenerator->characters($str)
411             Function: Send a character events
412             Returns : none
413             Args : string
414              
415              
416             =cut
417              
418             sub characters{
419 8798     8798 1 5960 my ($self,$data) = @_;
420              
421             # deep bug fix: set $self->{'_last_data'} to undef if $$data{Data} is
422             # a valid slot, whose value is undef --
423             # allows an undef to be propagated to object constructors and
424             # handled there as desired; in particular, when Hsp_postive => -conserved
425             # is not defined (in BLASTN, e.g.), the value of hsp's {CONSERVED} property is
426             # set to the value of {IDENTICAL}.
427             #/maj
428             # return unless ( defined $data->{'Data'} );
429 8798 50       23033 return unless ( grep /Data/, keys %$data );
430 8798 100       11737 if ( !defined $data->{'Data'} ) {
431 8         8 $self->{'_last_data'} = undef;
432 8         8 return;
433             }
434 8790 50       13979 if( $data->{'Data'} =~ /^\s+$/ ) {
435 0 0       0 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
436             }
437              
438 8790 50 66     8786 if( $self->in_element('hsp') &&
439             $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
440            
441 0         0 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
442             }
443            
444 8790         9778 $self->{'_last_data'} = $data->{'Data'};
445             }
446              
447             =head2 _mode
448              
449             Title : _mode
450             Usage : $obj->_mode($newval)
451             Function:
452             Example :
453             Returns : value of _mode
454             Args : newvalue (optional)
455              
456              
457             =cut
458              
459             sub _mode{
460 1344     1344   1115 my ($self,$value) = @_;
461 1344 50       1786 if( defined $value) {
462 1344         1190 $self->{'_mode'} = $value;
463             }
464 1344         1236 return $self->{'_mode'};
465             }
466              
467             =head2 within_element
468              
469             Title : within_element
470             Usage : if( $eventgenerator->within_element($element) ) {}
471             Function: Test if we are within a particular element
472             This is different than 'in' because within can be tested
473             for a whole block.
474             Returns : boolean
475             Args : string element name
476              
477              
478             =cut
479              
480             sub within_element{
481 0     0 1 0 my ($self,$name) = @_;
482             return 0 if ( ! defined $name &&
483             ! defined $self->{'_elements'} ||
484 0 0 0     0 scalar @{$self->{'_elements'}} == 0) ;
  0   0     0  
485 0         0 foreach ( @{$self->{'_elements'}} ) {
  0         0  
486 0 0       0 if( $_ eq $name ) {
487 0         0 return 1;
488             }
489             }
490 0         0 return 0;
491             }
492              
493             =head2 in_element
494              
495             Title : in_element
496             Usage : if( $eventgenerator->in_element($element) ) {}
497             Function: Test if we are in a particular element
498             This is different than 'in' because within can be tested
499             for a whole block.
500             Returns : boolean
501             Args : string element name
502              
503              
504             =cut
505              
506             sub in_element{
507 9437     9437 1 7026 my ($self,$name) = @_;
508 9437 100       11927 return 0 if ! defined $self->{'_elements'}->[0];
509 9436         31375 return ( $self->{'_elements'}->[0] eq $name)
510             }
511              
512              
513             =head2 start_document
514              
515             Title : start_document
516             Usage : $eventgenerator->start_document
517             Function: Handles a start document event
518             Returns : none
519             Args : none
520              
521              
522             =cut
523              
524             sub start_document{
525 0     0 1 0 my ($self) = @_;
526 0         0 $self->{'_lasttype'} = '';
527 0         0 $self->{'_values'} = {};
528 0         0 $self->{'_result'}= undef;
529 0         0 $self->{'_mode'} = '';
530 0         0 $self->{'_elements'} = [];
531             }
532              
533              
534             =head2 end_document
535              
536             Title : end_document
537             Usage : $eventgenerator->end_document
538             Function: Handles an end document event
539             Returns : Bio::Search::Result::ResultI object
540             Args : none
541              
542              
543             =cut
544              
545             sub end_document{
546 11     11 1 15 my ($self,@args) = @_;
547 11         90 return $self->{'_result'};
548             }
549              
550             =head2 result_count
551              
552             Title : result_count
553             Usage : my $count = $searchio->result_count
554             Function: Returns the number of results we have processed
555             Returns : integer
556             Args : none
557              
558              
559             =cut
560              
561             sub result_count {
562 0     0 1 0 my $self = shift;
563 0         0 return $self->{'_result_count'};
564             }
565              
566 0     0 0 0 sub report_count { shift->result_count }
567              
568              
569             =head2 program_name
570              
571             Title : program_name
572             Usage : $obj->program_name($newval)
573             Function: Get/Set the program name
574             Returns : value of program_name (a scalar)
575             Args : on set, new value (a scalar or undef, optional)
576              
577              
578             =cut
579              
580             sub program_name{
581 21     21 1 23 my $self = shift;
582              
583 21 100       48 $self->{'program_name'} = shift if @_;
584 21   33     74 return $self->{'program_name'} || $DefaultProgramName;
585             }
586              
587              
588             =head2 _will_handle
589              
590             Title : _will_handle
591             Usage : Private method. For internal use only.
592             if( $self->_will_handle($type) ) { ... }
593             Function: Provides an optimized way to check whether or not an element of a
594             given type is to be handled.
595             Returns : Reference to EventHandler object if the element type is to be handled.
596             undef if the element type is not to be handled.
597             Args : string containing type of element.
598              
599             Optimizations:
600              
601             =over 2
602              
603             =item 1
604              
605             Using the cached pointer to the EventHandler to minimize repeated
606             lookups.
607              
608             =item 2
609              
610             Caching the will_handle status for each type that is encountered so
611             that it only need be checked by calling
612             handler-Ewill_handle($type) once.
613              
614             =back
615              
616             This does not lead to a major savings by itself (only 5-10%). In
617             combination with other optimizations, or for large parse jobs, the
618             savings good be significant.
619              
620             To test against the unoptimized version, remove the parentheses from
621             around the third term in the ternary " ? : " operator and add two
622             calls to $self-E_eventHandler().
623              
624             =cut
625              
626             sub _will_handle {
627 2688     2688   1966 my ($self,$type) = @_;
628 2688         2083 my $handler = $self->{'_handler'};
629             my $will_handle = defined($self->{'_will_handle_cache'}->{$type})
630             ? $self->{'_will_handle_cache'}->{$type}
631 2688 100       3912 : ($self->{'_will_handle_cache'}->{$type} =
632             $handler->will_handle($type));
633              
634 2688 50       5345 return $will_handle ? $handler : undef;
635             }
636              
637             1;