File Coverage

Bio/SearchIO/blasttable.pm
Criterion Covered Total %
statement 151 174 86.7
branch 55 76 72.3
condition 22 36 61.1
subroutine 17 21 80.9
pod 11 12 91.6
total 256 319 80.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SearchIO::blasttable
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SearchIO::blasttable - Driver module for SearchIO for parsing NCBI -m 8/9 format
17              
18             =head1 SYNOPSIS
19              
20             # do not use this module directly
21             use Bio::SearchIO;
22             my $parser = Bio::SearchIO->new(-file => $file,
23             -format => 'blasttable');
24              
25             while( my $result = $parser->next_result ) {
26             }
27              
28             =head1 DESCRIPTION
29              
30             This module will support parsing NCBI -m 8 or -m 9 tabular output
31             and WU-BLAST -mformat 2 or -mformat 3 tabular output.
32              
33             =head1 FEEDBACK
34              
35             =head2 Mailing Lists
36              
37             User feedback is an integral part of the evolution of this and other
38             Bioperl modules. Send your comments and suggestions preferably to
39             the Bioperl mailing list. Your participation is much appreciated.
40              
41             bioperl-l@bioperl.org - General discussion
42             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43              
44             =head2 Support
45              
46             Please direct usage questions or support issues to the mailing list:
47              
48             I
49              
50             rather than to the module maintainer directly. Many experienced and
51             reponsive experts will be able look at the problem and quickly
52             address it. Please include a thorough description of the problem
53             with code and data examples if at all possible.
54              
55             =head2 Reporting Bugs
56              
57             Report bugs to the Bioperl bug tracking system to help us keep track
58             of the bugs and their resolution. Bug reports can be submitted via
59             the web:
60              
61             https://github.com/bioperl/bioperl-live/issues
62              
63             =head1 AUTHOR - Jason Stajich
64              
65             Email jason-at-bioperl-dot-org
66              
67             =head1 APPENDIX
68              
69             The rest of the documentation details each of the object methods.
70             Internal methods are usually preceded with a _
71              
72             =cut
73              
74             # Let the code begin...
75              
76              
77             package Bio::SearchIO::blasttable;
78 2     2   13 use vars qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
  2         3  
  2         127  
79 2     2   12 use strict;
  2         2  
  2         41  
80 2     2   497 use Bio::Search::Result::ResultFactory;
  2         4  
  2         47  
81 2     2   531 use Bio::Search::Hit::HitFactory;
  2         4  
  2         46  
82 2     2   494 use Bio::Search::HSP::HSPFactory;
  2         5  
  2         181  
83              
84             $DefaultProgramName = 'BLASTN';
85             $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
86              
87             # mapping of terms to Bioperl hash keys
88             %MODEMAP = (
89             'Result' => 'result',
90             'Hit' => 'hit',
91             'Hsp' => 'hsp'
92             );
93              
94             %MAPPING = (
95             'Hsp_bit-score' => 'HSP-bits',
96             'Hsp_score' => 'HSP-score',
97             'Hsp_evalue' => 'HSP-evalue',
98             'Hsp_query-from' => 'HSP-query_start',
99             'Hsp_query-to' => 'HSP-query_end',
100             'Hsp_hit-from' => 'HSP-hit_start',
101             'Hsp_hit-to' => 'HSP-hit_end',
102             'Hsp_positive' => 'HSP-conserved',
103             'Hsp_identity' => 'HSP-identical',
104             'Hsp_mismatches' => 'HSP-mismatches',
105             'Hsp_qgapblocks' => 'HSP-query_gapblocks',
106             'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
107             'Hsp_gaps' => 'HSP-hsp_gaps',
108             'Hsp_hitgaps' => 'HSP-hit_gaps',
109             'Hsp_querygaps' => 'HSP-query_gaps',
110             'Hsp_align-len' => 'HSP-hsp_length',
111             'Hsp_query-frame'=> 'HSP-query_frame',
112             'Hsp_hit-frame' => 'HSP-hit_frame',
113              
114             'Hit_id' => 'HIT-name',
115             'Hit_len' => 'HIT-length',
116             'Hit_accession' => 'HIT-accession',
117             'Hit_def' => 'HIT-description',
118             'Hit_signif' => 'HIT-significance',
119             'Hit_score' => 'HIT-score',
120             'Hit_bits' => 'HIT-bits',
121              
122             'Result_program' => 'RESULT-algorithm_name',
123             'Result_version' => 'RESULT-algorithm_version',
124             'Result_query-def'=> 'RESULT-query_name',
125             'Result_query-len'=> 'RESULT-query_length',
126             'Result_query-acc'=> 'RESULT-query_accession',
127             'Result_querydesc'=> 'RESULT-query_description',
128             'Result_db' => 'RESULT-database_name',
129             'Result_db-len' => 'RESULT-database_entries',
130             'Result_db-let' => 'RESULT-database_letters',
131             );
132              
133 2     2   11 use base qw(Bio::SearchIO);
  2         2  
  2         3213  
134              
135             =head2 new
136              
137             Title : new
138             Usage : my $obj = Bio::SearchIO::blasttable->new();
139             Function: Builds a new Bio::SearchIO::blasttable object
140             Returns : an instance of Bio::SearchIO::blasttable
141             Args :
142              
143              
144             =cut
145              
146             sub _initialize {
147 11     11   37 my ($self,@args) = @_;
148 11         51 $self->SUPER::_initialize(@args);
149              
150 11         67 my ($pname) = $self->_rearrange([qw(PROGRAM_NAME)],
151             @args);
152 11   66     74 $self->program_name($pname || $DefaultProgramName);
153 11         47 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
154 11         36 $self->_eventHandler->register_factory('hit', Bio::Search::Hit::HitFactory->new(-type => 'Bio::Search::Hit::GenericHit'));
155 11         35 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::GenericHSP'));
156             }
157              
158              
159             =head2 next_result
160              
161             Title : next_result
162             Usage : my $result = $parser->next_result
163             Function: Parse the next result from the data stream
164             Returns : L
165             Args : none
166              
167              
168             =cut
169              
170             sub next_result{
171 12     12 1 457 my ($self) = @_;
172 12         21 my ($lastquery,$lasthit);
173 12         62 local $/ = "\n";
174 12         22 local $_;
175 12         24 my ($alg, $ver);
176 12         45 while( defined ($_ = $self->_readline) ) {
177             # WU-BLAST -mformat 3 only
178 766 100       1948 if(m{^#\s((?:\S+?)?BLAST[NPX])\s(\d+\.\d+.+\d{4}\])}) {
    100          
179 1         5 ($alg, $ver) = ($1, $2);
180             # only one header for whole file with WU-BLAST
181             # so $alg and $ver won't get set properly for
182             # each result
183 1 50       7 $self->program_name($alg) if $alg;
184 1 50       6 $self->element({'Name' => 'Result_version',
185             'Data' => $ver}) if $ver;
186 1         6 next;
187             }
188             # -m 9 only
189             elsif(m{^#\s+((?:\S+?)?BLAST[NPX])\s+(.+)}) {
190 1         6 ($alg, $ver) = ($1, $2);
191 1         4 next;
192             }
193 764 100 100     2891 next if /^#/ || /^\s*$/;
194              
195 686         2503 my @fields = split;
196 686 50       1133 next if @fields == 1;
197 686         1348 my ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
198             $qstart,$qend,$hstart,$hend,$evalue,$bits);
199             # WU-BLAST-specific
200 686         0 my ($num_scores, $raw_score, $identities, $positives, $percent_pos,
201             $qgap_blocks,$qgaps, $sgap_blocks, $sgaps, $qframe,
202             $sframe);
203             # NCBI -m8 and -m9
204 686 100 33     1238 if (@fields == 12) {
    100          
    50          
205 8         27 ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
206             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
207             # NCBI -m8 and -m9, v 2.2.18+
208             } elsif (@fields == 13) {
209 675         1704 ($qname, $hname, $percent_id, $percent_pos, $hsp_len, $mismatches, $gapsm,
210             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
211             }
212             # WU-BLAST -mformat 2 and 3
213             elsif ((@fields == 22) or (@fields == 24)) {
214 3         11 ($qname,$hname,$evalue,$num_scores, $bits, $raw_score, $hsp_len,
215             $identities, $positives,$mismatches, $percent_id, $percent_pos,
216             $qgap_blocks, $qgaps, $sgap_blocks, $sgaps, $qframe, $qstart,
217             $qend, $sframe, $hstart,$hend,) = @fields;
218             # we need total gaps in the alignment
219 3         7 $gapsm=$qgaps+$sgaps;
220             }
221              
222 686 100 100     1757 if (@fields == 12 || @fields == 13) {
223             # need to determine total gaps in the alignment for NCBI output
224             # since NCBI reports number of gapopens and NOT total gaps
225 683         1571 my $qlen = abs($qstart - $qend) + 1;
226 683         923 my $querygaps = $hsp_len - $qlen;
227 683         807 my $hlen = abs($hstart - $hend) + 1;
228 683         854 my $hitgaps = $hsp_len - $hlen;
229 683         934 $gapsm = $querygaps + $hitgaps;
230             }
231              
232             # Remember Jim's code is 0 based
233 686 100 100     2562 if( defined $lastquery &&
    100          
    100          
234             $lastquery ne $qname ) {
235 4         21 $self->end_element({'Name' => 'Hit'});
236 4         56 $self->end_element({'Name' => 'Result'});
237 4         29 $self->_pushback($_);
238 4         16 return $self->end_document;
239             } elsif( ! defined $lastquery ) {
240 11         26 $self->{'_result_count'}++;
241 11         50 $self->start_element({'Name' => 'Result'});
242 11   66     48 $self->element({'Name' => 'Result_program',
243             'Data' => $alg || $self->program_name});
244 11 100       36 $self->element({'Name' => 'Result_version',
245             'Data' => $ver}) if $ver;
246 11         40 $self->element({'Name' => 'Result_query-def',
247             'Data' => $qname});
248 11         37 $self->start_element({'Name' => 'Hit'});
249 11         46 $self->element({'Name' => 'Hit_id',
250             'Data' => $hname});
251             # we'll store the 1st hsp bits as the hit bits
252 11         46 $self->element({'Name' => 'Hit_bits',
253             'Data' => $bits});
254             # we'll store the 1st hsp value as the hit evalue
255 11         39 $self->element({'Name' => 'Hit_signif',
256             'Data' => $evalue});
257            
258             } elsif( $lasthit ne $hname ) {
259 640 50       1307 if( $self->in_element('hit') ) {
260 640         1678 $self->end_element({'Name' => 'Hit'});
261             }
262 640         1919 $self->start_element({'Name' => 'Hit'});
263 640         1930 $self->element({'Name' => 'Hit_id',
264             'Data' => $hname});
265             # we'll store the 1st hsp bits as the hit bits
266 640         2003 $self->element({'Name' => 'Hit_bits',
267             'Data' => $bits});
268             # we'll store the 1st hsp value as the hit evalue
269 640         2007 $self->element({'Name' => 'Hit_signif',
270             'Data' => $evalue});
271             }
272 682         1592 my $identical = $hsp_len - $mismatches - $gapsm;
273             # If $positives value is absent, try to recover it from $percent_pos,
274             # this is better than letting the program to assume "conserved == identical"
275 682 100 100     1923 if (not defined $positives and defined $percent_pos) {
276 671         2503 $positives = sprintf "%d", ($percent_pos * $hsp_len / 100);
277             }
278 682         1673 $self->start_element({'Name' => 'Hsp'});
279 682         1952 $self->element({'Name' => 'Hsp_evalue',
280             'Data' => $evalue});
281 682         2128 $self->element({'Name' => 'Hsp_bit-score',
282             'Data' => $bits});
283 682         1987 $self->element({'Name' => 'Hsp_identity',
284             'Data' => $identical});
285 682         2051 $self->element({'Name' => 'Hsp_positive',
286             'Data' => $positives});
287 682         2106 $self->element({'Name' => 'Hsp_gaps',
288             'Data' => $gapsm});
289 682         2230 $self->element({'Name' => 'Hsp_query-from',
290             'Data' => $qstart});
291 682         2065 $self->element({'Name' => 'Hsp_query-to',
292             'Data' => $qend});
293              
294 682         2060 $self->element({'Name' => 'Hsp_hit-from',
295             'Data' => $hstart });
296 682         2057 $self->element({'Name' => 'Hsp_hit-to',
297             'Data' => $hend });
298 682         2068 $self->element({'Name' => 'Hsp_align-len',
299             'Data' => $hsp_len});
300 682         1795 $self->end_element({'Name' => 'Hsp'});
301 682         1093 $lastquery = $qname;
302 682         2954 $lasthit = $hname;
303             }
304             # fencepost
305 8 100 66     47 if( defined $lasthit && defined $lastquery ) {
306 7 50       18 if( $self->in_element('hit') ) {
307 7         25 $self->end_element({'Name' => 'Hit'});
308             }
309 7         27 $self->end_element({'Name' => 'Result'});
310 7         27 return $self->end_document;
311             }
312             }
313              
314             =head2 start_element
315              
316             Title : start_element
317             Usage : $eventgenerator->start_element
318             Function: Handles a start element event
319             Returns : none
320             Args : hashref with at least 2 keys 'Data' and 'Name'
321              
322              
323             =cut
324              
325             sub start_element{
326 10142     10142 1 10767 my ($self,$data) = @_;
327             # we currently don't care about attributes
328 10142         10878 my $nm = $data->{'Name'};
329 10142 100       14451 if( my $type = $MODEMAP{$nm} ) {
330 1344         2374 $self->_mode($type);
331 1344 50       1769 if( $self->_will_handle($type) ) {
332 1344         2823 my $func = sprintf("start_%s",lc $type);
333 1344         2551 $self->_eventHandler->$func($data->{'Attributes'});
334             }
335 1344         1908 unshift @{$self->{'_elements'}}, $type;
  1344         2790  
336             }
337 10142 100       15279 if($nm eq 'Result') {
338 11         24 $self->{'_values'} = {};
339 11         24 $self->{'_result'}= undef;
340 11         26 $self->{'_mode'} = '';
341             }
342              
343             }
344              
345             =head2 end_element
346              
347             Title : start_element
348             Usage : $eventgenerator->end_element
349             Function: Handles an end element event
350             Returns : none
351             Args : hashref with at least 2 keys 'Data' and 'Name'
352              
353              
354             =cut
355              
356             sub end_element {
357 10142     10142 1 10978 my ($self,$data) = @_;
358 10142         10567 my $nm = $data->{'Name'};
359 10142         9253 my $rc;
360             # Hsp are sort of weird, in that they end when another
361             # object begins so have to detect this in end_element for now
362            
363 10142 100       17558 if( my $type = $MODEMAP{$nm} ) {
    50          
364 1344 50       1947 if( $self->_will_handle($type) ) {
365 1344         3515 my $func = sprintf("end_%s",lc $type);
366             $rc = $self->_eventHandler->$func($self->{'_reporttype'},
367 1344         2778 $self->{'_values'});
368             }
369 1344         1459 shift @{$self->{'_elements'}};
  1344         1770  
370              
371             } elsif( $MAPPING{$nm} ) {
372 8798 50       11674 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
373 0         0 my $key = (keys %{$MAPPING{$nm}})[0];
  0         0  
374 0         0 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
375             } else {
376 8798         14088 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
377             }
378             } else {
379 0         0 $self->warn( "unknown nm $nm ignoring\n");
380             }
381 10142         10778 $self->{'_last_data'} = ''; # remove read data if we are at
382             # end of an element
383 10142 100       13459 $self->{'_result'} = $rc if( $nm eq 'Result' );
384 10142         12579 return $rc;
385              
386             }
387              
388             =head2 element
389              
390             Title : element
391             Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
392             Function: Convience method that calls start_element, characters, end_element
393             Returns : none
394             Args : Hash ref with the keys 'Name' and 'Data'
395              
396              
397             =cut
398              
399             sub element{
400 8798     8798 1 9978 my ($self,$data) = @_;
401 8798         12382 $self->start_element($data);
402 8798         12988 $self->characters($data);
403 8798         11571 $self->end_element($data);
404             }
405              
406              
407             =head2 characters
408              
409             Title : characters
410             Usage : $eventgenerator->characters($str)
411             Function: Send a character events
412             Returns : none
413             Args : string
414              
415              
416             =cut
417              
418             sub characters{
419 8798     8798 1 8956 my ($self,$data) = @_;
420              
421             # deep bug fix: set $self->{'_last_data'} to undef if $$data{Data} is
422             # a valid slot, whose value is undef --
423             # allows an undef to be propagated to object constructors and
424             # handled there as desired; in particular, when Hsp_postive => -conserved
425             # is not defined (in BLASTN, e.g.), the value of hsp's {CONSERVED} property is
426             # set to the value of {IDENTICAL}.
427             #/maj
428             # return unless ( defined $data->{'Data'} );
429 8798 50       27026 return unless ( grep /Data/, keys %$data );
430 8798 100       14350 if ( !defined $data->{'Data'} ) {
431 8         9 $self->{'_last_data'} = undef;
432 8         10 return;
433             }
434 8790 50       14317 if( $data->{'Data'} =~ /^\s+$/ ) {
435 0 0       0 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
436             }
437              
438 8790 50 66     11595 if( $self->in_element('hsp') &&
439             $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
440            
441 0         0 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
442             }
443            
444 8790         14261 $self->{'_last_data'} = $data->{'Data'};
445             }
446              
447             =head2 _mode
448              
449             Title : _mode
450             Usage : $obj->_mode($newval)
451             Function:
452             Example :
453             Returns : value of _mode
454             Args : newvalue (optional)
455              
456              
457             =cut
458              
459             sub _mode{
460 1344     1344   1736 my ($self,$value) = @_;
461 1344 50       1879 if( defined $value) {
462 1344         1461 $self->{'_mode'} = $value;
463             }
464 1344         1539 return $self->{'_mode'};
465             }
466              
467             =head2 within_element
468              
469             Title : within_element
470             Usage : if( $eventgenerator->within_element($element) ) {}
471             Function: Test if we are within a particular element
472             This is different than 'in' because within can be tested
473             for a whole block.
474             Returns : boolean
475             Args : string element name
476              
477              
478             =cut
479              
480             sub within_element{
481 0     0 1 0 my ($self,$name) = @_;
482             return 0 if ( ! defined $name &&
483             ! defined $self->{'_elements'} ||
484 0 0 0     0 scalar @{$self->{'_elements'}} == 0) ;
  0   0     0  
485 0         0 foreach ( @{$self->{'_elements'}} ) {
  0         0  
486 0 0       0 if( $_ eq $name ) {
487 0         0 return 1;
488             }
489             }
490 0         0 return 0;
491             }
492              
493             =head2 in_element
494              
495             Title : in_element
496             Usage : if( $eventgenerator->in_element($element) ) {}
497             Function: Test if we are in a particular element
498             This is different than 'in' because within can be tested
499             for a whole block.
500             Returns : boolean
501             Args : string element name
502              
503              
504             =cut
505              
506             sub in_element{
507 9437     9437 1 11721 my ($self,$name) = @_;
508 9437 100       14443 return 0 if ! defined $self->{'_elements'}->[0];
509 9436         32708 return ( $self->{'_elements'}->[0] eq $name)
510             }
511              
512              
513             =head2 start_document
514              
515             Title : start_document
516             Usage : $eventgenerator->start_document
517             Function: Handles a start document event
518             Returns : none
519             Args : none
520              
521              
522             =cut
523              
524             sub start_document{
525 0     0 1 0 my ($self) = @_;
526 0         0 $self->{'_lasttype'} = '';
527 0         0 $self->{'_values'} = {};
528 0         0 $self->{'_result'}= undef;
529 0         0 $self->{'_mode'} = '';
530 0         0 $self->{'_elements'} = [];
531             }
532              
533              
534             =head2 end_document
535              
536             Title : end_document
537             Usage : $eventgenerator->end_document
538             Function: Handles an end document event
539             Returns : Bio::Search::Result::ResultI object
540             Args : none
541              
542              
543             =cut
544              
545             sub end_document{
546 11     11 1 26 my ($self,@args) = @_;
547 11         118 return $self->{'_result'};
548             }
549              
550             =head2 result_count
551              
552             Title : result_count
553             Usage : my $count = $searchio->result_count
554             Function: Returns the number of results we have processed
555             Returns : integer
556             Args : none
557              
558              
559             =cut
560              
561             sub result_count {
562 0     0 1 0 my $self = shift;
563 0         0 return $self->{'_result_count'};
564             }
565              
566 0     0 0 0 sub report_count { shift->result_count }
567              
568              
569             =head2 program_name
570              
571             Title : program_name
572             Usage : $obj->program_name($newval)
573             Function: Get/Set the program name
574             Returns : value of program_name (a scalar)
575             Args : on set, new value (a scalar or undef, optional)
576              
577              
578             =cut
579              
580             sub program_name{
581 21     21 1 40 my $self = shift;
582              
583 21 100       57 $self->{'program_name'} = shift if @_;
584 21   33     90 return $self->{'program_name'} || $DefaultProgramName;
585             }
586              
587              
588             =head2 _will_handle
589              
590             Title : _will_handle
591             Usage : Private method. For internal use only.
592             if( $self->_will_handle($type) ) { ... }
593             Function: Provides an optimized way to check whether or not an element of a
594             given type is to be handled.
595             Returns : Reference to EventHandler object if the element type is to be handled.
596             undef if the element type is not to be handled.
597             Args : string containing type of element.
598              
599             Optimizations:
600              
601             =over 2
602              
603             =item 1
604              
605             Using the cached pointer to the EventHandler to minimize repeated
606             lookups.
607              
608             =item 2
609              
610             Caching the will_handle status for each type that is encountered so
611             that it only need be checked by calling
612             handler-Ewill_handle($type) once.
613              
614             =back
615              
616             This does not lead to a major savings by itself (only 5-10%). In
617             combination with other optimizations, or for large parse jobs, the
618             savings good be significant.
619              
620             To test against the unoptimized version, remove the parentheses from
621             around the third term in the ternary " ? : " operator and add two
622             calls to $self-E_eventHandler().
623              
624             =cut
625              
626             sub _will_handle {
627 2688     2688   3078 my ($self,$type) = @_;
628 2688         2910 my $handler = $self->{'_handler'};
629             my $will_handle = defined($self->{'_will_handle_cache'}->{$type})
630             ? $self->{'_will_handle_cache'}->{$type}
631 2688 100       4575 : ($self->{'_will_handle_cache'}->{$type} =
632             $handler->will_handle($type));
633              
634 2688 50       5261 return $will_handle ? $handler : undef;
635             }
636              
637             1;