File Coverage

Bio/SearchIO/infernal.pm
Criterion Covered Total %
statement 475 491 96.7
branch 250 310 80.6
condition 77 118 65.2
subroutine 28 29 96.5
pod 21 21 100.0
total 851 969 87.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SearchIO::infernal
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Fields
7             #
8             # Copyright Chris Fields
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SearchIO::infernal - SearchIO-based Infernal parser
17              
18             =head1 SYNOPSIS
19              
20             my $parser = Bio::SearchIO->new(-format => 'infernal',
21             -file => 'purine.inf');
22             while( my $result = $parser->next_result ) {
23             # general result info, such as model used, Infernal version
24             while( my $hit = $result->next_hit ) {
25             while( my $hsp = $hit->next_hsp ) {
26             # ...
27             }
28             }
29             }
30              
31             =head1 DESCRIPTION
32              
33             This is a SearchIO-based parser for Infernal output from the cmsearch program.
34             It currently parses cmsearch output for Infernal versions 0.7-1.1; older
35             versions may work but will not be supported.
36              
37             The latest version of Infernal is 1.1. The output has changed substantially
38             relative to version 1.0. Versions 1.x are stable releases (and output has
39             stabilized) therefore it is highly recommended that users upgrade to using
40             the latest Infernal release. Support for the older pre-v.1 developer releases
41             will be dropped for future core 1.6 releases.
42              
43             =head1 FEEDBACK
44              
45             =head2 Mailing Lists
46              
47             User feedback is an integral part of the evolution of this and other
48             Bioperl modules. Send your comments and suggestions preferably to
49             the Bioperl mailing list. Your participation is much appreciated.
50              
51             bioperl-l@bioperl.org - General discussion
52             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53              
54             =head2 Support
55              
56             Please direct usage questions or support issues to the mailing list:
57              
58             I
59              
60             rather than to the module maintainer directly. Many experienced and
61             reponsive experts will be able look at the problem and quickly
62             address it. Please include a thorough description of the problem
63             with code and data examples if at all possible.
64              
65             =head2 Reporting Bugs
66              
67             Report bugs to the Bioperl bug tracking system to help us keep track
68             of the bugs and their resolution. Bug reports can be submitted via the
69             web:
70              
71             https://github.com/bioperl/bioperl-live/issues
72              
73             =head1 AUTHOR - Chris Fields
74              
75             Email cjfields-at-uiuc-dot-edu
76              
77             =head1 CONTRIBUTORS
78              
79             Jeffrey Barrick, Michigan State University
80             Paul Cantalupo, University of Pittsburgh
81              
82             =head1 APPENDIX
83              
84             The rest of the documentation details each of the object methods.
85             Internal methods are usually preceded with a _
86              
87             =cut
88              
89             # Let the code begin...
90              
91             package Bio::SearchIO::infernal;
92 1     1   3 use strict;
  1         1  
  1         26  
93              
94 1     1   4 use Data::Dumper;
  1         1  
  1         54  
95 1     1   4 use base qw(Bio::SearchIO);
  1         1  
  1         4944  
96              
97             our %MODEMAP = (
98             'Result' => 'result',
99             'Hit' => 'hit',
100             'Hsp' => 'hsp'
101             );
102              
103             our %MAPPING = (
104             'Hsp_bit-score' => 'HSP-bits',
105             'Hsp_score' => 'HSP-score',
106             'Hsp_evalue' => 'HSP-evalue', # evalues only in v0.81, optional
107             'Hsp_pvalue' => 'HSP-pvalue', # pvalues only in v0.81, optional
108             'Hsp_query-from' => 'HSP-query_start',
109             'Hsp_query-to' => 'HSP-query_end',
110             'Hsp_query-strand'=> 'HSP-query_strand',
111             'Hsp_hit-from' => 'HSP-hit_start',
112             'Hsp_hit-to' => 'HSP-hit_end',
113             'Hsp_hit-strand' => 'HSP-hit_strand',
114             'Hsp_gaps' => 'HSP-hsp_gaps',
115             'Hsp_hitgaps' => 'HSP-hit_gaps',
116             'Hsp_querygaps' => 'HSP-query_gaps',
117             'Hsp_qseq' => 'HSP-query_seq',
118             'Hsp_ncline' => 'HSP-nc_seq',
119             'Hsp_hseq' => 'HSP-hit_seq',
120             'Hsp_midline' => 'HSP-homology_seq',
121             'Hsp_pline' => 'HSP-pp_seq',
122             'Hsp_structure' => 'HSP-meta',
123             'Hsp_align-len' => 'HSP-hsp_length',
124             'Hsp_stranded' => 'HSP-stranded',
125              
126             'Hit_id' => 'HIT-name',
127             'Hit_len' => 'HIT-length',
128             'Hit_gi' => 'HIT-ncbi_gi',
129             'Hit_accession' => 'HIT-accession',
130             'Hit_desc' => 'HIT-description',
131             'Hit_def' => 'HIT-description',
132             'Hit_signif' => 'HIT-significance', # evalues in v1.1 and v0.81, optional
133             'Hit_p' => 'HIT-p', # pvalues only in 1.0, optional
134             'Hit_score' => 'HIT-score', # best HSP bit score (in v1.1, the only HSP bit score)
135             'Hit_bits' => 'HIT-bits', # best HSP bit score (ditto)
136              
137             'Infernal_program' => 'RESULT-algorithm_name', # get/set
138             'Infernal_version' => 'RESULT-algorithm_version', # get/set
139             'Infernal_query-def'=> 'RESULT-query_name', # get/set
140             'Infernal_query-len'=> 'RESULT-query_length',
141             'Infernal_query-acc'=> 'RESULT-query_accession', # get/set
142             'Infernal_querydesc'=> 'RESULT-query_description', # get/set
143             'Infernal_cm' => 'RESULT-cm_name',
144             'Infernal_db' => 'RESULT-database_name', # get/set
145             'Infernal_db-len' => 'RESULT-database_entries', # in v1.1 only
146             'Infernal_db-let' => 'RESULT-database_letters', # in v1.1 only
147             );
148              
149             my $MINSCORE = 0;
150             my $DEFAULT_ALGORITHM = 'cmsearch';
151             my $DEFAULT_VERSION = '1.1';
152              
153             my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown gap);
154             my %STRUCTURE_SYMBOLS = (
155             '5-prime' => '<',
156             '3-prime' => '>',
157             'single-strand' => ':',
158             'unknown' => '?',
159             'gap' => '.'
160             );
161              
162             =head2 new
163              
164             Title : new
165             Usage : my $obj = Bio::SearchIO::infernal->new();
166             Function: Builds a new Bio::SearchIO::infernal object
167             Returns : Bio::SearchIO::infernal
168             Args : -fh/-file => cmsearch (infernal) filename
169             -format => 'infernal'
170             -model => query model (Rfam ID) (default undef)
171             -database => database name (default undef)
172             -query_acc => query accession, eg. Rfam accession RF####
173             -query_desc => query description, eg. Rfam description
174             -hsp_minscore => minimum HSP score cutoff
175             -convert_meta => boolean, set to convert meta string to simple WUSS format
176             -symbols => hash ref of structure symbols to use
177             (default symbols in %STRUCTURE_SYMBOLS hash)
178              
179             =cut
180              
181             sub _initialize {
182 7     7   22 my ( $self, @args ) = @_;
183 7         24 $self->SUPER::_initialize(@args);
184 7         44 my ($model, $database, $convert, $symbols, $cutoff,
185             $desc, $accession, $algorithm, $version) =
186             $self->_rearrange([qw(MODEL
187             DATABASE
188             CONVERT_META
189             SYMBOLS
190             HSP_MINSCORE
191             QUERY_DESC
192             QUERY_ACC
193             ALGORITHM
194             VERSION)],@args);
195 7         32 my $handler = $self->_eventHandler;
196 7         33 $handler->register_factory(
197             'result',
198             Bio::Factory::ObjectFactory->new(
199             -type => 'Bio::Search::Result::INFERNALResult',
200             -interface => 'Bio::Search::Result::ResultI',
201             -verbose => $self->verbose
202             )
203             );
204              
205 7         16 $handler->register_factory(
206             'hit',
207             Bio::Factory::ObjectFactory->new(
208             -type => 'Bio::Search::Hit::ModelHit',
209             -interface => 'Bio::Search::Hit::HitI',
210             -verbose => $self->verbose
211             )
212             );
213              
214 7         13 $handler->register_factory(
215             'hsp',
216             Bio::Factory::ObjectFactory->new(
217             -type => 'Bio::Search::HSP::ModelHSP',
218             -interface => 'Bio::Search::HSP::HSPI',
219             -verbose => $self->verbose
220             )
221             );
222              
223 7 100       13 defined $model && $self->model($model);
224 7 100       28 defined $database && $self->database($database);
225 7 100       31 defined $accession && $self->query_accession($accession);
226 7 100       22 defined $convert && $self->convert_meta($convert);
227 7 100       21 defined $desc && $self->query_description($desc);
228              
229 7   66     23 $version ||= $DEFAULT_VERSION;
230 7         23 $self->version($version);
231 7   100     26 $symbols ||= \%STRUCTURE_SYMBOLS;
232 7         19 $self->structure_symbols($symbols);
233 7   66     22 $cutoff ||= $MINSCORE;
234 7         17 $self->hsp_minscore($cutoff);
235 7   33     26 $algorithm ||= $DEFAULT_ALGORITHM;
236 7         17 $self->algorithm($algorithm);
237             }
238              
239             =head2 next_result
240              
241             Title : next_result
242             Usage : my $hit = $searchio->next_result;
243             Function: Returns the next Result from a search
244             Returns : Bio::Search::Result::ResultI object
245             Args : none
246              
247             =cut
248              
249             sub next_result {
250 9     9 1 485 my ($self) = @_;
251 9 100       24 unless (exists $self->{'_handlerset'}) {
252 7         8 my $line;
253 7         25 while ($line = $self->_readline) {
254             # advance to first line
255 9 100       45 next if $line =~ m{^\s*$};
256             # newer output starts with model name
257 7 100       37 if ($line =~ m{^\#\s+cmsearch\s}) {
    100          
258 4         9 my $secondline = $self->_readline;
259 4 100       15 if ($secondline =~ m{INFERNAL 1\.1}) {
260 3         6 $self->{'_handlerset'} = '1.1';
261             }
262             else {
263 1         3 $self->{'_handlerset'} = 'latest'; # v1.0
264             }
265 4         13 $self->_pushback($secondline);
266             }
267             elsif ($line =~ m{^CM\s\d+:}) {
268 1         3 $self->{'_handlerset'} = 'pre-1.0';
269             }
270             else {
271 2         6 $self->{'_handlerset'} ='old';
272             }
273 7         12 last;
274             }
275 7         18 $self->_pushback($line);
276             #if ($self->{'_handlerset'} ne '1.0') {
277             # $self->deprecated(
278             # -message => "Parsing of Infernal pre-1.0 release is deprecated;\n".
279             # "upgrading to Infernal 1.0 or above is highly recommended",
280             # -version => 1.007);
281             #}
282             }
283             return ($self->{'_handlerset'} eq '1.1') ? $self->_parse_v1_1 :
284             ($self->{'_handlerset'} eq 'latest') ? $self->_parse_latest :
285 9 100       50 ($self->{'_handlerset'} eq 'pre-1.0') ? $self->_parse_pre :
    100          
    100          
286             $self->_parse_old;
287             }
288              
289              
290             sub _parse_v1_1 {
291 5     5   10 my ($self) = @_;
292 5         6 my $seentop = 0;
293 5         19 local $/ = "\n";
294 5         17 my ($accession, $description) = ($self->query_accession, $self->query_description);
295 5         4 my ($buffer, $last, %modelcounter, @hit_list, %hitindex,
296             @hsp_list, %hspindex);
297 5         13 $self->start_document();
298 5         17 $buffer = $self->_readline;
299 5 100 66     32 if ( !defined $buffer || $buffer =~ m/^\[ok/ ) { # end of report
300 1         6 return undef;
301             }
302             else {
303 4         10 $self->_pushback($buffer);
304             }
305              
306             PARSER: # Parse each line of report
307 4         11 while ( defined( $buffer = $self->_readline ) ) {
308 52         40 my $hit_counter = 0;
309 52         41 my $lineorig = $buffer;
310 52         49 chomp $buffer;
311              
312             # INFERNAL program name
313 52 100 66     357 if ( $buffer =~ m/^\#\s(\S+)\s\:\:\s/ ) {
    100 66        
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
314 3         7 $seentop = 1;
315 3         6 my $prog = $1;
316 3         16 $self->start_element( { 'Name' => 'Result' } );
317 3         14 $self->element_hash( { 'Infernal_program' => uc($prog) } );
318             }
319              
320             # INFERNAL version and release date
321             elsif ( $buffer =~ m/^\#\sINFERNAL\s+(\S+)\s+\((.+)\)/ ) {
322 3         5 my $version = $1;
323 3         6 my $versiondate = $2;
324 3         6 $self->{'_cmidline'} = $buffer;
325 3         9 $self->element_hash( { 'Infernal_version' => $version } );
326             }
327              
328             # Query info
329             elsif ( $buffer =~ /^\#\squery (?:\w+ )?file\:\s+(\S+)/ ) {
330 3         9 $self->{'_cmfileline'} = $lineorig;
331 3         13 $self->element_hash( { 'Infernal_cm' => $1 } );
332             }
333              
334             # Database info
335             elsif ( $buffer =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) {
336 3         5 $self->{'_cmseqline'} = $lineorig;
337 3         15 $self->element_hash( { 'Infernal_db' => $1 } );
338             }
339              
340             # Query data
341             elsif ( $buffer =~ m/^Query:\s+(\S+)\s+\[CLEN=(\d+)\]$/ ) {
342 4         29 $self->element_hash( { 'Infernal_query-def' => $1,
343             'Infernal_query-len' => $2,
344             'Infernal_query-acc' => $accession,
345             'Infernal_querydesc' => $description,
346             } );
347             }
348              
349             # Get query accession
350             elsif ( $buffer =~ s/^Accession:\s+// && ! $accession) {
351 3         6 $buffer =~ s/\s+$//;
352 3         11 $self->element_hash( { 'Infernal_query-acc' => $buffer } );
353             }
354              
355             # Get query description
356             elsif ( $buffer =~ s/^Description:\s+// && ! $description) {
357 2         5 $buffer =~ s/\s+$//;
358 2         6 $self->element_hash( { 'Infernal_querydesc' => $buffer } );
359             }
360              
361             # Process hit table - including those below inclusion threshold
362             elsif ( $buffer =~ m/^Hit scores:/) {
363 4         8 @hit_list = (); # here is case there are multi-query reports
364 4         11 while ( defined( $buffer = $self->_readline ) ) {
365 38 100 100     222 if ( $buffer =~ m/^Hit alignments:/ ) {
    100 100        
      100        
366 4         8 $self->_pushback($buffer);
367 4         4 last;
368             }
369             elsif ( $buffer =~ m/^\s+rank\s+E-value/
370             || $buffer =~ m/\-\-\-/
371             || $buffer =~ m/^$/
372             || $buffer =~ m/No hits detected/ ) {
373 19         28 next;
374             }
375              
376             # Process hit
377 15         11 $hit_counter++;
378 15         76 my ($rank, $threshold, $eval, $score,
379             $bias, $hitid, $start, $end, $strand,
380             $mdl, $truc, $gc, @desc) = split( " ", $buffer );
381 15         24 my $desc = join " ", @desc;
382 15 50       26 $desc = '' if ( !defined($desc) );
383              
384 15         21 push @hit_list, [ $hitid, $desc, $eval, $score ];
385 15         44 $hitindex{ $hitid.$hit_counter } = $#hit_list;
386             }
387             }
388              
389             # Process hit alignments
390             elsif ( $buffer =~ /^Hit alignments:/ ) {
391 4         5 my $hitid;
392 4         4 my $align_counter = 0;
393 4         7 while ( defined( $buffer = $self->_readline ) ) {
394 38 100       67 if ( $buffer =~ /^Internal CM pipeline statistics summary/ ) {
395 4         10 $self->_pushback($buffer);
396 4         6 last;
397             }
398 34 100       92 if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) { # defline of hit
    100          
399 15         24 $hitid = $1;
400 15         18 my $desc = $2;
401 15         14 $align_counter++;
402 15         17 my $hitid_alignctr = $hitid.$align_counter;
403 15         24 $modelcounter{$hitid_alignctr} = 0;
404              
405             # The Hit Description from the Hit table can be truncated if
406             # it is too long, so use the '>>' line description instead
407 15         22 $hit_list[ $hitindex{$hitid_alignctr} ][1] = $desc;
408              
409             # Process hit information table
410 15         24 while ( defined( $buffer = $self->_readline ) ) {
411 75 100 66     541 if ( $buffer =~ m/^Internal CM pipeline statistics/
    100 66        
      100        
      100        
      66        
412             || $buffer =~ m/NC$/
413             || $buffer =~ m/^\>\>/ ) {
414 15         26 $self->_pushback($buffer);
415 15         25 last;
416             }
417             elsif ( $buffer =~ m/^\s+rank\s+E-value/
418             || $buffer =~ m/^\s----/
419             || $buffer =~ m/^$/
420             || $buffer =~ m/No hits detected/ ) {
421 45         66 next;
422             }
423              
424             # Get hsp data from table, push into @hsp;
425 15         72 my ( $rank, $threshold, $eval,
426             $score, $bias, $model,
427             $cm_start, $cm_stop, $cm_cov,
428             $seq_start, $seq_stop, $seq_strand, $seq_cov,
429             $acc, $trunc, $gc,
430             ) = split( " ", $buffer );
431              
432             # Try to get the Hit Length from the alignment information.
433             # For cmsearch, if sequence coverage ends in ']' it means that the
434             # alignment runs full-length flush to the end of the target.
435 15 100       39 my $hitlength = ( $seq_cov =~ m/\]$/ ) ? $seq_stop : 0;
436              
437 15         25 my $tmphit = $hit_list[ $hitindex{$hitid_alignctr} ];
438 15 50       23 if ( !defined $tmphit ) {
439 0         0 $self->warn("Incomplete information: can't find HSP $hitid in list of hits\n");
440 0         0 next;
441             }
442              
443 15         37 push @hsp_list, [ $hitid,
444             $cm_start, $cm_stop,
445             $seq_start, $seq_stop,
446             $score, $eval,
447             $hitlength];
448 15         17 $modelcounter{$hitid_alignctr}++;
449 15         25 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
450 15         42 $hspindex{$hsp_key} = $#hsp_list;
451             }
452             }
453             elsif ( $buffer =~ m/NC$/ ) { # start of HSP
454             # need CS line to get number of spaces before structure data
455 15         23 my $csline = $self->_readline;
456 15         35 $csline =~ m/^(\s+)\S+ CS$/;
457 15         29 my $offset = length($1);
458 15         24 $self->_pushback($csline);
459 15         20 $self->_pushback($buffer); # set up for loop
460              
461 15         13 my ($ct, $strln) = 0;
462 15         12 my $hspdata;
463 15         48 HSP:
464             my %hspline = ('0' => 'nc', '1' => 'meta',
465             '2' => 'query', '3' => 'midline',
466             '4' => 'hit', '5' => 'pp');
467             HSP:
468 15         24 while (defined ($buffer = $self->_readline)) {
469 168         129 chomp $buffer;
470 168 100 100     705 if ( $buffer =~ /^>>\s/
    100 100        
471             || $buffer =~ /^Internal CM pipeline statistics/) {
472 15         24 $self->_pushback($buffer);
473 15         16 last HSP;
474             }
475             elsif ( $ct % 6 == 0 && $buffer =~ /^$/ ) {
476 27         41 next;
477             }
478 126         83 my $iterator = $ct % 6;
479             # NC line ends with ' NC' so remove these from the strlen count
480 126 100       136 $strln = ( length($buffer) - 3 ) if $iterator == 0;
481 126         129 my $data = substr($buffer, $offset, $strln-$offset);
482 126         212 $hspdata->{ $hspline{$iterator} } .= $data;
483              
484 126         184 $ct++;
485             } # 'HSP' while loop
486              
487 15         12 my $strlen = 0;
488             # catch any insertions and add them into the actual length
489 15         41 while ($hspdata->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
490 2         10 $strlen += $1;
491             }
492             # add on the actual residues
493 15         22 $strlen += $hspdata->{'query'} =~ tr{A-Za-z}{A-Za-z};
494             my $metastr = ($self->convert_meta) ? ($self->simple_meta($hspdata->{'meta'})) :
495 15 50       28 $hspdata->{'meta'};
496              
497 15         20 my $hitid_alignctr = $hitid . $align_counter;
498 15         24 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
499 15         17 my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
500             push (@$hsp, $hspdata->{'nc'}, $metastr,
501             $hspdata->{'query'}, $hspdata->{'midline'},
502 15         73 $hspdata->{'hit'}, $hspdata->{'pp'});
503             }
504             }
505             } # end of 'Hit alignments:' section of report
506              
507             # Process internal pipeline stats (end of report)
508             elsif ( $buffer =~ m/Internal CM pipeline statistics summary:/ ) {
509 4         10 while ( defined( $buffer = $self->_readline ) ) {
510 68 100       99 last if ( $buffer =~ m!^//! );
511              
512 64 100       113 if ( $buffer =~ /^Target sequences:\s+(\d+)\s+\((\d+) residues/ ) {
513 4         24 $self->element_hash( { 'Infernal_db-len' => $1,
514             'Infernal_db-let' => $2, } );
515             }
516             }
517 4         9 last; # of the outer while defined $self->readline
518             }
519              
520             # Leftovers
521             else {
522             #print STDERR "Missed line: $buffer\n";
523 19         37 $self->debug($buffer);
524             }
525 48         102 $last = $buffer;
526             } # PARSER end
527              
528             # Final processing of hits and hsps
529 4         5 my $hit_counter = 0;
530 4         9 foreach my $hit ( @hit_list ) {
531 15         15 $hit_counter++;
532 15         42 my ($hit_name, $hit_desc, $hit_signif, $hit_score) = @$hit;
533 15   50     42 my $num_hsp = $modelcounter{$hit_name . $hit_counter} || 0;
534              
535 15         40 $self->start_element( { 'Name' => 'Hit' } );
536 15         64 $self->element_hash( {'Hit_id' => $hit_name,
537             'Hit_desc' => $hit_desc,
538             'Hit_signif'=> $hit_signif,
539             'Hit_score' => $hit_score,
540             'Hit_bits' => $hit_score, } );
541 15         47 for my $i ( 1 .. $num_hsp ) {
542 15         29 my $hsp_key = $hit_name . $hit_counter . "_" . $i;
543 15         20 my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
544 15 50       23 if ( defined $hsp ) {
545 15         16 my $hspid = shift @$hsp;
546              
547 15         34 my ($cm_start, $cm_stop, $seq_start, $seq_stop,
548             $score, $eval, $hitlength, $ncline,
549             $csline, $qseq, $midline, $hseq, $pline) = @$hsp;
550 15 100       29 if ( $hitlength != 0 ) {
551 10         26 $self->element(
552             { 'Name' => 'Hit_len', 'Data' => $hitlength }
553             );
554             }
555              
556 15         39 $self->start_element( { 'Name' => 'Hsp' } );
557 15         108 $self->element_hash( { 'Hsp_stranded' => 'HIT',
558             'Hsp_query-from' => $cm_start,
559             'Hsp_query-to' => $cm_stop,
560             'Hsp_hit-from' => $seq_start,
561             'Hsp_hit-to' => $seq_stop,
562             'Hsp_score' => $score,
563             'Hsp_bit-score' => $score,
564             'Hsp_evalue' => $eval,
565             'Hsp_ncline' => $ncline,
566             'Hsp_structure' => $csline,
567             'Hsp_qseq' => $qseq,
568             'Hsp_midline' => $midline,
569             'Hsp_hseq' => $hseq,
570             'Hsp_pline' => $pline,
571             } );
572              
573 15         65 $self->end_element( { 'Name' => 'Hsp' } );
574             }
575             }
576 15         35 $self->end_element( { 'Name' => 'Hit' } );
577             }
578              
579 4         13 $self->end_element( { 'Name' => 'Result' } );
580 4         14 my $result = $self->end_document();
581 4         50 return $result;
582             }
583              
584              
585             =head2 start_element
586              
587             Title : start_element
588             Usage : $eventgenerator->start_element
589             Function: Handles a start element event
590             Returns : none
591             Args : hashref with at least 2 keys 'Data' and 'Name'
592              
593              
594             =cut
595              
596             sub start_element {
597 69     69 1 63 my ( $self, $data ) = @_;
598              
599             # we currently don't care about attributes
600 69         68 my $nm = $data->{'Name'};
601 69         75 my $type = $MODEMAP{$nm};
602 69 50       108 if ($type) {
603 69 50       121 if ( $self->_eventHandler->will_handle($type) ) {
604 69         174 my $func = sprintf( "start_%s", lc $type );
605 69         94 $self->_eventHandler->$func( $data->{'Attributes'} );
606             }
607 69         90 unshift @{ $self->{'_elements'} }, $type;
  69         115  
608             }
609 69 100 66     306 if ( defined $type
610             && $type eq 'result' )
611             {
612 7         8 $self->{'_values'} = {};
613 7         17 $self->{'_result'} = undef;
614             }
615             }
616              
617             =head2 end_element
618              
619             Title : start_element
620             Usage : $eventgenerator->end_element
621             Function: Handles an end element event
622             Returns : none
623             Args : hashref with at least 2 keys, 'Data' and 'Name'
624              
625             =cut
626              
627             sub end_element {
628 83     83 1 94 my ( $self, $data ) = @_;
629 83         76 my $nm = $data->{'Name'};
630 83         80 my $type = $MODEMAP{$nm};
631 83         57 my $rc;
632              
633 83 100       122 if ($type) {
    50          
634 70 50       127 if ( $self->_eventHandler->will_handle($type) ) {
635 70         176 my $func = sprintf( "end_%s", lc $type );
636             $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
637 70         107 $self->{'_values'} );
638             }
639 70         63 my $lastelem = shift @{ $self->{'_elements'} };
  70         123  
640              
641             # Infernal 1.1 allows one to know hit->length in some instances
642             # so remove it so it doesn't carry over to next hit. Tried flushing
643             # all 'type' values from {_values} buffer but it breaks legacy tests
644 70 100       129 if ($type eq 'hit' ) {
645 23         28 delete $self->{_values}{'HIT-length'};
646 23         31 delete $self->{_values}{'HSP-hit_length'};
647             }
648             }
649             elsif ( $MAPPING{$nm} ) {
650 13 50       24 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
651 0         0 my $key = ( keys %{ $MAPPING{$nm} } )[0];
  0         0  
652             $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
653 0         0 $self->{'_last_data'};
654             }
655             else {
656 13         37 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
657             }
658             }
659             else {
660 0         0 $self->debug("unknown nm $nm, ignoring\n");
661             }
662 83         123 $self->{'_last_data'} = ''; # remove read data if we are at
663             # end of an element
664 83 100 100     269 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
665 83         225 return $rc;
666             }
667              
668             =head2 element
669              
670             Title : element
671             Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
672             Function: Convenience method that calls start_element, characters, end_element
673             Returns : none
674             Args : Hash ref with the keys 'Name' and 'Data'
675              
676             =cut
677              
678             sub element {
679 13     13 1 11 my ( $self, $data ) = @_;
680             # simple data calls (%MAPPING) do not need start_element
681 13         30 $self->characters($data);
682 13         21 $self->end_element($data);
683             }
684              
685             =head2 element_hash
686              
687             Title : element
688             Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
689             'Hsp_hit-to' => $end,
690             'Hsp_score' => $lastscore});
691             Function: Convenience method that takes multiple simple data elements and
692             maps to appropriate parameters
693             Returns : none
694             Args : Hash ref with the mapped key (in %MAPPING) and value
695              
696             =cut
697              
698             sub element_hash {
699 124     124 1 107 my ($self, $data) = @_;
700 124 50 33     301 $self->throw("Must provide data hash ref") if !$data || !ref($data);
701 124         88 for my $nm (sort keys %{$data}) {
  124         472  
702 715 100 100     2171 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
703 707 50       900 if ( $MAPPING{$nm} ) {
704 707 50       687 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
705 0         0 my $key = ( keys %{ $MAPPING{$nm} } )[0];
  0         0  
706             $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
707 0         0 $data->{$nm};
708             }
709             else {
710 707         1108 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
711             }
712             }
713             }
714             }
715              
716             =head2 characters
717              
718             Title : characters
719             Usage : $eventgenerator->characters($str)
720             Function: Send a character events
721             Returns : none
722             Args : string
723              
724              
725             =cut
726              
727             sub characters {
728 13     13 1 11 my ( $self, $data ) = @_;
729 13 50 33     95 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
730 13         22 $self->{'_last_data'} = $data->{'Data'};
731             }
732              
733             =head2 within_element
734              
735             Title : within_element
736             Usage : if( $eventgenerator->within_element($element) ) {}
737             Function: Test if we are within a particular element
738             This is different than 'in' because within can be tested
739             for a whole block.
740             Returns : boolean
741             Args : string element name
742              
743             =cut
744              
745             sub within_element {
746 43     43 1 48 my ( $self, $name ) = @_;
747             return 0
748             if ( !defined $name
749             || !defined $self->{'_elements'}
750 43 100 33     168 || scalar @{ $self->{'_elements'} } == 0 );
  43   66     122  
751 40         38 foreach ( @{ $self->{'_elements'} } ) {
  40         78  
752 52 100       153 return 1 if ( $_ eq $name );
753             }
754 12         34 return 0;
755             }
756              
757             =head2 in_element
758              
759             Title : in_element
760             Usage : if( $eventgenerator->in_element($element) ) {}
761             Function: Test if we are in a particular element
762             This is different than 'within' because 'in' only
763             tests its immediate parent.
764             Returns : boolean
765             Args : string element name
766              
767             =cut
768              
769             sub in_element {
770 86     86 1 97 my ( $self, $name ) = @_;
771 86 50       153 return 0 if !defined $self->{'_elements'}->[0];
772 86         273 return ( $self->{'_elements'}->[0] eq $name );
773             }
774              
775             =head2 start_document
776              
777             Title : start_document
778             Usage : $eventgenerator->start_document
779             Function: Handle a start document event
780             Returns : none
781             Args : none
782              
783             =cut
784              
785             sub start_document {
786 9     9 1 12 my ($self) = @_;
787 9         17 $self->{'_lasttype'} = '';
788 9         19 $self->{'_values'} = {};
789 9         31 $self->{'_result'} = undef;
790 9         20 $self->{'_elements'} = [];
791             }
792              
793             =head2 end_document
794              
795             Title : end_document
796             Usage : $eventgenerator->end_document
797             Function: Handles an end document event
798             Returns : Bio::Search::Result::ResultI object
799             Args : none
800              
801             =cut
802              
803             sub end_document {
804 8     8 1 7 my ($self) = @_;
805 8         34 return $self->{'_result'};
806             }
807              
808             =head2 result_count
809              
810             Title : result_count
811             Usage : my $count = $searchio->result_count
812             Function: Returns the number of results we have processed
813             Returns : integer
814             Args : none
815              
816             =cut
817              
818             sub result_count {
819 0     0 1 0 my $self = shift;
820 0         0 return $self->{'_result_count'};
821             }
822              
823             =head2 model
824              
825             Title : model
826             Usage : my $model = $parser->model();
827             Function: Get/Set model; Infernal currently does not output
828             the model name (Rfam ID)
829             Returns : String (name of model)
830             Args : [optional] String (name of model)
831              
832             =cut
833              
834             sub model {
835 5     5 1 10 my $self = shift;
836 5 100       18 return $self->{'_model'} = shift if @_;
837 2         6 return $self->{'_model'};
838             }
839              
840             =head2 database
841              
842             Title : database
843             Usage : my $database = $parser->database();
844             Function: Get/Set database; pre-v.1 versions of Infernal do not output
845             the database name
846             Returns : String (database name)
847             Args : [optional] String (database name)
848              
849             =cut
850              
851             sub database {
852 6     6 1 9 my $self = shift;
853 6 100       18 return $self->{'_database'} = shift if @_;
854 3         10 return $self->{'_database'};
855             }
856              
857             =head2 algorithm
858              
859             Title : algorithm
860             Usage : my $algorithm = $parser->algorithm();
861             Function: Get/Set algorithm; pre-v.1 versions of Infernal do not output
862             the algorithm name
863             Returns : String (algorithm name)
864             Args : [optional] String (algorithm name)
865              
866             =cut
867              
868             sub algorithm {
869 10     10 1 12 my $self = shift;
870 10 100       39 return $self->{'_algorithm'} = shift if @_;
871 3         8 return $self->{'_algorithm'};
872             }
873              
874             =head2 query_accession
875              
876             Title : query_accession
877             Usage : my $acc = $parser->query_accession();
878             Function: Get/Set query (model) accession; pre-v1.1 Infernal does not output
879             the accession number (Rfam accession #)
880             Returns : String (accession)
881             Args : [optional] String (accession)
882              
883             =cut
884              
885             sub query_accession {
886 13     13 1 14 my $self = shift;
887 13 100       33 return $self->{'_query_accession'} = shift if @_;
888 9         29 return $self->{'_query_accession'};
889             }
890              
891             =head2 query_description
892              
893             Title : query_description
894             Usage : my $acc = $parser->query_description();
895             Function: Get/Set query (model) description; pre-v1.1 Infernal does not output
896             the Rfam description
897             Returns : String (description)
898             Args : [optional] String (description)
899              
900             =cut
901              
902             sub query_description {
903 13     13 1 17 my $self = shift;
904 13 100       26 return $self->{'_query_description'} = shift if @_;
905 9         18 return $self->{'_query_description'};
906             }
907              
908             =head2 hsp_minscore
909              
910             Title : hsp_minscore
911             Usage : my $cutoff = $parser->hsp_minscore();
912             Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
913             Returns : score (number)
914             Args : [optional] score (number)
915              
916             =cut
917              
918             sub hsp_minscore {
919 9     9 1 10 my $self = shift;
920 9 100       26 return $self->{'_hsp_minscore'} = shift if @_;
921 2         4 return $self->{'_hsp_minscore'};
922             }
923              
924             =head2 convert_meta
925              
926             Title : convert_meta
927             Usage : $parser->convert_meta(1);
928             Function: Get/Set boolean flag for converting Infernal WUSS format
929             to a simple bracketed format (simple WUSS by default)
930             Returns : boolean flag (TRUE or FALSE)
931             Args : [optional] boolean (eval's to TRUE or FALSE)
932              
933             =cut
934              
935             sub convert_meta {
936 42     42 1 37 my $self = shift;
937 42 100       63 return $self->{'_convert_meta'} = shift if @_;
938 39         87 return $self->{'_convert_meta'};
939             }
940              
941             =head2 version
942              
943             Title : version
944             Usage : $parser->version();
945             Function: Set the Infernal cmsearch version
946             Returns : version
947             Args : [optional] version
948              
949             =cut
950              
951             sub version {
952 9     9 1 10 my $self = shift;
953 9 100       29 return $self->{'_version'} = shift if @_;
954 2         5 return $self->{'_version'};
955             }
956              
957             =head2 structure_symbols
958              
959             Title : structure_symbols
960             Usage : my $hashref = $parser->structure_symbols();
961             Function: Get/Set RNA structure symbols
962             Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
963             : default = < (5-prime)
964             > (3-prime)
965             : (single-strand)
966             ? (unknown)
967             . (gap)
968             Args : Hash ref of substitute delimiters, using above keys.
969              
970             =cut
971              
972             sub structure_symbols {
973 10     10 1 10 my ($self, $delim) = @_;
974 10 100       25 if ($delim) {
975 7 50       28 if (ref($delim) =~ m{HASH}) {
976 7         8 my %data = %{ $delim };
  7         49  
977 7         14 for my $d (@VALID_SYMBOLS) {
978 35 50       54 if ( exists $data{$d} ) {
979 35         69 $self->{'_delimiter'}->{$d} = $data{$d};
980             }
981             }
982             } else {
983 0         0 $self->throw("Args to helix_delimiters() should be in a hash reference");
984             }
985             }
986 10         16 return $self->{'_delimiter'};
987             }
988              
989             =head2 simple_meta
990              
991             Title : simple_meta
992             Usage : my $string = $parser->simple_meta($str);
993             Function: converts more complex WUSS meta format into simple bracket format
994             using symbols defined in structure_symbols()
995             Returns : converted string
996             Args : [required] string to convert
997             Note : This is a very simple conversion method to get simple bracketed
998             format from Infernal data. If the convert_meta() flag is set,
999             this is the method used to convert the strings.
1000              
1001             =cut
1002              
1003             sub simple_meta {
1004 3     3 1 5 my ($self, $str) = @_;
1005 3 50       7 $self->throw("No string arg sent!") if !$str;
1006 3         5 my $structs = $self->structure_symbols();
1007             my ($ls, $rs, $ss, $unk, $gap) = ($structs->{'5-prime'}, $structs->{'3-prime'},
1008             $structs->{'single-strand'}, $structs->{'unknown'},
1009 3         7 $structs->{'gap'});
1010 3         27 $str =~ s{[\(\<\[\{]}{$ls}g;
1011 3         22 $str =~ s{[\)\>\]\}]}{$rs}g;
1012 3         56 $str =~ s{[:,_-]}{$ss}g;
1013 3         5 $str =~ s{\.}{$gap}g;
1014             # unknown not handled yet
1015 3         6 return $str;
1016             }
1017              
1018             ## private methods
1019              
1020             # this is a hack which guesses the format and sets the handler for parsing in
1021             # an instance; it'll be taken out when infernal 1.0 is released
1022              
1023             sub _parse_latest {
1024 1     1   3 my ($self) = @_;
1025 1         1 my $seentop = 0;
1026 1         5 local $/ = "\n";
1027 1         3 my ($accession, $description) = ($self->query_accession, $self->query_description);
1028 1         2 my ($maxscore, $mineval, $minpval);
1029 1         3 $self->start_document();
1030 1         1 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1031             PARSER:
1032 1         3 while (my $line = $self->_readline) {
1033 55 100       137 next if $line =~ m{^\s+$};
1034             # stats aren't parsed yet...
1035 48 100       292 if ($line =~ m{^\#\s+cmsearch}xms) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
1036 1         3 $seentop = 1;
1037 1         5 $self->start_element({'Name' => 'Result'});
1038 1         4 $self->element_hash({
1039             'Infernal_program' => 'CMSEARCH'
1040             });
1041             }
1042             elsif ($line =~ m{^\#\sINFERNAL\s+(\d+\.\d+)}xms) {
1043 1         4 $self->element_hash({
1044             'Infernal_version' => $1,
1045             });
1046             }
1047             elsif ($line =~ m{^\#\scommand:.*?\s(\S+)$}xms) {
1048 1         4 $self->element_hash({
1049             'Infernal_db' => $1,
1050             });
1051             }
1052             elsif ($line =~ m{^\#\s+dbsize\(Mb\):\s+(\d+\.\d+)}xms) {
1053             # store absolute DB length
1054 1         7 $self->element_hash({
1055             'Infernal_db-let' => $1 * 1e6
1056             });
1057             }
1058             elsif ($line =~ m{^CM(?:\s(\d+))?:\s*(\S+)}xms) {
1059             # not sure, but it's possible single reports may contain multiple
1060             # models; if so, they should be rolled over into a new ResultI
1061             #print STDERR "ACC: $accession\nDESC: $description\n";
1062 1         5 $self->element_hash({
1063             'Infernal_query-def' => $2, # present in output now
1064             'Infernal_query-acc' => $accession,
1065             'Infernal_querydesc' => $description
1066             });
1067             }
1068             elsif ($line =~ m{^>\s*(\S+)} ){
1069             #$self->debug("Start Hit: Found hit:$1\n");
1070 1 50       4 if ($self->in_element('hit')) {
1071 0         0 $self->element_hash({'Hit_score' => $maxscore,
1072             'Hit_bits' => $maxscore});
1073 0         0 ($maxscore, $minpval, $mineval) = undef;
1074 0         0 $self->end_element({'Name' => 'Hit'});
1075             }
1076 1         3 $lasthit = $1;
1077             }
1078             elsif ($line =~ m{
1079             ^\sQuery\s=\s\d+\s-\s\d+,\s # Query start/end
1080             Target\s=\s(\d+)\s-\s(\d+) # Target start/end
1081             }xmso) {
1082             # Query (model) start/end always the same, determined from
1083             # the HSP length
1084 3         9 ($laststart, $lastend) = ($1, $2);
1085             #$self->debug("Found hit coords:$laststart - $lastend\n");
1086             } elsif ($line =~ m{
1087             ^\sScore\s=\s([\d\.]+),\s # Score = Bitscore (for now)
1088             (?:E\s=\s([\d\.e-]+),\s # E-val optional
1089             P\s=\s([\d\.e-]+),\s)? # P-val optional
1090             GC\s= # GC not captured
1091             }xmso
1092             ) {
1093 3         7 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1094             #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1095 3   66     9 $maxscore ||= $lastscore;
1096 3 50 33     11 if ($lasteval && $lastpval) {
1097 3   66     17 $mineval ||= $lasteval;
1098 3   66     7 $minpval ||= $lastpval;
1099 3 50       9 $mineval = ($mineval > $lasteval) ? $lasteval :
1100             $mineval;
1101 3 50       5 $minpval = ($minpval > $lastpval) ? $lastpval :
1102             $minpval;
1103             }
1104 3 50       6 $maxscore = ($maxscore < $lastscore) ? $lastscore :
1105             $maxscore;
1106 3 100       7 if (!$self->within_element('hit')) {
1107 1         10 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1108 1         4 $self->start_element({'Name' => 'Hit'});
1109 1 50       8 $self->element_hash({
    50          
1110             'Hit_id' => $lasthit,
1111             'Hit_accession' => $ver ? "$acc.$ver" :
1112             $acc ? $acc : $lasthit,
1113             'Hit_gi' => $gi
1114             });
1115             }
1116 3 50       9 if (!$self->in_element('hsp')) {
1117 3         7 $self->start_element({'Name' => 'Hsp'});
1118             }
1119              
1120             # hsp is similar to older output
1121             } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1122 3         6 $self->_pushback($line); # set up for loop
1123             #$self->debug("Start HSP\n");
1124             # what is length of the gap to the structure data?
1125 3         7 my $offset = length($1);
1126 3         3 my ($ct, $strln) = 0;
1127 3         2 my $hsp;
1128 3         10 HSP:
1129             my %hsp_key = ('0' => 'meta',
1130             '1' => 'query',
1131             '2' => 'midline',
1132             '3' => 'hit');
1133             HSP:
1134 3         5 while (defined ($line = $self->_readline)) {
1135 31         26 chomp $line;
1136 31 100       38 next if (!$line); # toss empty lines
1137             # next if $line =~ m{^\s*$}; # toss empty lines
1138             # it is possible to have homology lines consisting
1139             # entirely of spaces if the subject has a large
1140             # insertion where nothing matches the model
1141              
1142             # exit loop if at end of file or upon next hit/HSP
1143 23 100       46 if ($line =~ m{^\s{0,2}\S+}) {
1144 3         6 $self->_pushback($line);
1145 3         4 last HSP;
1146             }
1147             # iterate to keep track of each line (4 lines per hsp block)
1148 20         11 my $iterator = $ct % 4;
1149             # strlen set only with structure lines (proper length)
1150 20 100       28 $strln = length($line) if $iterator == 0;
1151             # only grab the data needed (hit start and stop in hit line above)
1152              
1153 20         22 my $data = substr($line, $offset, $strln-$offset);
1154 20         33 $hsp->{ $hsp_key{$iterator} } .= $data;
1155              
1156 20         27 $ct++;
1157             }
1158              
1159             # query start, end are from the actual query length (entire hit is
1160             # mapped to CM data, so all CM data is represented)
1161             # works for now...
1162 3 50       3 if ($self->in_element('hsp')) {
1163             # In some cases with HSPs unaligned residues are present in
1164             # the hit or query (Ex: '*[ 8]*' is 8 unaligned residues).
1165             # This info needs to be passed on unmodifed to the HSP class
1166             # and handled there as it is subjectively changed based on
1167             # use.
1168 3         4 my $strlen = 0;
1169              
1170             # catch any insertions and add them into the actual length
1171 3         12 while ($hsp->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
1172 2         7 $strlen += $1;
1173             }
1174             # add on the actual residues
1175 3         5 $strlen += $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1176              
1177             my $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1178 3 50       10 $hsp->{'meta'};
1179             $self->element_hash(
1180             {'Hsp_stranded' => 'HIT',
1181             'Hsp_qseq' => $hsp->{'query'},
1182             'Hsp_hseq' => $hsp->{'hit'},
1183 3         24 'Hsp_midline' => $hsp->{'midline'},
1184             'Hsp_structure' => $metastr,
1185             'Hsp_query-from' => 1,
1186             'Infernal_query-len' => $strlen,
1187             'Hsp_query-to' => $strlen,
1188             'Hsp_hit-from' => $laststart,
1189             'Hsp_hit-to' => $lastend,
1190             'Hsp_score' => $lastscore,
1191             'Hsp_bit-score' => $lastscore,
1192             });
1193 3 50 33     22 $self->element_hash(
1194             {'Hsp_evalue' => $lasteval,
1195             'Hsp_pvalue' => $lastpval,
1196             }) if ($lasteval && $lastpval);
1197 3         10 $self->end_element({'Name' => 'Hsp'});
1198             }
1199             # result now ends with // and 'Fin'
1200             } elsif ($line =~ m{^//}xms ) {
1201 1 50 33     3 if ($self->within_element('result') && $seentop) {
1202 1 50       3 if ($self->in_element('hit')) {
1203 1         5 $self->element_hash({'Hit_score' => $maxscore,
1204             'Hit_bits' => $maxscore});
1205             # don't know where to put minpval yet
1206 1 50       5 $self->element_hash({'Hit_signif' => $mineval}) if $mineval;
1207 1 50       6 $self->element_hash({'Hit_p' => $minpval}) if $minpval;
1208 1         4 $self->end_element({'Name' => 'Hit'});
1209             }
1210 1         3 last PARSER;
1211             }
1212             }
1213             }
1214 1 50       4 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1215 1 50       6 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1216 1         3 return $self->end_document();
1217             }
1218              
1219             # cmsearch 0.81 (pre-1.0)
1220             sub _parse_pre {
1221 1     1   2 my ($self) = @_;
1222 1         2 my $seentop = 0;
1223 1         4 local $/ = "\n";
1224 1         4 my ($accession, $db, $algorithm, $description, $version) =
1225             ($self->query_accession, $self->database, $self->algorithm,
1226             $self->query_description, '0.81');
1227 1         2 my ($maxscore, $mineval, $minpval);
1228 1         6 $self->start_document();
1229 1         3 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1230             PARSER:
1231 1         4 while (my $line = $self->_readline) {
1232 85 100       271 next if $line =~ m{^\s+$};
1233             # stats aren't parsed yet...
1234 60 100       324 if ($line =~ m{CM\s\d+:\s*(\S+)}xms) {
    100          
    100          
    100          
    100          
    100          
1235             #$self->debug("Start Result: Found model:$1\n");
1236 1 50       8 if (!$self->within_element('result')) {
1237 1         3 $seentop = 1;
1238 1         7 $self->start_element({'Name' => 'Result'});
1239 1         9 $self->element_hash({
1240             'Infernal_program' => $algorithm,
1241             'Infernal_query-def' => $1, # present in output now
1242             'Infernal_query-acc' => $accession,
1243             'Infernal_querydesc' => $description,
1244             'Infernal_db' => $db
1245             });
1246             }
1247             } elsif ($line =~ m{^>\s*(\S+)} ){
1248             #$self->debug("Start Hit: Found hit:$1\n");
1249 3 100       7 if ($self->in_element('hit')) {
1250 2         9 $self->element_hash({'Hit_score' => $maxscore,
1251             'Hit_bits' => $maxscore});
1252 2         4 ($maxscore, $minpval, $mineval) = undef;
1253 2         7 $self->end_element({'Name' => 'Hit'});
1254             }
1255 3         14 $lasthit = $1;
1256             }
1257             elsif ($line =~ m{
1258             ^\sQuery\s=\s\d+\s-\s\d+,\s # Query start/end
1259             Target\s=\s(\d+)\s-\s(\d+) # Target start/end
1260             }xmso) {
1261             # Query (model) start/end always the same, determined from
1262             # the HSP length
1263 15         48 ($laststart, $lastend) = ($1, $2);
1264             #$self->debug("Found hit coords:$laststart - $lastend\n");
1265             } elsif ($line =~ m{
1266             ^\sScore\s=\s([\d\.]+),\s # Score = Bitscore (for now)
1267             (?:E\s=\s([\d\.e-]+),\s # E-val optional
1268             P\s=\s([\d\.e-]+),\s)? # P-val optional
1269             GC\s= # GC not captured
1270             }xmso
1271             ) {
1272 15         29 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1273             #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1274 15   66     30 $maxscore ||= $lastscore;
1275 15 50 33     45 if ($lasteval && $lastpval) {
1276 15   66     26 $mineval ||= $lasteval;
1277 15   66     24 $minpval ||= $lastpval;
1278 15 50       42 $mineval = ($mineval > $lasteval) ? $lasteval :
1279             $mineval;
1280 15 50       27 $minpval = ($minpval > $lastpval) ? $lastpval :
1281             $minpval;
1282             }
1283 15 50       23 $maxscore = ($maxscore < $lastscore) ? $lastscore :
1284             $maxscore;
1285 15 100       27 if (!$self->within_element('hit')) {
1286 3         10 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1287 3         10 $self->start_element({'Name' => 'Hit'});
1288 3 0       21 $self->element_hash({
    50          
1289             'Hit_id' => $lasthit,
1290             'Hit_accession' => $ver ? "$acc.$ver" :
1291             $acc ? $acc : $lasthit,
1292             'Hit_gi' => $gi
1293             });
1294             }
1295 15 50       25 if (!$self->in_element('hsp')) {
1296 15         31 $self->start_element({'Name' => 'Hsp'});
1297             }
1298              
1299             # hsp is similar to older output
1300             } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1301 15         28 $self->_pushback($line); # set up for loop
1302             #$self->debug("Start HSP\n");
1303             # what is length of the gap to the structure data?
1304 15         25 my $offset = length($1);
1305 15         13 my ($ct, $strln) = 0;
1306 15         13 my $hsp;
1307 15         39 HSP:
1308             my %hsp_key = ('0' => 'meta',
1309             '1' => 'query',
1310             '2' => 'midline',
1311             '3' => 'hit');
1312             HSP:
1313 15         32 while (defined ($line = $self->_readline)) {
1314 180         149 chomp $line;
1315 180 100       223 next if (!$line); # toss empty lines
1316             # next if $line =~ m{^\s*$}; # toss empty lines
1317             # it is possible to have homology lines consisting
1318             # entirely of spaces if the subject has a large
1319             # insertion where nothing matches the model
1320              
1321             # exit loop if at end of file or upon next hit/HSP
1322 135 100       254 if ($line =~ m{^\s{0,2}\S+}) {
1323 15         19 $self->_pushback($line);
1324 15         16 last HSP;
1325             }
1326             # iterate to keep track of each line (4 lines per hsp block)
1327 120         93 my $iterator = $ct%4;
1328             # strlen set only with structure lines (proper length)
1329 120 100       135 $strln = length($line) if $iterator == 0;
1330             # only grab the data needed (hit start and stop in hit line above)
1331              
1332 120         124 my $data = substr($line, $offset, $strln-$offset);
1333 120         190 $hsp->{ $hsp_key{$iterator} } .= $data;
1334              
1335 120         179 $ct++;
1336             }
1337              
1338             # query start, end are from the actual query length (entire hit is
1339             # mapped to CM data, so all CM data is represented)
1340             # works for now...
1341 15 50       22 if ($self->in_element('hsp')) {
1342 15         20 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1343              
1344 15         10 my $metastr;
1345             $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1346 15 50       26 ($hsp->{'meta'});
1347             $self->element_hash(
1348             {'Hsp_stranded' => 'HIT',
1349             'Hsp_qseq' => $hsp->{'query'},
1350             'Hsp_hseq' => $hsp->{'hit'},
1351 15         103 'Hsp_midline' => $hsp->{'midline'},
1352             'Hsp_structure' => $metastr,
1353             'Hsp_query-from' => 1,
1354             'Infernal_query-len' => $strlen,
1355             'Hsp_query-to' => $strlen,
1356             'Hsp_hit-from' => $laststart,
1357             'Hsp_hit-to' => $lastend,
1358             'Hsp_score' => $lastscore,
1359             'Hsp_bit-score' => $lastscore,
1360             });
1361 15 50 33     121 $self->element_hash(
1362             {'Hsp_evalue' => $lasteval,
1363             'Hsp_pvalue' => $lastpval,
1364             }) if ($lasteval && $lastpval);
1365 15         43 $self->end_element({'Name' => 'Hsp'});
1366             }
1367             # result now ends with // and 'Fin'
1368             } elsif ($line =~ m{^//}xms ) {
1369 1 50 33     4 if ($self->within_element('result') && $seentop) {
1370 1         4 $self->element(
1371             {'Name' => 'Infernal_version',
1372             'Data' => $version}
1373             );
1374 1 50       4 if ($self->in_element('hit')) {
1375 1         4 $self->element_hash({'Hit_score' => $maxscore,
1376             'Hit_bits' => $maxscore});
1377             # don't know where to put minpval yet
1378 1 50       6 $self->element_hash({'Hit_signif' => $mineval}) if $mineval;
1379 1         4 $self->end_element({'Name' => 'Hit'});
1380             }
1381 1         4 last PARSER;
1382             }
1383             }
1384             }
1385 1 50       3 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1386 1 50       6 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1387 1         4 return $self->end_document();
1388             }
1389              
1390             # cmsearch 0.72 and below; will likely be dropped when Infernal 1.0 is released
1391             sub _parse_old {
1392 2     2   4 my ($self) = @_;
1393 2         3 my $seentop = 0;
1394 2         10 local $/ = "\n";
1395 2         5 my ($accession, $db, $algorithm, $model, $description, $version) =
1396             ($self->query_accession, $self->database, $self->algorithm,
1397             $self->model, $self->query_description, $self->version);
1398 2         16 my $maxscore;
1399 2         5 my $cutoff = $self->hsp_minscore;
1400 2         7 $self->start_document();
1401 2         3 local ($_);
1402 2         3 my $line;
1403 2         2 my ($lasthit, $lastscore, $laststart, $lastend);
1404 0         0 my $hitline;
1405             PARSER:
1406 2         7 while ( defined( $line = $self->_readline ) ) {
1407 88 50       249 next if $line =~ m{^\s+$};
1408             # bypass this for now...
1409 88 50       126 next if $line =~ m{^HMM\shit};
1410             # pre-0.81
1411 88 100 66     418 if ($line =~ m{^sequence:\s+(\S+)} ){
    100          
    100          
    100          
1412 4 100       14 if (!$self->within_element('result')) {
1413 2         3 $seentop = 1;
1414 2         10 $self->start_element({'Name' => 'Result'});
1415 2         14 $self->element_hash({
1416             'Infernal_program' => $algorithm,
1417             'Infernal_query-def' => $model,
1418             'Infernal_query-acc' => $accession,
1419             'Infernal_querydesc' => $description,
1420             'Infernal_db' => $db
1421             });
1422             }
1423 4 100       13 if ($self->in_element('hit')) {
1424 2         14 $self->element_hash({'Hit_score' => $maxscore,
1425             'Hit_bits' => $maxscore});
1426 2         5 $maxscore = undef;
1427 2         10 $self->end_element({'Name' => 'Hit'});
1428             }
1429 4         16 $lasthit = $1;
1430             } elsif ($line =~ m{^hit\s+\d+\s+:\s+(\d+)\s+(\d+)\s+(\d+\.\d+)\s+bits}xms) {
1431 52         105 ($laststart, $lastend, $lastscore) = ($1, $2, $3);
1432 52 100       73 $maxscore = $lastscore unless $maxscore;
1433 52 100       166 if ($lastscore > $cutoff) {
1434 12 100       23 if (!$self->within_element('hit')) {
1435 4         15 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1436 4         17 $self->start_element({'Name' => 'Hit'});
1437 4 0       30 $self->element_hash({
    50          
1438             'Hit_id' => $lasthit,
1439             'Hit_accession' => $ver ? "$acc.$ver" :
1440             $acc ? $acc : $lasthit,
1441             'Hit_gi' => $gi
1442             });
1443             }
1444             # necessary as infernal 0.71 has repeated hit line
1445 12 100       29 if (!$self->in_element('hsp')) {
1446 6         19 $self->start_element({'Name' => 'Hsp'});
1447             }
1448 12 100       44 $maxscore = ($maxscore < $lastscore) ? $lastscore :
1449             $maxscore;
1450             }
1451             } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1452 26         57 $self->_pushback($line); # set up for loop
1453             # what is length of the gap to the structure data?
1454 26         45 my $offset = length($1);
1455 26         28 my ($ct, $strln) = 0;
1456 26         30 my $hsp;
1457 26         76 HSP:
1458             my %hsp_key = ('0' => 'meta',
1459             '1' => 'query',
1460             '2' => 'midline',
1461             '3' => 'hit');
1462             HSP:
1463 26         51 while ($line = $self->_readline) {
1464 288 100       688 next if $line =~ m{^\s*$}; # toss empty lines
1465 234         193 chomp $line;
1466             # exit loop if at end of file or upon next hit/HSP
1467 234 100 66     722 if (!defined($line) || $line =~ m{^\S+}) {
1468 26         44 $self->_pushback($line);
1469 26         31 last HSP;
1470             }
1471             # iterate to keep track of each line (4 lines per hsp block)
1472 208         174 my $iterator = $ct%4;
1473             # strlen set only with structure lines (proper length)
1474 208 100       263 $strln = length($line) if $iterator == 0;
1475             # only grab the data needed (hit start and stop in hit line above)
1476              
1477 208         240 my $data = substr($line, $offset, $strln-$offset);
1478 208         406 $hsp->{ $hsp_key{$iterator} } .= $data;
1479 208         364 $ct++;
1480             }
1481             # query start, end are from the actual query length (entire hit is
1482             # mapped to CM data, so all CM data is represented)
1483             # works for now...
1484 26 100       45 if ($self->in_element('hsp')) {
1485 6         12 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1486            
1487 6         7 my $metastr;
1488             # Ugh...these should be passed in a hash
1489             $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1490 6 100       14 ($hsp->{'meta'});
1491             $self->element_hash(
1492             {'Hsp_stranded' => 'HIT',
1493             'Hsp_qseq' => $hsp->{'query'},
1494             'Hsp_hseq' => $hsp->{'hit'},
1495 6         60 'Hsp_midline' => $hsp->{'midline'},
1496             'Hsp_structure' => $metastr,
1497             'Hsp_query-from' => 1,
1498             'Infernal_query-len' => $strlen,
1499             'Hsp_query-to' => $strlen,
1500             'Hsp_hit-from' => $laststart,
1501             'Hsp_hit-to' => $lastend,
1502             'Hsp_score' => $lastscore,
1503             'Hsp_bit-score' => $lastscore
1504             });
1505 6         39 $self->end_element({'Name' => 'Hsp'});
1506             }
1507             } elsif ($line =~ m{^memory}xms || $line =~ m{^CYK\smemory}xms ) {
1508 2 50 33     9 if ($self->within_element('result') && $seentop) {
1509 2         19 $self->element(
1510             {'Name' => 'Infernal_version',
1511             'Data' => $version}
1512             );
1513 2 50       8 if ($self->in_element('hit')) {
1514 2         10 $self->element_hash({'Hit_score' => $maxscore,
1515             'Hit_bits' => $maxscore});
1516 2         10 $self->end_element({'Name' => 'Hit'});
1517             }
1518 2         7 last PARSER;
1519             }
1520             }
1521             }
1522 2 50       8 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1523 2 50       11 $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1524 2         9 return $self->end_document();
1525             }
1526              
1527             1;