File Coverage

Bio/Search/SearchUtils.pm
Criterion Covered Total %
statement 273 336 81.2
branch 89 142 62.6
condition 26 29 89.6
subroutine 8 11 72.7
pod 6 6 100.0
total 402 524 76.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects
4              
5             =head1 SYNOPSIS
6              
7             # This module is just a collection of subroutines, not an object.
8              
9             =head1 DESCRIPTION
10              
11             The SearchUtils.pm module is a collection of subroutines used
12             primarily by Bio::Search::Hit::HitI objects for some of the additional
13             functionality, such as HSP tiling. Right now, the SearchUtils is just
14             a collection of methods, not an object.
15              
16             =head1 AUTHOR
17              
18             Steve Chervitz Esac@bioperl.orgE
19              
20             =head1 CONTRIBUTORS
21              
22             Sendu Bala, bix@sendu.me.uk
23              
24             =cut
25              
26             package Bio::Search::SearchUtils;
27 29     29   1181 use Bio::Root::Version;
  29         33  
  29         177  
28              
29 29     29   773 use strict;
  29         36  
  29         70662  
30              
31             =head2 tile_hsps
32              
33             Usage : tile_hsps( $sbjct );
34             : This is called automatically by methods in Bio::Search::Hit::GenericHit
35             : that rely on having tiled data.
36             :
37             : If you are interested in getting data about the constructed HSP contigs:
38             : my ($qcontigs, $scontigs) = Bio::Search::SearchUtils::tile_hsps($hit);
39             : if (ref $qcontigs) {
40             : print STDERR "Query contigs:\n";
41             : foreach (@{$qcontigs}) {
42             : print "contig start is $_->{'start'}\n";
43             : print "contig stop is $_->{'stop'}\n";
44             : }
45             : }
46             : See below for more information about the contig data structure.
47             :
48             Purpose : Collect statistics about the aligned sequences in a set of HSPs.
49             : Calculates the following data across all HSPs:
50             : -- total alignment length
51             : -- total identical residues
52             : -- total conserved residues
53             Returns : If there was only a single HSP (so no tiling was necessary)
54             tile_hsps() returns a list of two non-zero integers.
55             If there were multiple HSP,
56             tile_hsps() returns a list of two array references containin HSP contig data.
57             The first array ref contains a list of HSP contigs on the query sequence.
58             The second array ref contains a list of HSP contigs on the subject sequence.
59             Each contig is a hash reference with the following data fields:
60             'start' => start coordinate of the contig
61             'stop' => start coordinate of the contig
62             'iden' => number of identical residues in the contig
63             'cons' => number of conserved residues in the contig
64             'strand'=> strand of the contig
65             'frame' => frame of the contig
66             Argument : A Bio::Search::Hit::HitI object
67             Throws : n/a
68             Comments :
69             : This method performs more careful summing of data across
70             : all HSPs in the Sbjct object. Only HSPs that are in the same strand
71             : and frame are tiled. Simply summing the data from all HSPs
72             : in the same strand and frame will overestimate the actual
73             : length of the alignment if there is overlap between different HSPs
74             : (often the case).
75             :
76             : The strategy is to tile the HSPs and sum over the
77             : contigs, collecting data separately from overlapping and
78             : non-overlapping regions of each HSP. To facilitate this, the
79             : HSP.pm object now permits extraction of data from sub-sections
80             : of an HSP.
81             :
82             : Additional useful information is collected from the results
83             : of the tiling. It is possible that sub-sequences in
84             : different HSPs will overlap significantly. In this case, it
85             : is impossible to create a single unambiguous alignment by
86             : concatenating the HSPs. The ambiguity may indicate the
87             : presence of multiple, similar domains in one or both of the
88             : aligned sequences. This ambiguity is recorded using the
89             : ambiguous_aln() method.
90             :
91             : This method does not attempt to discern biologically
92             : significant vs. insignificant overlaps. The allowable amount of
93             : overlap can be set with the overlap() method or with the -OVERLAP
94             : parameter used when constructing the Hit object.
95             :
96             : For a given hit, both the query and the sbjct sequences are
97             : tiled independently.
98             :
99             : -- If only query sequence HSPs overlap,
100             : this may suggest multiple domains in the sbjct.
101             : -- If only sbjct sequence HSPs overlap,
102             : this may suggest multiple domains in the query.
103             : -- If both query & sbjct sequence HSPs overlap,
104             : this suggests multiple domains in both.
105             : -- If neither query & sbjct sequence HSPs overlap,
106             : this suggests either no multiple domains in either
107             : sequence OR that both sequences have the same
108             : distribution of multiple similar domains.
109             :
110             : This method can deal with the special case of when multiple
111             : HSPs exactly overlap.
112             :
113             : Efficiency concerns:
114             : Speed will be an issue for sequences with numerous HSPs.
115             :
116             Bugs : Currently, tile_hsps() does not properly account for
117             : the number of non-tiled but overlapping HSPs, which becomes a problem
118             : as overlap() grows. Large values overlap() may thus lead to
119             : incorrect statistics for some hits. For best results, keep overlap()
120             : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and
121             : Ambiguous Alignments" section in L.
122              
123             See Also : L<_adjust_contigs>(), L
124              
125             =cut
126              
127             #--------------
128             sub tile_hsps {
129             #--------------
130 48     48 1 94 my $sbjct = shift;
131              
132             #print STDERR "Calling tile_hsps(): $sbjct\n";
133             #$sbjct->verbose(1); # to activate debugging
134 48         125 $sbjct->tiled_hsps(1);
135              
136             # changed to not rely on n() (which is unreliable here) --cjfields 4/6/10
137 48 50       125 if( $sbjct->num_hsps == 0) {
    100          
138             #print STDERR "_tile_hsps(): no hsps, nothing to tile! (", $sbjct->num_hsps, ")\n";
139 0         0 _warn_about_no_hsps($sbjct);
140 0         0 return (undef, undef);
141              
142             } elsif($sbjct->num_hsps == 1) {
143             ## Simple summation scheme. Valid if there is only one HSP.
144             #print STDERR "_tile_hsps(): single HSP, easy stats.\n";
145 38         99 my $hsp = $sbjct->hsp;
146 38         118 $sbjct->length_aln('query', $hsp->length('query'));
147 38         88 $sbjct->length_aln('hit', $hsp->length('sbjct'));
148 38         98 $sbjct->length_aln('total', $hsp->length('total'));
149 38         170 $sbjct->matches( $hsp->matches() );
150 38         123 $sbjct->gaps('query', $hsp->gaps('query'));
151 38         85 $sbjct->gaps('sbjct', $hsp->gaps('sbjct'));
152              
153 38         85 _adjust_length_aln($sbjct);
154 38         61 return (1, 1);
155             } else {
156             #print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n";
157 10         37 $sbjct->length_aln('query', 0);
158 10         21 $sbjct->length_aln('sbjct', 0);
159 10         24 $sbjct->length_aln('total', 0);
160 10         31 $sbjct->matches( 0, 0);
161 10         28 $sbjct->gaps('query', 0);
162 10         22 $sbjct->gaps('hit', 0);
163             }
164              
165             ## More than one HSP. Must tile HSPs.
166             # print "\nTiling HSPs for $sbjct\n";
167 10         18 my($hsp, $qstart, $qstop, $sstart, $sstop);
168 0         0 my($frame, $strand, $qstrand, $sstrand);
169 0         0 my(@qcontigs, @scontigs);
170 10         13 my $qoverlap = 0;
171 10         11 my $soverlap = 0;
172 10         46 my $max_overlap = $sbjct->overlap;
173 10         14 my $hit_qgaps = 0;
174 10         11 my $hit_sgaps = 0;
175 10         12 my $hit_len_aln = 0;
176 10         11 my %start_stop;
177 10         48 my $v = $sbjct->verbose;
178 10         35 foreach $hsp ( $sbjct->hsps() ) {
179             #$sbjct->debug( sprintf(" HSP: %s %d..%d\n",$hsp->query->seq_id, $hsp->query->start, $hsp->hit->end)) if $v > 0; #$hsp->str('query');
180             # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'),
181             # $hsp->length(-TYPE=>'cons'),
182             # $hsp->length(-TYPE=>'cons',
183             # -START=>0,-STOP=>10);
184              
185 53         141 ($qstart, $qstop) = $hsp->range('query');
186 53         113 ($sstart, $sstop) = $hsp->range('sbjct');
187 53         132 $frame = $hsp->frame('hit');
188 53 50       100 $frame = -1 unless defined $frame;
189            
190 53         109 ($qstrand, $sstrand) = ($hsp->query->strand,
191             $hsp->hit->strand);
192              
193             # Note: No correction for overlap.
194            
195 53         133 my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit'));
196 53         61 $hit_qgaps += $qgaps;
197 53         52 $hit_sgaps += $sgaps;
198 53         106 $hit_len_aln += $hsp->length;
199              
200             ## Collect contigs in the query sequence.
201 53         144 $qoverlap += &_adjust_contigs('query', $hsp, $qstart, $qstop,
202             \@qcontigs, $max_overlap, $frame,
203             $qstrand);
204              
205             ## Collect contigs in the sbjct sequence
206             # (needed for domain data and gapped Blast).
207 53         97 $soverlap += &_adjust_contigs('sbjct', $hsp, $sstart, $sstop,
208             \@scontigs, $max_overlap, $frame,
209             $sstrand);
210              
211             ## Collect overall start and stop data for query and
212             # sbjct over all HSPs.
213 53 100       87 unless ( defined $start_stop{'qstart'} ) {
214 10         47 $start_stop{'qstart'} = $qstart;
215 10         19 $start_stop{'qstop'} = $qstop;
216 10         14 $start_stop{'sstart'} = $sstart;
217 10         24 $start_stop{'sstop'} = $sstop;
218             } else {
219             $start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ?
220 43 100       82 $qstart : $start_stop{'qstart'} );
221             $start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ?
222 43 100       69 $qstop : $start_stop{'qstop'} );
223             $start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ?
224 43 100       74 $sstart : $start_stop{'sstart'} );
225             $start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ?
226 43 100       92 $sstop : $start_stop{'sstop'} );
227             }
228             }
229              
230             # Store the collected data in the Hit object
231 10         41 $sbjct->gaps('query', $hit_qgaps);
232 10         23 $sbjct->gaps('hit', $hit_sgaps);
233 10         25 $sbjct->length_aln('total', $hit_len_aln);
234            
235 10         47 $sbjct->start('query',$start_stop{'qstart'});
236 10         38 $sbjct->end('query', $start_stop{'qstop'});
237 10         23 $sbjct->start('hit', $start_stop{'sstart'});
238 10         25 $sbjct->end('hit', $start_stop{'sstop'});
239             ## Collect data across the collected contigs.
240              
241             #$sbjct->debug( "\nQUERY CONTIGS:\n"." gaps = $sbjct->{'_gaps_query'}\n");
242              
243             # Account for strand/frame.
244             # Strategy: collect data on a per strand+frame basis and
245             # save the most significant one.
246 10         15 my (%qctg_dat);
247 10         19 foreach (@qcontigs) {
248 21         40 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
249            
250 21 50       41 if( $v > 0 ) {
251             #$sbjct->debug(sprintf( "$frame/$strand len is getting %d for %d..%d\n",
252             # ($_->{'stop'} - $_->{'start'} + 1), $_->{'start'}, $_->{'stop'}));
253             }
254            
255 21         56 $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1;
256 21         39 $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'};
257 21         35 $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'};
258 21         39 $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand;
259             }
260              
261             # Find longest contig.
262 10         43 my @sortedkeys = sort { $qctg_dat{$b}->{'length_aln_query'}
263 1         4 <=> $qctg_dat{$a}->{'length_aln_query'} }
264             keys %qctg_dat;
265              
266             # Save the largest to the sbjct:
267 10         17 my $longest = $sortedkeys[0];
268             #$sbjct->debug( "longest is ". $qctg_dat{ $longest }->{'length_aln_query'}. "\n");
269 10         29 $sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'});
270             $sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'},
271 10         29 $qctg_dat{ $longest }->{'totalConserved'});
272 10         37 $sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'});
273              
274             ## Collect data for sbjct contigs. Important for gapped Blast.
275             ## The totalIdentical and totalConserved numbers will be the same
276             ## as determined for the query contigs.
277              
278             #$sbjct->debug( "\nSBJCT CONTIGS:\n"." gaps = ". $sbjct->gaps('sbjct'). "\n");
279 10         10 my (%sctg_dat);
280 10         21 foreach(@scontigs) {
281             #$sbjct->debug(" sbjct contig: $_->{'start'} - $_->{'stop'}\n".
282             # " iden = $_->{'iden'}; cons = $_->{'cons'}\n");
283 35         68 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
284 35         62 $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1;
285 35         39 $sctg_dat{ "$frame$strand" }->{'frame'} = $frame;
286 35         42 $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand;
287             }
288            
289 10         34 @sortedkeys = sort { $sctg_dat{ $b }->{'length_aln_sbjct'}
290 2         9 <=> $sctg_dat{ $a }->{'length_aln_sbjct'}
291             } keys %sctg_dat;
292              
293             # Save the largest to the sbjct:
294 10         15 $longest = $sortedkeys[0];
295              
296 10         27 $sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'});
297 10         36 $sbjct->frame( $sctg_dat{ $longest }->{'frame'} );
298 10         23 $sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'});
299              
300 10 100       26 if($qoverlap) {
    100          
301 7 100       12 if($soverlap) { $sbjct->ambiguous_aln('qs');
  5         19  
302             #$sbjct->debug("\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n");
303             }
304 2         7 else { $sbjct->ambiguous_aln('q');
305             #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Query\n\n");
306             }
307             } elsif($soverlap) {
308 2         6 $sbjct->ambiguous_aln('s');
309             #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n");
310             }
311              
312 10         23 _adjust_length_aln($sbjct);
313              
314 10         76 return ( [@qcontigs], [@scontigs] );
315             }
316              
317              
318              
319             # Title : _adjust_length_aln
320             # Usage : n/a; internal use only; called by tile_hsps.
321             # Purpose : Adjust length of aligment based on BLAST flavor.
322             # Comments : See comments in logica_length()
323             sub _adjust_length_aln {
324 48     48   62 my $sbjct = shift;
325 48         111 my $algo = $sbjct->algorithm;
326 48         95 my $hlen = $sbjct->length_aln('sbjct');
327 48         91 my $qlen = $sbjct->length_aln('query');
328              
329 48         166 $sbjct->length_aln('sbjct', logical_length($algo, 'sbjct', $hlen));
330 48         90 $sbjct->length_aln('query', logical_length($algo, 'query', $qlen));
331             }
332              
333             =head2 logical_length
334              
335             Usage : logical_length( $alg_name, $seq_type, $length );
336             Purpose : Determine the logical length of an aligned sequence based on
337             : algorithm name and sequence type.
338             Returns : integer representing the logical aligned length.
339             Argument : $alg_name = name of algorigthm (e.g., blastx, tblastn)
340             : $seq_type = type of sequence (e.g., query or hit)
341             : $length = physical length of the sequence in the alignment.
342             Throws : n/a
343             Comments : This function is used to account for the fact that number of identities
344             and conserved residues is reported in peptide space while the query
345             length (in the case of BLASTX and TBLASTX) and/or the hit length
346             (in the case of TBLASTN and TBLASTX) are in nucleotide space.
347             The adjustment affects the values reported by the various frac_XXX
348             methods in GenericHit and GenericHSP.
349              
350             =cut
351              
352             sub logical_length {
353 128     128 1 131 my ($algo, $type, $len) = @_;
354 128         100 my $logical = $len;
355 128 50       616 if($algo =~ /^(?:PSI)?T(?:BLASTN|FAST(?:X|Y|XY))/oi ) {
    100          
    100          
356 0 0       0 $logical = $len/3 if $type =~ /sbjct|hit|tot/i;
357             } elsif($algo =~ /^(?:BLASTX|FAST(?:X|Y|XY))/oi ) {
358 4 100       17 $logical = $len/3 if $type =~ /query|tot/i;
359             } elsif($algo =~ /^TBLASTX/oi ) {
360 40         40 $logical = $len/3;
361             }
362 128         253 return $logical;
363             }
364              
365              
366             #=head2 _adjust_contigs
367             #
368             # Usage : n/a; internal function called by tile_hsps
369             # Purpose : Builds HSP contigs for a given BLAST hit.
370             # : Utility method called by _tile_hsps()
371             # Returns :
372             # Argument :
373             # Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches()
374             # : for invalid sub-sequence ranges.
375             # Status : Experimental
376             # Comments : This method supports gapped alignments through a patch by maj
377             # : to B:S:HSP:HSPI::matches().
378             # : It does not keep track of the number of HSPs that
379             # : overlap within the amount specified by overlap().
380             # : This will lead to significant tracking errors for large
381             # : overlap values.
382             #
383             #See Also : L(), L
384             #
385             #=cut
386              
387             sub _adjust_contigs {
388 106     106   173 my ($seqType, $hsp, $start, $stop, $contigs_ref,
389             $max_overlap, $frame, $strand) = @_;
390 106         79 my $overlap = 0;
391 106         73 my ($numID, $numCons);
392            
393 106         152 foreach (@$contigs_ref) {
394             # Don't merge things unless they have matching strand/frame.
395 211 100 100     706 next unless ($_->{'frame'} == $frame && $_->{'strand'} == $strand);
396            
397             # Test special case of a nested HSP. Skip it.
398 189 100 100     476 if ($start >= $_->{'start'} && $stop <= $_->{'stop'}) {
399 18         20 $overlap = 1;
400 18         28 next;
401             }
402            
403             # Test for overlap at beginning of contig, or precedes consecutively
404 171 100 100     392 if ($start < $_->{'start'} && $stop >= ($_->{'start'} + $max_overlap - 1)) {
405 12         14 eval {
406             ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType,
407             -START => $start,
408 12         66 -STOP => $_->{'start'} - 1);
409 12 50       27 if ($numID eq '') {
410 0         0 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
411 0         0 $numID = 0;
412             }
413 12 50       29 if ($numCons eq '') {
414 0         0 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
415 0         0 $numCons = 0;
416             }
417             };
418 12 50       16 if($@) { warn "\a\n$@\n"; }
  0         0  
419             else {
420 12         16 $_->{'start'} = $start; # Assign a new start coordinate to the contig
421 12         19 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
422 12         11 $_->{'cons'} += $numCons;
423 12         12 push(@{$_->{hsps}}, $hsp);
  12         23  
424 12         12 $overlap = 1;
425             }
426             }
427            
428             # Test for overlap at end of contig, or follows consecutively
429 171 100 100     422 if ($stop > $_->{'stop'} and $start <= ($_->{'stop'} - $max_overlap + 1)) {
430 17         540 eval {
431             ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType,
432 17         81 -START => $_->{'stop'} + 1,
433             -STOP => $stop);
434 17 50       45 if ($numID eq '') {
435 0         0 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
436 0         0 $numID = 0;
437             }
438 17 50       50 if ($numCons eq '') {
439 0         0 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
440 0         0 $numCons = 0;
441             }
442             };
443 17 50       28 if($@) { warn "\a\n$@\n"; }
  0         0  
444             else {
445 17         27 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig
446 17         24 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
447 17         20 $_->{'cons'} += $numCons;
448 17         14 push(@{$_->{hsps}}, $hsp);
  17         39  
449 17         22 $overlap = 1;
450             }
451             }
452            
453 171 100       255 last if $overlap;
454             }
455            
456 106 100 100     314 if ($overlap && @$contigs_ref > 1) {
    100          
457             ## Merge any contigs that now overlap
458 20         15 my $max = $#{$contigs_ref};
  20         29  
459 20         51 for my $i (0..$max) {
460 50 100       41 ${$contigs_ref}[$i] || next;
  50         92  
461 44         33 my ($i_start, $i_stop) = (${$contigs_ref}[$i]->{start}, ${$contigs_ref}[$i]->{stop});
  44         42  
  44         54  
462            
463 44         68 for my $u ($i+1..$max) {
464 55 100       42 ${$contigs_ref}[$u] || next;
  55         81  
465 52         40 my ($u_start, $u_stop) = (${$contigs_ref}[$u]->{start}, ${$contigs_ref}[$u]->{stop});
  52         48  
  52         47  
466            
467 52 100 100     326 if ($u_start < $i_start && $u_stop >= ($i_start + $max_overlap - 1)) {
    100 100        
    100 100        
468             # find the hsps within the contig that have sequence
469             # extending before $i_start
470 2         2 my ($ids, $cons) = (0, 0);
471 2         3 my $use_start = $i_start;
472 2         3 foreach my $hsp (sort { $b->end($seqType) <=> $a->end($seqType) } @{${$contigs_ref}[$u]->{hsps}}) {
  2         5  
  2         2  
  2         21  
473 4         12 my $hsp_start = $hsp->start($seqType);
474 4 50       78 $hsp_start < $use_start || next;
475            
476 4         6 my ($these_ids, $these_cons);
477 4         4 eval {
478 4         18 ($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $hsp_start, -STOP => $use_start - 1);
479 4 50       10 if ($these_ids eq '') {
480 0         0 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
481 0         0 $these_ids = 0;
482             }
483 4 50       8 if ($these_cons eq '') {
484 0         0 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
485 0         0 $these_cons = 0;
486             }
487             };
488 4 50       7 if($@) { warn "\a\n$@\n"; }
  0         0  
489             else {
490 4         4 $ids += $these_ids;
491 4         2 $cons += $these_cons;
492             }
493            
494 4 100       10 last if $hsp_start == $u_start;
495 2         7 $use_start = $hsp_start;
496             }
497 2         3 ${$contigs_ref}[$i]->{start} = $u_start;
  2         5  
498 2         2 ${$contigs_ref}[$i]->{'iden'} += $ids;
  2         3  
499 2         2 ${$contigs_ref}[$i]->{'cons'} += $cons;
  2         3  
500 2         2 push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}});
  2         1  
  2         4  
  2         1  
  2         6  
501            
502 2         2 ${$contigs_ref}[$u] = undef;
  2         7  
503             }
504             elsif ($u_stop > $i_stop && $u_start <= ($i_stop - $max_overlap + 1)) {
505             # find the hsps within the contig that have sequence
506             # extending beyond $i_stop
507 3         7 my ($ids, $cons) = (0, 0);
508 3         4 my $use_stop = $i_stop;
509 3         5 foreach my $hsp (sort { $a->start($seqType) <=> $b->start($seqType) } @{${$contigs_ref}[$u]->{hsps}}) {
  1         4  
  3         4  
  3         16  
510 4         15 my $hsp_end = $hsp->end($seqType);
511 4 50       12 $hsp_end > $use_stop || next;
512            
513 4         3 my ($these_ids, $these_cons);
514 4         5 eval {
515 4         19 ($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $use_stop + 1, -STOP => $hsp_end);
516 4 50       10 if ($these_ids eq '') {
517 0         0 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
518 0         0 $these_ids = 0;
519             }
520 4 50       11 if ($these_cons eq '') {
521 0         0 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
522 0         0 $these_cons = 0;
523             }
524             };
525 4 50       7 if($@) { warn "\a\n$@\n"; }
  0         0  
526             else {
527 4         5 $ids += $these_ids;
528 4         5 $cons += $these_cons;
529             }
530            
531 4 100       10 last if $hsp_end == $u_stop;
532 1         1 $use_stop = $hsp_end;
533             }
534 3         4 ${$contigs_ref}[$i]->{'stop'} = $u_stop;
  3         6  
535 3         3 ${$contigs_ref}[$i]->{'iden'} += $ids;
  3         7  
536 3         4 ${$contigs_ref}[$i]->{'cons'} += $cons;
  3         3  
537 3         3 push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}});
  3         4  
  3         6  
  3         4  
  3         5  
538            
539 3         4 ${$contigs_ref}[$u] = undef;
  3         13  
540             }
541             elsif ($u_start >= $i_start && $u_stop <= $i_stop) {
542             # nested, drop this contig
543             #*** ideally we might do some magic to keep the stats of the
544             # better hsp...
545 1         2 ${$contigs_ref}[$u] = undef;
  1         4  
546             }
547             }
548             }
549            
550 20         19 my @merged;
551 20         26 foreach (@$contigs_ref) {
552 50   100     84 push(@merged, $_ || next);
553             }
554 20         19 @{$contigs_ref} = @merged;
  20         37  
555             }
556             elsif (! $overlap) {
557             ## If there is no overlap, add the complete HSP data.
558 62         200 ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType);
559 62 50       118 if ($numID eq '') {
560 0         0 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
561 0         0 $numID = 0;
562             }
563 62 50       87 if ($numCons eq '') {
564 0         0 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
565 0         0 $numCons = 0;
566             }
567              
568 62         282 push @$contigs_ref, {'start' =>$start, 'stop' =>$stop,
569             'iden' =>$numID, 'cons' =>$numCons,
570             'strand'=>$strand,'frame'=>$frame,'hsps'=>[$hsp]};
571             }
572            
573 106         153 return $overlap;
574             }
575              
576             =head2 get_exponent
577              
578             Usage : &get_exponent( number );
579             Purpose : Determines the power of 10 exponent of an integer, float,
580             : or scientific notation number.
581             Example : &get_exponent("4.0e-206");
582             : &get_exponent("0.00032");
583             : &get_exponent("10.");
584             : &get_exponent("1000.0");
585             : &get_exponent("e+83");
586             Argument : Float, Integer, or scientific notation number
587             Returns : Integer representing the exponent part of the number (+ or -).
588             : If argument == 0 (zero), return value is "-999".
589             Comments : Exponents are rounded up (less negative) if the mantissa is >= 5.
590             : Exponents are rounded down (more negative) if the mantissa is <= -5.
591              
592             =cut
593              
594             sub get_exponent {
595 0     0 1 0 my $data = shift;
596              
597 0         0 my($num, $exp) = split /[eE]/, $data;
598              
599 0 0       0 if( defined $exp) {
    0          
    0          
600 0 0       0 $num = 1 if not $num;
601 0 0       0 $num >= 5 and $exp++;
602 0 0       0 $num <= -5 and $exp--;
603             } elsif( $num == 0) {
604 0         0 $exp = -999;
605             } elsif( not $num =~ /\./) {
606 0         0 $exp = CORE::length($num) -1;
607             } else {
608 0         0 $exp = 0;
609 0 0       0 $num .= '0' if $num =~ /\.$/;
610 0         0 my ($c);
611 0         0 my $rev = 0;
612 0 0       0 if($num !~ /^0/) {
613 0         0 $num = reverse($num);
614 0         0 $rev = 1;
615             }
616 0         0 do { $c = chop($num);
  0         0  
617 0 0       0 $c == 0 && $exp++;
618             } while( $c ne '.');
619              
620 0 0 0     0 $exp = -$exp if $num == 0 and not $rev;
621 0 0       0 $exp -= 1 if $rev;
622             }
623 0         0 return $exp;
624             }
625              
626             =head2 collapse_nums
627              
628             Usage : @cnums = collapse_nums( @numbers );
629             Purpose : Collapses a list of numbers into a set of ranges of consecutive terms:
630             : Useful for condensing long lists of consecutive numbers.
631             : EXPANDED:
632             : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32
633             : COLLAPSED:
634             : 1-6 10 12-15 17 18 20-22 24 26 30-32
635             Argument : List of numbers sorted numerically.
636             Returns : List of numbers mixed with ranges of numbers (see above).
637             Throws : n/a
638              
639             See Also : L
640              
641             =cut
642              
643             sub collapse_nums {
644             # This is probably not the slickest connectivity algorithm, but will do for now.
645 50     50 1 434 my @a = @_;
646 50         50 my ($from, $to, $i, @ca, $consec);
647            
648 50         62 $consec = 0;
649 50         122 for($i=0; $i < @a; $i++) {
650 6512 100       6421 not $from and do{ $from = $a[$i]; next; };
  47         59  
  47         94  
651             # pass repeated positions (gap inserts)
652 6465 100       7400 next if $a[$i] == $a[$i-1];
653 6290 100       5704 if($a[$i] == $a[$i-1]+1) {
654 4436         2694 $to = $a[$i];
655 4436         5385 $consec++;
656             } else {
657 1854 100       1573 if($consec == 1) { $from .= ",$to"; }
  290         275  
658 1564 100       1785 else { $from .= $consec>1 ? "\-$to" : ""; }
659 1854         1782 push @ca, split(',', $from);
660 1854         1304 $from = $a[$i];
661 1854         1063 $consec = 0;
662 1854         2373 $to = undef;
663             }
664             }
665 50 100       81 if(defined $to) {
666 23 100       38 if($consec == 1) { $from .= ",$to"; }
  3         7  
667 20 50       42 else { $from .= $consec>1 ? "\-$to" : ""; }
668             }
669 50 100       100 push @ca, split(',', $from) if $from;
670              
671 50         929 @ca;
672             }
673              
674              
675             =head2 strip_blast_html
676              
677             Usage : $boolean = &strip_blast_html( string_ref );
678             : This method is exported.
679             Purpose : Removes HTML formatting from a supplied string.
680             : Attempts to restore the Blast report to enable
681             : parsing by Bio::SearchIO::blast.pm
682             Returns : Boolean: true if string was stripped, false if not.
683             Argument : string_ref = reference to a string containing the whole Blast
684             : report containing HTML formatting.
685             Throws : Croaks if the argument is not a scalar reference.
686             Comments : Based on code originally written by Alex Dong Li
687             : (ali@genet.sickkids.on.ca).
688             : This method does some Blast-specific stripping
689             : (adds back a '>' character in front of each HSP
690             : alignment listing).
691             :
692             : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES!
693             :
694             : Removal of the HTML tags and accurate reconstitution of the
695             : non-HTML-formatted report is highly dependent on structure of
696             : the HTML-formatted version. For example, it assumes that first
697             : line of each alignment section (HSP listing) starts with a
698             : anchor tag. This permits the reconstruction of the
699             : original report in which these lines begin with a ">".
700             : This is required for parsing.
701             :
702             : If the structure of the Blast report itself is not intended to
703             : be a standard, the structure of the HTML-formatted version
704             : is even less so. Therefore, the use of this method to
705             : reconstitute parsable Blast reports from HTML-format versions
706             : should be considered a temorary solution.
707              
708             =cut
709              
710             sub strip_blast_html {
711             # This may not best way to remove html tags. However, it is simple.
712             # it won't work under following conditions:
713             # 1) if quoted > appears in a tag (does this ever happen?)
714             # 2) if a tag is split over multiple lines and this method is
715             # used to process one line at a time.
716            
717 0     0 1 0 my ($string_ref) = shift;
718              
719 0 0       0 ref $string_ref eq 'SCALAR' or
720             croak ("Can't strip HTML: ".
721 0         0 "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n");
722              
723 0         0 my $str = $$string_ref;
724 0         0 my $stripped = 0;
725              
726             # Removing "" and adding the '>' character for
727             # HSP alignment listings.
728 0 0       0 $str =~ s/(\A|\n)]+> ?/>/sgi and $stripped = 1;
729              
730             # Removing all "<>" tags.
731 0 0       0 $str =~ s/<[^>]+>| //sgi and $stripped = 1;
732              
733             # Re-uniting any lone '>' characters.
734 0 0       0 $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;
735              
736 0         0 $$string_ref = $str;
737 0         0 $stripped;
738             }
739              
740             =head2 result2hash
741              
742             Title : result2hash
743             Usage : my %data = &Bio::Search::SearchUtils($result)
744             Function : converts ResultI data to simple hash
745             Returns : hash
746             Args : ResultI
747             Note : used mainly as a utility for running SearchIO tests
748              
749             =cut
750              
751             sub result2hash {
752 4     4 1 766 my ($result) = @_;
753 4         6 my %hash;
754 4         12 $hash{'query_name'} = $result->query_name;
755 4         7 my $hitcount = 1;
756 4         5 my $hspcount = 1;
757 4         19 foreach my $hit ( $result->hits ) {
758 8         17 $hash{"hit$hitcount\_name"} = $hit->name;
759             # only going to test order of magnitude
760             # too hard as these don't always match
761             # $hash{"hit$hitcount\_signif"} =
762             # ( sprintf("%.0e",$hit->significance) =~ /e\-?(\d+)/ );
763 8         20 $hash{"hit$hitcount\_bits"} = sprintf("%d",$hit->bits);
764              
765 8         21 foreach my $hsp ( $hit->hsps ) {
766 22         41 $hash{"hsp$hspcount\_bits"} = sprintf("%d",$hsp->bits);
767             # only going to test order of magnitude
768             # too hard as these don't always match
769             # $hash{"hsp$hspcount\_evalue"} =
770             # ( sprintf("%.0e",$hsp->evalue) =~ /e\-?(\d+)/ );
771 22         42 $hash{"hsp$hspcount\_qs"} = $hsp->query->start;
772 22         40 $hash{"hsp$hspcount\_qe"} = $hsp->query->end;
773 22         33 $hash{"hsp$hspcount\_qstr"} = $hsp->query->strand;
774 22         45 $hash{"hsp$hspcount\_hs"} = $hsp->hit->start;
775 22         43 $hash{"hsp$hspcount\_he"} = $hsp->hit->end;
776 22         40 $hash{"hsp$hspcount\_hstr"} = $hsp->hit->strand;
777              
778             #$hash{"hsp$hspcount\_pid"} = sprintf("%d",$hsp->percent_identity);
779             #$hash{"hsp$hspcount\_fid"} = sprintf("%.2f",$hsp->frac_identical);
780 22         47 $hash{"hsp$hspcount\_gaps"} = $hsp->gaps('total');
781 22         35 $hspcount++;
782             }
783 8         10 $hitcount++;
784             }
785 4         123 return %hash;
786             }
787              
788             sub _warn_about_no_hsps {
789 0     0     my $hit = shift;
790 0           my $prev_func=(caller(1))[3];
791 0           $hit->warn("There is no HSP data for hit '".$hit->name."'.\n".
792             "You have called a method ($prev_func)\n".
793             "that requires HSP data and there was no HSP data for this hit,\n".
794             "most likely because it was absent from the BLAST report.\n".
795             "Note that by default, BLAST lists alignments for the first 250 hits,\n".
796             "but it lists descriptions for 500 hits. If this is the case,\n".
797             "and you care about these hits, you should re-run BLAST using the\n".
798             "-b option (or equivalent if not using blastall) to increase the number\n".
799             "of alignments.\n"
800             );
801             }
802              
803             1;