File Coverage

Bio/Search/HSP/GenericHSP.pm
Criterion Covered Total %
statement 508 564 90.0
branch 274 354 77.4
condition 127 178 71.3
subroutine 50 50 100.0
pod 35 35 100.0
total 994 1181 84.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Search::HSP::GenericHSP
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
17              
18             =head1 SYNOPSIS
19              
20             my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21             -evalue => '1e-30',
22             );
23              
24             $r_type = $hsp->algorithm;
25              
26             $pvalue = $hsp->p();
27              
28             $evalue = $hsp->evalue();
29              
30             $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
31              
32             $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
33              
34             $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
35              
36             $qseq = $hsp->query_string;
37              
38             $hseq = $hsp->hit_string;
39              
40             $homo_string = $hsp->homology_string;
41              
42             $len = $hsp->length( ['query'|'hit'|'total'] );
43              
44             $len = $hsp->length( ['query'|'hit'|'total'] );
45              
46             $rank = $hsp->rank;
47              
48             # TODO: Describe how to configure a SearchIO stream so that it generates
49             # GenericHSP objects.
50              
51             =head1 DESCRIPTION
52              
53             This implementation is "Generic", meaning it is is suitable for
54             holding information about High Scoring pairs from most Search reports
55             such as BLAST and FastA. Specialized objects can be derived from
56             this.
57              
58             Unless you're writing a parser, you won't ever need to create a
59             GenericHSP or any other HSPI-implementing object. If you use
60             the SearchIO system, HSPI objects are created automatically from
61             a SearchIO stream which returns Bio::Search::Result::ResultI objects
62             and you get the HSPI objects via the ResultI API.
63              
64             For documentation on what you can do with GenericHSP (and other HSPI
65             objects), please see the API documentation in
66             L.
67              
68             =head1 FEEDBACK
69              
70             =head2 Mailing Lists
71              
72             User feedback is an integral part of the evolution of this and other
73             Bioperl modules. Send your comments and suggestions preferably to
74             the Bioperl mailing list. Your participation is much appreciated.
75              
76             bioperl-l@bioperl.org - General discussion
77             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78              
79             =head2 Support
80              
81             Please direct usage questions or support issues to the mailing list:
82              
83             I
84              
85             rather than to the module maintainer directly. Many experienced and
86             reponsive experts will be able look at the problem and quickly
87             address it. Please include a thorough description of the problem
88             with code and data examples if at all possible.
89              
90             =head2 Reporting Bugs
91              
92             Report bugs to the Bioperl bug tracking system to help us keep track
93             of the bugs and their resolution. Bug reports can be submitted via the
94             web:
95              
96             https://github.com/bioperl/bioperl-live/issues
97              
98             =head1 AUTHOR - Jason Stajich and Steve Chervitz
99              
100             Email jason-at-bioperl.org
101             Email sac-at-bioperl.org
102              
103             =head1 CONTRIBUTORS
104              
105             Sendu Bala, bix@sendu.me.uk
106              
107             =head1 APPENDIX
108              
109             The rest of the documentation details each of the object methods.
110             Internal methods are usually preceded with a _
111              
112             =cut
113              
114             # Let the code begin...
115              
116             package Bio::Search::HSP::GenericHSP;
117 27     27   106 use strict;
  27         35  
  27         670  
118              
119 27     27   102 use Bio::Root::Root;
  27         29  
  27         554  
120 27     27   9601 use Bio::SeqFeature::Similarity;
  27         67  
  27         796  
121              
122 27     27   131 use base qw(Bio::Search::HSP::HSPI);
  27         30  
  27         13051  
123              
124             =head2 new
125              
126             Title : new
127             Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128             Function: Builds a new Bio::Search::HSP::GenericHSP object
129             Returns : Bio::Search::HSP::GenericHSP
130             Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
131             -evalue => evalue
132             -pvalue => pvalue
133             -bits => bit value for HSP
134             -score => score value for HSP (typically z-score but depends on
135             analysis)
136             -hsp_length=> Length of the HSP (including gaps)
137             -identical => # of residues that that matched identically
138             -percent_identity => (optional) percent identity
139             -conserved => # of residues that matched conservatively
140             (only protein comparisions;
141             conserved == identical in nucleotide comparisons)
142             -hsp_gaps => # of gaps in the HSP
143             -query_gaps => # of gaps in the query in the alignment
144             -hit_gaps => # of gaps in the subject in the alignment
145             -query_name => HSP Query sequence name (if available)
146             -query_start => HSP Query start (in original query sequence coords)
147             -query_end => HSP Query end (in original query sequence coords)
148             -query_length=> total length of the query sequence
149             -query_seq => query sequence portion of the HSP
150             -query_desc => textual description of the query
151             -hit_name => HSP Hit sequence name (if available)
152             -hit_start => HSP Hit start (in original hit sequence coords)
153             -hit_end => HSP Hit end (in original hit sequence coords)
154             -hit_length => total length of the hit sequence
155             -hit_seq => hit sequence portion of the HSP
156             -hit_desc => textual description of the hit
157             -homology_seq=> homology sequence for the HSP
158             -hit_frame => hit frame (only if hit is translated protein)
159             -query_frame => query frame (only if query is translated protein)
160             -rank => HSP rank
161             -links => HSP links information (WU-BLAST only)
162             -hsp_group => HSP Group informat (WU-BLAST only)
163             -gap_symbol => symbol representing a gap (default = '-')
164             -hit_features=> string of features found in or near HSP hit
165             region (reported in some BLAST text output,
166             v. 2.2.13 and up)
167             -stranded => If the algorithm isn't known (i.e. defaults to
168             'generic'), setting this will indicate start/end
169             coordinates are to be used to determine the strand
170             for 'query', 'subject', 'hit', 'both', or 'none'
171             (default = 'none')
172              
173             =cut
174              
175             sub new {
176 1331     1331 1 8295 my($class,%args) = @_;
177              
178             # don't pass anything to SUPER; complex hierarchy results in lots of work
179             # for nothing
180            
181 1331         3701 my $self = $class->SUPER::new();
182              
183             # for speed, don't use _rearrange and just store all input data directly
184             # with no method calls and no work done. work can be carried
185             # out just-in-time later if desired
186 1331         3799 while (my ($arg, $value) = each %args) {
187 26240         19962 $arg =~ tr/a-z\055/A-Z/d;
188 26240         54148 $self->{$arg} = $value;
189             }
190 1331         1519 my $bits = $self->{BITS};
191              
192 1331 100       3638 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
193 1331 50       2056 if (exists $self->{GAP_SYMBOL}) {
194             # not checking anything else but the length (must be 1 as only one gap
195             # symbol allowed currently); can add in support for symbol checks or
196             # multiple symbols later if needed
197             $self->throw("Gap symbol must be of length 1") if
198 0 0       0 CORE::length($self->{GAP_SYMBOL}) != 1;
199             } else {
200             # dafault
201 1331         2218 $self->{GAP_SYMBOL} = '-';
202             }
203 1331   100     2583 $self->{ALGORITHM} ||= 'GENERIC';
204 1331   100     3349 $self->{STRANDED} ||= 'NONE';
205            
206 1331 50 33     3822 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
207 0         0 $self->throw("Must define hit and query length");
208             }
209              
210 1331         1548 $self->{'_sequenceschanged'} = 1;
211            
212 1331         1338 $self->{_finished_new} = 1;
213 1331         7045 return $self;
214             }
215              
216             sub _logical_length {
217 313     313   329 my ($self, $type) = @_;
218 313 100 66     803 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
219 105         246 $self->_calculate_seq_offsets();
220             }
221 313 100       1038 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
222            
223 313         541 my $offset = $self->{"_${key}_offset"};
224 313         424 return $self->length($type) / $offset ;
225             }
226              
227             =head2 L methods
228              
229             Implementation of L methods follow
230              
231             =head2 algorithm
232              
233             Title : algorithm
234             Usage : my $r_type = $hsp->algorithm
235             Function: Obtain the name of the algorithm used to obtain the HSP
236             Returns : string (e.g., BLASTP)
237             Args : [optional] scalar string to set value
238              
239             =cut
240              
241             sub algorithm{
242 7397     7397 1 8281 my ($self,$value) = @_;
243 7397         6946 my $previous = $self->{'ALGORITHM'};
244 7397 50 33     20223 if( defined $value || ! defined $previous ) {
245 0 0       0 $value = $previous = '' unless defined $value;
246 0         0 $self->{'ALGORITHM'} = $value;
247             }
248              
249 7397         10305 return $previous;
250             }
251              
252             =head2 pvalue
253              
254             Title : pvalue
255             Usage : my $pvalue = $hsp->pvalue();
256             Function: Returns the P-value for this HSP or undef
257             Returns : float or exponential (2e-10)
258             P-value is not defined with NCBI Blast2 reports.
259             Args : [optional] numeric to set value
260              
261             =cut
262              
263             sub pvalue {
264 63     63 1 67 my ($self,$value) = @_;
265 63         71 my $previous = $self->{'PVALUE'};
266 63 50       113 if( defined $value ) {
267 0         0 $self->{'PVALUE'} = $value;
268             }
269 63         137 return $previous;
270             }
271              
272             =head2 evalue
273              
274             Title : evalue
275             Usage : my $evalue = $hsp->evalue();
276             Function: Returns the e-value for this HSP
277             Returns : float or exponential (2e-10)
278             Args : [optional] numeric to set value
279              
280             =cut
281              
282             sub evalue {
283 170     170 1 226 my ($self,$value) = @_;
284 170         238 my $previous = $self->{'EVALUE'};
285 170 50       335 if( defined $value ) {
286 0         0 $self->{'EVALUE'} = $value;
287             }
288 170         719 return $previous;
289             }
290              
291             =head2 frac_identical
292              
293             Title : frac_identical
294             Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295             Function: Returns the fraction of identitical positions for this HSP
296             Returns : Float in range 0.0 -> 1.0
297             Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298             'hit' = num identical / length of hit seq (without gaps)
299             synonyms: 'sbjct', 'subject'
300             'total' = num identical / length of alignment (with gaps)
301             synonyms: 'hsp'
302             default = 'total'
303             arg 2: [optional] frac identical value to set for the type requested
304             Note : for translated sequences, this adjusts the length accordingly
305              
306             =cut
307              
308             sub frac_identical {
309 602     602 1 683 my ($self, $type,$value) = @_;
310              
311 602 100       981 unless ($self->{_did_prefrac}) {
312 105         232 $self->_pre_frac;
313             }
314              
315 602 50       1080 $type = lc $type if defined $type;
316 602 50 33     1862 $type = 'hit' if( defined $type &&
317             $type =~ /subject|sbjct/);
318 602 100 66     3585 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
319             $type !~ /query|hit|subject|sbjct|total/);
320 602         707 my $previous = $self->{'_frac_identical'}->{$type};
321 602 100 66     1343 if( defined $value || ! defined $previous ) {
322 313 50       425 $value = $previous = '' unless defined $value;
323 313 100 100     822 if( $type eq 'hit' || $type eq 'query' ) {
324 208         388 $self->$type()->frac_identical( $value);
325             }
326 313         473 $self->{'_frac_identical'}->{$type} = $value;
327             }
328 602         1923 return $previous;
329              
330             }
331              
332             =head2 frac_conserved
333              
334             Title : frac_conserved
335             Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336             Function : Returns the fraction of conserved positions for this HSP.
337             This is the fraction of symbols in the alignment with a
338             positive score.
339             Returns : Float in range 0.0 -> 1.0
340             Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341             'hit' = num conserved / length of hit seq (without gaps)
342             synonyms: 'sbjct', 'subject'
343             'total' = num conserved / length of alignment (with gaps)
344             synonyms: 'hsp'
345             default = 'total'
346             arg 2: [optional] frac conserved value to set for the type requested
347              
348             =cut
349              
350             sub frac_conserved {
351 440     440 1 458 my ($self, $type,$value) = @_;
352              
353 440 50       679 unless ($self->{_did_prefrac}) {
354 0         0 $self->_pre_frac;
355             }
356              
357 440 100       787 $type = lc $type if defined $type;
358 440 50 66     1390 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 440 100 100     2790 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
360             $type !~ /query|hit|subject|sbjct|total/);
361 440         506 my $previous = $self->{'_frac_conserved'}->{$type};
362 440 100 66     909 if( defined $value || ! defined $previous ) {
363 313 50       410 $value = $previous = '' unless defined $value;
364 313         366 $self->{'_frac_conserved'}->{$type} = $value;
365             }
366 440         1176 return $previous;
367             }
368              
369             =head2 gaps
370              
371             Title : gaps
372             Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373             Function : Get the number of gap characters in the query, hit, or total alignment.
374             Returns : Integer, number of gaps or 0 if none
375             Args : arg 1: 'query' = num gap characters in query seq
376             'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377             'total' = num gap characters in whole alignment; synonyms: 'hsp'
378             default = 'total'
379             arg 2: [optional] integer gap value to set for the type requested
380              
381             =cut
382              
383             sub gaps {
384 3125     3125 1 2749 my ($self, $type, $value) = @_;
385              
386 3125 100       4237 unless ($self->{_did_pregaps}) {
387 378         722 $self->_pre_gaps;
388             }
389              
390 3125 100       4604 $type = lc $type if defined $type;
391 3125 50 66     13950 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
392             $type !~ /query|hit|subject|sbjct|total/);
393 3125 100       4423 $type = 'hit' if $type =~ /sbjct|subject/;
394 3125         3333 my $previous = $self->{'_gaps'}->{$type};
395 3125 100 100     6895 if( defined $value || ! defined $previous ) {
396 1110 100       1458 $value = $previous = '' unless defined $value;
397 1110         1600 $self->{'_gaps'}->{$type} = $value;
398             }
399 3125   100     8041 return $previous || 0;
400             }
401              
402             =head2 query_string
403              
404             Title : query_string
405             Usage : my $qseq = $hsp->query_string;
406             Function: Retrieves the query sequence of this HSP as a string
407             Returns : string
408             Args : [optional] string to set for query sequence
409              
410              
411             =cut
412              
413             sub query_string{
414 610     610 1 911 my ($self,$value) = @_;
415 610         601 my $previous = $self->{QUERY_SEQ};
416 610 100 66     1922 if( defined $value || ! defined $previous ) {
417 3 50       8 $value = $previous = '' unless defined $value;
418 3         5 $self->{QUERY_SEQ} = $value;
419             # do some housekeeping so we know when to
420             # re-run _calculate_seq_positions
421 3         6 $self->{'_sequenceschanged'} = 1;
422             }
423 610         1181 return $previous;
424             }
425              
426             =head2 hit_string
427              
428             Title : hit_string
429             Usage : my $hseq = $hsp->hit_string;
430             Function: Retrieves the hit sequence of this HSP as a string
431             Returns : string
432             Args : [optional] string to set for hit sequence
433              
434              
435             =cut
436              
437             sub hit_string{
438 618     618 1 2885 my ($self,$value) = @_;
439 618         678 my $previous = $self->{HIT_SEQ};
440 618 100 66     1971 if( defined $value || ! defined $previous ) {
441 1 50       5 $value = $previous = '' unless defined $value;
442 1         3 $self->{HIT_SEQ} = $value;
443             # do some housekeeping so we know when to
444             # re-run _calculate_seq_positions
445 1         2 $self->{'_sequenceschanged'} = 1;
446             }
447 618         1158 return $previous;
448             }
449              
450             =head2 homology_string
451              
452             Title : homology_string
453             Usage : my $homo_string = $hsp->homology_string;
454             Function: Retrieves the homology sequence for this HSP as a string.
455             : The homology sequence is the string of symbols in between the
456             : query and hit sequences in the alignment indicating the degree
457             : of conservation (e.g., identical, similar, not similar).
458             Returns : string
459             Args : [optional] string to set for homology sequence
460              
461             =cut
462              
463             sub homology_string{
464 6905     6905 1 5363 my ($self,$value) = @_;
465 6905         6741 my $previous = $self->{HOMOLOGY_SEQ};
466 6905 100 66     18400 if( defined $value || ! defined $previous ) {
467 3 50       8 $value = $previous = '' unless defined $value;
468 3         4 $self->{HOMOLOGY_SEQ} = $value;
469             # do some housekeeping so we know when to
470             # re-run _calculate_seq_positions
471 3         5 $self->{'_sequenceschanged'} = 1;
472             }
473 6905         18444 return $previous;
474             }
475              
476             =head2 consensus_string
477              
478             Title : consensus_string
479             Usage : my $cs_string = $hsp->consensus_string;
480             Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
481             : If the model had any consensus structure or reference line annotation
482             : that it inherited from a multiple alignment (#=GC SS cons,
483             : #=GC RF annotation in Stockholm files), that information is shown
484             : as CS or RF annotation line.
485             Returns : string
486             Args : [optional] string to set for consensus structure
487              
488             =cut
489              
490             sub consensus_string {
491 8     8 1 14 my ($self,$value) = @_;
492 8         17 my $previous = $self->{CS_SEQ};
493 8 50 33     50 if( defined $value || ! defined $previous ) {
494 0 0       0 $value = $previous = '' unless defined $value;
495 0         0 $self->{CS_SEQ} = $value;
496             # do some housekeeping so we know when to
497             # re-run _calculate_seq_positions
498 0         0 $self->{'_sequenceschanged'} = 1;
499             }
500 8         31 return $previous;
501             }
502              
503             =head2 posterior_string
504              
505             Title : posterior_string
506             Usage : my $pp_string = $hsp->posterior_string;
507             Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
508             : The posterior probability is the string of symbols at the bottom
509             : of the alignment indicating the expected accuracy of each aligned residue.
510             : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
511             : and a * means 95-100% posterior probability.
512             Returns : string
513             Args : [optional] string to set for posterior probability
514              
515             =cut
516              
517             sub posterior_string {
518 9     9 1 16 my ($self,$value) = @_;
519 9         42 my $previous = $self->{PP_SEQ};
520 9 100 66     63 if( defined $value || ! defined $previous ) {
521 2 50       6 $value = $previous = '' unless defined $value;
522 2         4 $self->{PP_SEQ} = $value;
523             # do some housekeeping so we know when to
524             # re-run _calculate_seq_positions
525 2         5 $self->{'_sequenceschanged'} = 1;
526             }
527 9         43 return $previous;
528             }
529              
530             =head2 length
531              
532             Title : length
533             Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
534             Function : Returns the length of the query or hit in the alignment
535             (without gaps)
536             or the aggregate length of the HSP (including gaps;
537             this may be greater than either hit or query )
538             Returns : integer
539             Args : arg 1: 'query' = length of query seq (without gaps)
540             'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
541             'total' = length of alignment (with gaps)
542             default = 'total'
543             arg 2: [optional] integer length value to set for specific type
544              
545             =cut
546              
547             sub length {
548              
549 3839     3839 1 3103 my $self = shift;
550 3839         2878 my $type = shift;
551              
552 3839 100       5332 $type = 'total' unless defined $type;
553 3839         3760 $type = lc $type;
554              
555 3839 100       9568 if( $type =~ /^q/i ) {
    100          
556 1615         2245 return $self->query()->length(shift);
557             } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
558 989         1441 return $self->hit()->length(shift);
559             } else {
560 1235         938 my $v = shift;
561 1235 100       1876 if( defined $v ) {
562 105         147 $self->{HSP_LENGTH} = $v;
563             }
564 1235         2739 return $self->{HSP_LENGTH};
565             }
566 0         0 return 0; # should never get here
567             }
568              
569             =head2 hsp_length
570              
571             Title : hsp_length
572             Usage : my $len = $hsp->hsp_length()
573             Function: shortcut length('hsp')
574             Returns : floating point between 0 and 100
575             Args : none
576              
577             =cut
578              
579 16     16 1 65 sub hsp_length { return shift->length('hsp', shift); }
580              
581             =head2 percent_identity
582              
583             Title : percent_identity
584             Usage : my $percentid = $hsp->percent_identity()
585             Function: Returns the calculated percent identity for an HSP
586             Returns : floating point between 0 and 100
587             Args : none
588              
589              
590             =cut
591              
592             sub percent_identity {
593 84     84 1 118 my $self = shift;
594              
595 84 100       204 unless ($self->{_did_prepi}) {
596 42         147 $self->_pre_pi;
597             }
598              
599 84         252 return $self->SUPER::percent_identity(@_);
600             }
601              
602             =head2 frame
603              
604             Title : frame
605             Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
606             Function: Set the Frame for both query and subject and insure that
607             they agree.
608             This overrides the frame() method implementation in
609             FeaturePair.
610             Returns : array of query and subject frame if return type wants an array
611             or query frame if defined or subject frame if not defined
612             Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
613             'query' to retrieve the query frame
614             'list' or 'array' to retrieve both query and hit frames together
615             Note : Frames are stored in the GFF way (0-2) not 1-3
616             as they are in BLAST (negative frames are deduced by checking
617             the strand of the query or hit)
618              
619             =cut
620              
621             # Note: changed 4/19/08 - bug 2485
622             #
623             # frame() is supposed to be a getter/setter (as is implied by the Function desc
624             # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
625             # the API is not consistent with the other HSP/SimilarityPair methods such as
626             # strand(), start(), end(), etc.
627             #
628             # In order to make this consistent with other methods all work is now done
629             # when the features are instantiated and not delayed. We compromise by
630             # defaulting back 'to hit' w/o passed args. Setting is now allowed.
631              
632             sub frame {
633 610     610 1 1475 my $self = shift;
634 610         474 my $val = shift;
635 610 50       837 if (!defined $val) {
636             # unfortunately, w/o args we need to warn about API changes
637 0         0 $self->warn("API for frame() has changed.\n".
638             "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
639             "returning query frame");
640 0         0 $val = 'query';
641             }
642 610         1083 $val =~ s/^\s+//;
643              
644 610 100       1383 if( $val =~ /^q/i ) {
    50          
    0          
    0          
645 284         369 return $self->query->frame(@_);
646             } elsif( $val =~ /^hi|^s/i ) {
647 326         459 return $self->hit->frame(@_);
648             } elsif ( $val =~ /^list|array/i ) {
649 0         0 return ($self->query->frame($_[0]),
650             $self->hit->frame($_[1]) );
651             } elsif ( $val =~ /^\d+$/) {
652             # old API i.e. frame($query_frame, $hit_frame). This catches all but one
653             # case, where no arg is passed (so the hit is wanted).
654 0         0 $self->warn("API for frame() has changed.\n".
655             "Please refer to documentation for Bio::Search::HSP::GenericHSP");
656             wantarray ?
657 0 0       0 return ($self->query->frame($val),
658             $self->hit->frame(@_) ) :
659             return $self->hit->frame($val,@_);
660             } else {
661 0         0 $self->warn("unrecognized component '$val' requested\n");
662             }
663 0         0 return 0;
664             }
665              
666             =head2 get_aln
667              
668             Title : get_aln
669             Usage : my $aln = $hsp->gel_aln
670             Function: Returns a L object representing the HSP alignment
671             Returns : L
672             Args : none
673              
674             =cut
675              
676             sub get_aln {
677 19     19 1 26 my ($self) = @_;
678 19         103 require Bio::LocatableSeq;
679 19         4876 require Bio::SimpleAlign;
680              
681 19         85 my $aln = Bio::SimpleAlign->new();
682 19         48 my $hs = $self->hit_string();
683 19         47 my $qs = $self->query_string();
684             # FASTA specific stuff moved to the FastaHSP object
685 19         23 my $seqonly = $qs;
686 19         110 $seqonly =~ s/[\-\s]//g;
687 19         54 my ($q_nm,$s_nm) = ($self->query->seq_id(),
688             $self->hit->seq_id());
689             # Should we silently change the name of the query or hit if it isn't
690             # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
691            
692             #unless( defined $q_nm && CORE::length ($q_nm) ) {
693             # $q_nm = 'query';
694             #}
695             #unless( defined $s_nm && CORE::length ($s_nm) ) {
696             # $s_nm = 'hit';
697             #}
698            
699             # mapping: 1 residues for every x coordinate positions
700             my $query = Bio::LocatableSeq->new(
701             -seq => $qs,
702             -id => $q_nm,
703             -start => $self->query->start,
704             -end => $self->query->end,
705             -strand => $self->query->strand,
706             -force_nse => $q_nm ? 0 : 1,
707 19 100       65 -mapping => [ 1, $self->{_query_mapping} ]
708             );
709 19         44 $seqonly = $hs;
710 19         133 $seqonly =~ s/[\-\s]//g;
711             my $hit = Bio::LocatableSeq->new(
712             -seq => $hs,
713             -id => $s_nm,
714             -start => $self->hit->start,
715             -end => $self->hit->end,
716             -strand => $self->hit->strand,
717             -force_nse => $s_nm ? 0 : 1,
718 19 50       62 -mapping => [ 1, $self->{_hit_mapping} ]
719             );
720 19         78 $aln->add_seq($query);
721 19         42 $aln->add_seq($hit);
722 19         54 return $aln;
723             }
724              
725             =head2 num_conserved
726              
727             Title : num_conserved
728             Usage : $obj->num_conserved($newval)
729             Function: returns the number of conserved residues in the alignment
730             Returns : integer
731             Args : integer (optional)
732              
733              
734             =cut
735              
736             sub num_conserved{
737 1914     1914 1 1452 my ($self,$value) = @_;
738              
739 1914 100       2480 unless ($self->{_did_presimilar}) {
740 1         4 $self->_pre_similar_stats;
741             }
742              
743 1914 50       2348 if (defined $value) {
744 0         0 $self->{CONSERVED} = $value;
745             }
746 1914         2815 return $self->{CONSERVED};
747             }
748              
749             =head2 num_identical
750              
751             Title : num_identical
752             Usage : $obj->num_identical($newval)
753             Function: returns the number of identical residues in the alignment
754             Returns : integer
755             Args : integer (optional)
756              
757              
758             =cut
759              
760             sub num_identical{
761 1923     1923 1 1634 my ($self,$value) = @_;
762              
763 1923 100       2718 unless ($self->{_did_presimilar}) {
764 437         789 $self->_pre_similar_stats;
765             }
766              
767 1923 50       2774 if( defined $value) {
768 0         0 $self->{IDENTICAL} = $value;
769             }
770 1923         3876 return $self->{IDENTICAL};
771             }
772              
773             =head2 rank
774              
775             Usage : $hsp->rank( [string] );
776             Purpose : Get the rank of the HSP within a given Blast hit.
777             Example : $rank = $hsp->rank;
778             Returns : Integer (1..n) corresponding to the order in which the HSP
779             appears in the BLAST report.
780              
781             =cut
782              
783             sub rank {
784 4514     4514 1 2875 my ($self,$value) = @_;
785 4514 50       4645 if( defined $value) {
786 0         0 $self->{RANK} = $value;
787             }
788 4514         8515 return $self->{RANK};
789             }
790              
791             =head2 seq_inds
792              
793             Title : seq_inds
794             Purpose : Get a list of residue positions (indices) for all identical
795             : or conserved residues in the query or sbjct sequence.
796             Example : @s_ind = $hsp->seq_inds('query', 'identical');
797             : @h_ind = $hsp->seq_inds('hit', 'conserved');
798             : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
799             : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
800             Returns : List of integers
801             : May include ranges if collapse is true.
802             Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
803             : ('sbjct' is synonymous with 'hit')
804             : class = 'identical' - identical positions
805             : 'conserved' - conserved positions
806             : 'nomatch' - mismatched residue or gap positions
807             : 'mismatch' - mismatched residue positions (no gaps)
808             : 'gap' - gap positions only
809             : 'frameshift'- frameshift positions only
810             : 'conserved-not-identical' - conserved positions w/o
811             : identical residues
812             : The name can be shortened to 'id' or 'cons' unless
813             : the name is . The default value is
814             : 'identical'
815             :
816             : collapse = boolean, if true, consecutive positions are merged
817             : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
818             : collapses to "1-5 7 9-11". This is useful for
819             : consolidating long lists. Default = no collapse.
820             :
821             Throws : n/a.
822             Comments : For HSPs partially or completely derived from translated sequences
823             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
824             : cannot easily be attributed to a single position (i.e. the
825             : positional data is ambiguous). In these cases all three codon
826             : positions are reported. Under these conditions you can check
827             : ambiguous_seq_inds() to determine whether the query, subject,
828             : or both are ambiguous.
829             :
830             See Also : L,
831             L
832              
833             =cut
834              
835             sub seq_inds{
836 194     194 1 302 my ($self, $seqType, $class, $collapse) = @_;
837              
838             # prepare the internal structures - this is cached so
839             # if the strings have not changed we're okay
840 194         373 $self->_calculate_seq_positions();
841              
842 194   50     295 $seqType ||= 'query';
843 194   50     257 $class ||= 'identical';
844 194   100     445 $collapse ||= 0;
845 194 100       344 $seqType = 'sbjct' if $seqType eq 'hit';
846 194         285 my $t = lc(substr($seqType,0,1));
847 194 100 33     413 if( $t eq 'q' ) {
    50          
848 100         124 $seqType = 'query';
849             } elsif ( $t eq 's' || $t eq 'h' ) {
850 94         123 $seqType = 'sbjct';
851             } else {
852 0         0 $self->warn("unknown seqtype $seqType using 'query'");
853 0         0 $seqType = 'query';
854             }
855            
856 194         207 $t = lc(substr($class,0,1));
857              
858 194 100       598 if( $t eq 'c' ) {
    50          
    100          
    100          
    100          
    50          
859 8 50       15 if( $class =~ /conserved\-not\-identical/ ) {
860 0         0 $class = 'conserved';
861             } else {
862 8         7 $class = 'conservedall';
863             }
864             } elsif( $t eq 'i' ) {
865 0         0 $class = 'identical';
866             } elsif( $t eq 'n' ) {
867 18         24 $class = 'nomatch';
868             } elsif( $t eq 'm' ) {
869 10         10 $class = 'mismatch';
870             } elsif( $t eq 'g' ) {
871 150         129 $class = 'gap';
872             } elsif( $t eq 'f' ) {
873 8         6 $class = 'frameshift';
874             } else {
875 0         0 $self->warn("unknown sequence class $class using 'identical'");
876 0         0 $class = 'identical';
877             }
878              
879             ## Sensitive to member name changes.
880 194         236 $seqType = "_\L$seqType\E";
881 194         177 $class = "_\L$class\E";
882 194         174 my @ary;
883              
884 194 100       315 if( $class eq '_gap' ) {
    100          
885             # this means that we are remapping the gap length that is stored
886             # in the hash (for example $self->{'_gapRes_query'} )
887             # so we'll return an array which has the values of the position of the
888             # of the gap (the key in the hash) + the gap length (value in the
889             # hash for this key - 1.
890            
891             # changing this; since the index is the position prior to the insertion,
892             # repeat the position based on the number of gaps inserted
893 521         325 @ary = map { my @tmp;
894             # position holds number of gaps inserted
895 521         899 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
896 1148         952 push @tmp, $_ ;
897             }
898 521         753 @tmp}
899 150         139 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  1911         1251  
  150         835  
900             } elsif( $class eq '_conservedall' ) {
901 13458         7410 @ary = sort { $a <=> $b }
902 8         90 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
903 8         7 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
  8         194  
904             } else {
905 36         30 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  28366         16019  
  36         772  
906             }
907 194 100       4348 require Bio::Search::BlastUtils if $collapse;
908              
909 194 100       569 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
910             }
911              
912             =head2 ambiguous_seq_inds
913              
914             Title : ambiguous_seq_inds
915             Purpose : returns a string denoting whether sequence indices for query,
916             : subject, or both are ambiguous
917             Returns : String; 'query', 'subject', 'query/subject', or empty string ''
918             Argument : none
919             Comments : For HSPs partially or completely derived from translated sequences
920             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
921             : cannot easily be attributed to a single position (i.e. the
922             : positional data is ambiguous). In these cases all three codon
923             : positions are reported when using seq_inds(). Under these
924             : conditions you can check ambiguous_seq_inds() to determine whether
925             : the query, subject, or both are ambiguous.
926             See Also : L
927              
928             =cut
929              
930             sub ambiguous_seq_inds {
931 6     6 1 9 my $self = shift;
932 6         14 $self->_calculate_seq_positions();
933             my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
934             'query/subject' :
935             ($self->{_query_offset} == 3) ? 'query' :
936 6 100 100     48 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
    100          
    100          
937 6         22 return $type;
938             }
939              
940             =head2 Inherited from L
941              
942             These methods come from L
943              
944             =head2 query
945              
946             Title : query
947             Usage : my $query = $hsp->query
948             Function: Returns a SeqFeature representing the query in the HSP
949             Returns : L
950             Args : [optional] new value to set
951              
952             =cut
953              
954             sub query {
955 5590     5590 1 6651 my $self = shift;
956 5590 100       8354 unless ($self->{_created_qff}) {
957 1167         1916 $self->_query_seq_feature;
958             }
959 5590         10581 return $self->SUPER::query(@_);
960             }
961              
962             sub feature1 {
963 6939     6939 1 5134 my $self = shift;
964 6939 100 66     16456 if (! $self->{_finished_new} || $self->{_making_qff}) {
965 1331 50       1907 return $self->{_sim1} if $self->{_sim1};
966 1331         3246 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
967 1331         3112 return $self->{_sim1};
968             }
969 5608 100       7219 unless ($self->{_created_qff}) {
970 9         30 $self->_query_seq_feature;
971             }
972 5608         8494 return $self->SUPER::feature1(@_);
973             }
974              
975             =head2 hit
976              
977             Title : hit
978             Usage : my $hit = $hsp->hit
979             Function: Returns a SeqFeature representing the hit in the HSP
980             Returns : L
981             Args : [optional] new value to set
982              
983             =cut
984              
985             sub hit {
986 4390     4390 1 11518 my $self = shift;
987 4390 100       6329 unless ($self->{_created_sff}) {
988 550         1011 $self->_subject_seq_feature;
989             }
990 4390         7831 return $self->SUPER::hit(@_);
991             }
992              
993             sub feature2 {
994 4408     4408 1 5278 my $self = shift;
995 4408 50 33     10865 if (! $self->{_finished_new} || $self->{_making_sff}) {
996 0 0       0 return $self->{_sim2} if $self->{_sim2};
997 0         0 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
998 0         0 return $self->{_sim2};
999             }
1000 4408 100       5456 unless ($self->{_created_sff}) {
1001 12         34 $self->_subject_seq_feature;
1002             }
1003 4408         6984 return $self->SUPER::feature2(@_);
1004             }
1005              
1006             =head2 significance
1007              
1008             Title : significance
1009             Usage : $evalue = $obj->significance();
1010             $obj->significance($evalue);
1011             Function: Get/Set the significance value
1012             Returns : numeric
1013             Args : [optional] new value to set
1014              
1015             =cut
1016              
1017             # Override significance to return the e-value or, if this is
1018             # not defined (WU-BLAST), return the p-value.
1019             sub significance {
1020 19     19 1 33 my ($self, $val) = @_;
1021 19 50 33     78 if (!defined $self->{SIGNIFICANCE} || defined $val) {
1022 19 50       81 $self->{SIGNIFICANCE} = defined $val ? $val :
    100          
    50          
1023             defined $self->evalue ? $self->evalue :
1024             defined $self->pvalue ? $$self->pvalue :
1025             undef;
1026 19         43 $self->query->significance($self->{SIGNIFICANCE});
1027             }
1028 19         73 return $self->{SIGNIFICANCE};
1029             }
1030              
1031             =head2 strand
1032              
1033             Title : strand
1034             Usage : $hsp->strand('query')
1035             Function: Retrieves the strand for the HSP component requested
1036             Returns : +1 or -1
1037             Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1038             'query' to retrieve the query strand (default)
1039              
1040             =cut
1041              
1042             sub strand {
1043 587     587 1 743 my ($self,$type) = @_;
1044              
1045 587 100 100     3933 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
    100 100        
1046 2         9 return $self->{'QUERY_STRAND'};
1047             } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1048 2         44 return $self->{'HIT_STRAND'};
1049             }
1050              
1051 583         1237 return $self->SUPER::strand($type)
1052             }
1053              
1054             =head2 score
1055              
1056             Title : score
1057             Usage : $score = $obj->score();
1058             $obj->score($value);
1059             Function: Get/Set the score value
1060             Returns : numeric
1061             Args : [optional] new value to set
1062              
1063             =head2 bits
1064              
1065             Title : bits
1066             Usage : $bits = $obj->bits();
1067             $obj->bits($value);
1068             Function: Get/Set the bits value
1069             Returns : numeric
1070             Args : [optional] new value to set
1071              
1072             =head1 Private methods
1073              
1074             =cut
1075              
1076             =head2 _calculate_seq_positions
1077              
1078             Title : _calculate_seq_positions
1079             Usage : $self->_calculate_seq_positions
1080             Function: Internal function
1081             Returns :
1082             Args :
1083              
1084              
1085             =cut
1086              
1087             sub _calculate_seq_positions {
1088 205     205   230 my ($self,@args) = @_;
1089 205 100       416 return unless ( $self->{'_sequenceschanged'} );
1090 73         88 $self->{'_sequenceschanged'} = 0;
1091 73         135 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1092             $self->query_string(),
1093             $self->hit_string() );
1094 73         136 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1095 73   100     134 my $qdir = $self->query->strand || 1;
1096 73   100     130 my $sdir = $self->hit->strand || 1;
1097 73 100       212 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1098             : ($self->query->start, $self->query->end);
1099 73 100       220 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1100             : ($self->hit->start, $self->hit->end);
1101            
1102 73         148 my $prog = $self->algorithm;
1103            
1104 73 100       323 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1105            
1106             # we infer the end of the regional sequence where the first and last
1107             # non spaces are in the homology string
1108             # then we use the HSP->length to tell us how far to read
1109             # to cut off the end of the sequence
1110            
1111 4         7 my ($start, $rest) = (0,0);
1112 4 50       92 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1113 4 100       16 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1114             }
1115            
1116 4         11 $seqString = substr($seqString, $start, $rest);
1117 4         5 $qseq = substr($qseq, $start, $rest);
1118 4         7 $sseq = substr($sseq, $start, $rest);
1119              
1120             # commented out 10/29/08
1121             # removing frameshift symbols doesn't take into account the following:
1122             # 1) does not remove the same point in the homology string (get
1123             # positional errors)
1124             # 2) adjustments to the overall position in the string due to the
1125             # frameshift must be taken into consideration (get balancing errors)
1126             #
1127             # Frameshifts will be handled directly in the main loop.
1128             # --chris
1129            
1130             # fasta reports some extra 'regional' sequence information
1131             # we need to clear out first
1132             # this seemed a bit insane to me at first, but it appears to
1133             # work --jason
1134            
1135             #$qseq =~ s![\\\/]!!g;
1136             #$sseq =~ s![\\\/]!!g;
1137             }
1138              
1139 73 100 66     291 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1140 2         6 $self->_calculate_seq_offsets();
1141             }
1142            
1143 73         94 my ($qfs, $sfs) = (0,0);
1144             CHAR_LOOP:
1145 73         193 for my $pos (0..CORE::length($seqString)-1) {
1146 30602         37843 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1147 30602         28222 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1148             # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1149 30602         22273 ($qfs, $sfs) = (0,0);
1150 30602 50 100     133599 my ($mchar, $qchar, $schar) = (
    50          
1151             unpack("x$pos A1",$seqString) || ' ',
1152             $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1153             $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1154             );
1155 30602         27090 my $matchtype = '';
1156 30602         19639 my ($qgap, $sgap) = (0,0);
1157 30602 100 100     122842 if( $mchar eq '+' || $mchar eq '.') { # conserved
    100 100        
    50          
1158 757         3534 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1159 757         3362 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1160 757         597 $matchtype = 'conserved';
1161             } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1162 25371         62710 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1163 25371         49196 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1164 25371         19424 $matchtype = 'identical';
1165             } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1166 4474 100       5740 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
    50          
1167             $qchar eq '\\' ? 1 : # base deleted to match frame
1168             0;
1169 4474 50       5363 $sfs = $schar eq '/' ? -1 :
    50          
1170             $schar eq '\\' ? 1 :
1171             0;
1172 4474 100       9412 if ($qfs) {
    50          
    100          
    100          
1173             # Frameshifts are tricky.
1174            
1175             # '/' indicates that the next residue is shifted back one
1176             # (-1) frame position and is a deletion of one base (so to
1177             # correctly align, a base is inserted). That residue should only
1178             # occupy two sequence positions instead of three.
1179            
1180             # '\' indicates that the next residue is shifted forward
1181             # one (+1) frame position and is an insertion of one base (so to
1182             # correctly align, a base is removed). That residue should
1183             # occupy four sequence positions instead of three.
1184            
1185             # Note that gaps are not counted across from frameshifts.
1186             # Frameshift indices are a range of positions starting in the
1187             # previous sequence position in which the frameshift occurs;
1188             # they are ambiguous by nature.
1189 2         9 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1190 2         3 $matchtype = "$qfs frameshift-query";
1191 2         3 $sgap = $qgap = 1;
1192             }
1193             elsif ($sfs) {
1194 0         0 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1195 0         0 $matchtype = "$sfs frameshift-sbcjt";
1196 0         0 $sgap = $qgap = 1;
1197             }
1198             elsif ($qchar eq $self->{GAP_SYMBOL}) {
1199             # gap are counted as being in the immediately preceeding residue
1200             # position; for translated sequences this is not the start of
1201             # the previous codon, but the third codon position
1202 512         1294 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1203 512         442 $matchtype = 'gap-query';
1204 512         380 $qgap++;
1205             }
1206             elsif ($schar eq $self->{GAP_SYMBOL}) {
1207 491         1081 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1208 491         389 $matchtype = 'gap-sbjct';
1209 491         393 $sgap++;
1210             }
1211             else {
1212             # if not a gap or frameshift in either seq, must be mismatch
1213 3469         9247 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1214 3469         7447 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1215 3469         2826 $matchtype = 'mismatch';
1216             }
1217             # always add a nomatch unless the current position in the seq is a gap
1218 4474 100       5788 if (!$sgap) {
1219 3981         7689 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1220             }
1221 4474 100       5115 if (!$qgap) {
1222 3960         8042 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1223             }
1224             } else {
1225 0         0 $self->warn("Unknown midline character: [$mchar]");
1226             }
1227             # leave in and uncomment for future debugging
1228             #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1229             # $resCount_query,
1230             # $qchar,
1231             # $mchar,
1232             # $schar,
1233             # $resCount_sbjct,
1234             # $matchtype,
1235             # ($self->{_query_offset} * $qdir),
1236             # ($self->{_sbjct_offset} * $sdir)));
1237 30602 100       42096 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1238 30602 100       49810 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1239             }
1240 73         156 return 1;
1241             }
1242              
1243             sub _calculate_seq_offsets {
1244 107     107   100 my $self = shift;
1245 107         263 my $prog = $self->algorithm;
1246 107         239 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1247 107 100       487 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
    100          
1248 68         64 $self->{_sbjct_offset} = 3;
1249 68 100 66     304 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1250 67         70 $self->{_query_offset} = 3;
1251             }
1252             } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1253 3         6 $self->{_query_offset} = 3;
1254             }
1255 107         132 1;
1256             }
1257              
1258             =head2 n
1259              
1260             See documentation in L
1261              
1262             =cut
1263              
1264             sub n {
1265 113     113 1 202 my $self = shift;
1266 113 50       195 if(@_) { $self->{'N'} = shift; }
  0         0  
1267             # note that returning 1 is completely an assumption
1268 113 100       352 defined $self->{'N'} ? $self->{'N'} : 1;
1269             }
1270              
1271             =head2 range
1272              
1273             See documentation in L
1274              
1275             =cut
1276              
1277             sub range {
1278 1659     1659 1 1510 my ($self, $seqType) = @_;
1279              
1280 1659   100     2321 $seqType ||= 'query';
1281 1659 100       2515 $seqType = 'sbjct' if $seqType eq 'hit';
1282              
1283 1659         1123 my ($start, $end);
1284 1659 100       1837 if( $seqType eq 'query' ) {
1285 840         1040 $start = $self->query->start;
1286 840         1090 $end = $self->query->end;
1287             }
1288             else {
1289 819         1226 $start = $self->hit->start;
1290 819         1096 $end = $self->hit->end;
1291             }
1292 1659         3750 return ($start, $end);
1293             }
1294              
1295              
1296             =head2 links
1297              
1298             Title : links
1299             Usage : $obj->links($newval)
1300             Function: Get/Set the Links value (from WU-BLAST)
1301             Indicates the placement of the alignment in the group of HSPs
1302             Returns : Value of links
1303             Args : On set, new value (a scalar or undef, optional)
1304              
1305              
1306             =cut
1307              
1308             sub links{
1309 54     54 1 58 my $self = shift;
1310              
1311 54 50       88 return $self->{LINKS} = shift if @_;
1312 54         167 return $self->{LINKS};
1313             }
1314              
1315             =head2 hsp_group
1316              
1317             Title : hsp_group
1318             Usage : $obj->hsp_group($newval)
1319             Function: Get/Set the Group value (from WU-BLAST)
1320             Indicates a grouping of HSPs
1321             Returns : Value of group
1322             Args : On set, new value (a scalar or undef, optional)
1323              
1324              
1325             =cut
1326              
1327             sub hsp_group {
1328 12     12 1 19 my $self = shift;
1329              
1330 12 50       40 return $self->{HSP_GROUP} = shift if @_;
1331 12         35 return $self->{HSP_GROUP};
1332             }
1333              
1334             =head2 hit_features
1335              
1336             Title : hit_features
1337             Usage : $obj->hit_features($newval)
1338             Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1339             output), which is a string of overlapping or nearby features in HSP
1340             hit
1341             Returns : Value of hit features, if present
1342             Args : On set, new value (a scalar or undef, optional)
1343              
1344              
1345             =cut
1346              
1347             sub hit_features {
1348 6     6 1 30 my $self = shift;
1349              
1350 6 50       24 return $self->{HIT_FEATURES} = shift if @_;
1351 6         32 return $self->{HIT_FEATURES};
1352             }
1353              
1354             # The cigar string code is written by Juguang Xiao
1355              
1356             =head1 Brief introduction on cigar string
1357              
1358             NOTE: the concept is originally from EnsEMBL docs at
1359             http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1360             Please append to these docs if you have a better definition.
1361              
1362             Sequence alignment hits can be stored in a database as ungapped alignments.
1363             This imposes 2 major constraints on alignments:
1364              
1365             a) alignments for a single hit record require multiple rows in the database,
1366             and
1367             b) it is not possible to accurately retrieve the exact original alignment.
1368              
1369             Alternatively, sequence alignments can be stored as gapped alignments using
1370             the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1371             Alignment Report).
1372              
1373             In the cigar line format alignments are stored as follows:
1374              
1375             M: Match
1376             D: Deletion
1377             I: Insertion
1378              
1379             An example of an alignment for a hypthetical protein match is shown below:
1380              
1381              
1382             Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1383              
1384             PG P G GP R PLGP
1385              
1386             Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1387              
1388              
1389             protein_align_feature table as the following cigar line:
1390              
1391             7M4D12M2I2MD7M
1392              
1393             =head2 cigar_string
1394              
1395             Name: cigar_string
1396             Usage: $cigar_string = $hsp->cigar_string
1397             Function: Generate and return cigar string for this HSP alignment
1398             Args: No input needed
1399             Return: a cigar string
1400              
1401             =cut
1402              
1403              
1404             sub cigar_string {
1405 8     8 1 16 my ($self, $arg) = @_;
1406 8 50       16 $self->warn("this is not a setter") if(defined $arg);
1407              
1408 8 100       15 unless(defined $self->{_cigar_string}){ # generate cigar string
1409 7         18 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1410 7         13 $self->{_cigar_string} = $cigar_string;
1411             } # end of unless
1412              
1413 8         26 return $self->{_cigar_string};
1414             }
1415              
1416             =head2 generate_cigar_string
1417              
1418             Name: generate_cigar_string
1419             Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1420             Function: generate cigar string from a simple sequence of alignment.
1421             Args: the string of query and subject
1422             Return: cigar string
1423              
1424             =cut
1425              
1426             sub generate_cigar_string {
1427 7     7 1 8 my ($self, $qstr, $hstr) = @_;
1428 7         185 my @qchars = split //, $qstr;
1429 7         179 my @hchars = split //, $hstr;
1430              
1431 7 50       15 unless(scalar(@qchars) == scalar(@hchars)){
1432 0         0 $self->throw("two sequences are not equal in lengths");
1433             }
1434              
1435 7         11 $self->{_count_for_cigar_string} = 0;
1436 7         9 $self->{_state_for_cigar_string} = 'M';
1437              
1438 7         8 my $cigar_string = '';
1439 7         18 for(my $i=0; $i <= $#qchars; $i++){
1440 1917         1523 my $qchar = $qchars[$i];
1441 1917         1112 my $hchar = $hchars[$i];
1442 1917 100 100     4192 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
    100          
    50          
1443 1837         1609 $cigar_string .= $self->_sub_cigar_string('M');
1444             }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1445 13         16 $cigar_string .= $self->_sub_cigar_string('D');
1446             }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1447 67         58 $cigar_string .= $self->_sub_cigar_string('I');
1448             }else{
1449 0         0 $self->throw("Impossible state that 2 gaps on each seq aligned");
1450             }
1451             }
1452 7         11 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1453 7         138 return $cigar_string;
1454             }
1455              
1456             # an internal method to help generate cigar string
1457              
1458             sub _sub_cigar_string {
1459 1924     1924   1277 my ($self, $new_state) = @_;
1460              
1461 1924         1049 my $sub_cigar_string = '';
1462 1924 100       1722 if($self->{_state_for_cigar_string} eq $new_state){
1463 1835         1189 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1464             }else{
1465             $sub_cigar_string .= $self->{_count_for_cigar_string}
1466 89 100       128 unless $self->{_count_for_cigar_string} == 1;
1467 89         58 $sub_cigar_string .= $self->{_state_for_cigar_string};
1468 89         58 $self->{_count_for_cigar_string} = 1;
1469 89         64 $self->{_state_for_cigar_string} = $new_state;
1470             }
1471 1924         3259 return $sub_cigar_string;
1472             }
1473              
1474             # needed before seqfeatures can be made
1475             sub _pre_seq_feature {
1476 1177     1177   1077 my $self = shift;
1477 1177         1312 my $algo = $self->{ALGORITHM};
1478              
1479 1177         1184 my ($queryfactor, $hitfactor) = (0,0);
1480 1177         990 my ($hitmap, $querymap) = (1,1);
1481 1177 100 100     13154 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
    100 100        
    100 100        
    100          
1482 7         9 $hitfactor = 1;
1483 7         10 $hitmap = 3;
1484             }
1485             elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1486 25         27 $queryfactor = 1;
1487 25         26 $querymap = 3;
1488             }
1489             elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1490 286 100       765 if ($2) {
1491 126         156 $hitmap = $querymap = 3;
1492             }
1493 286         273 $hitfactor = 1;
1494 286         302 $queryfactor = 1;
1495             }
1496             elsif ($algo =~ /^RPS-BLAST/) {
1497 1 50       5 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1498 0         0 $queryfactor = 1;
1499 0         0 $querymap = 3;
1500             }
1501 1         3 $hitfactor = 0;
1502             }
1503             else {
1504 858         1501 my $stranded = lc substr($self->{STRANDED}, 0,1);
1505 858 100 66     2572 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1506 858 100 100     3718 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1507             }
1508 1177         1375 $self->{_query_factor} = $queryfactor;
1509 1177         1362 $self->{_hit_factor} = $hitfactor;
1510 1177         1266 $self->{_hit_mapping} = $hitmap;
1511 1177         1607 $self->{_query_mapping} = $querymap;
1512             }
1513              
1514             # make query seq feature
1515             sub _query_seq_feature {
1516 1176     1176   961 my $self = shift;
1517 1176         1466 $self->{_making_qff} = 1;
1518 1176         1313 my $qs = $self->{QUERY_START};
1519 1176         1153 my $qe = $self->{QUERY_END};
1520 1176 100       2231 unless (defined $self->{_query_factor}) {
1521 1133         1872 $self->_pre_seq_feature;
1522             }
1523 1176         1055 my $queryfactor = $self->{_query_factor};
1524              
1525 1176 50 33     3882 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
  0         0  
1526              
1527 1176         950 my $strand;
1528 1176 100       1844 if ($qe > $qs) { # normal query: start < end
1529 1106 100       1319 if ($queryfactor) {
1530 843         734 $strand = 1;
1531             }
1532             else {
1533 263         273 $strand = undef;
1534             }
1535             }
1536             else {
1537 70 50       102 if ($queryfactor) {
1538 70         71 $strand = -1;
1539             }
1540             else {
1541 0         0 $strand = undef;
1542             }
1543 70         137 ($qs,$qe) = ($qe,$qs);
1544             }
1545              
1546             # Note: many of these data are not query- and hit-specific.
1547             # Only start, end, name, length are.
1548             # We could be more efficient by only storing this info once.
1549             # steve chervitz --- Sat Apr 5 00:55:07 2003
1550              
1551 1176   33     2137 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1552 1176         2620 $sim1->start($qs);
1553 1176         2219 $sim1->end($qe);
1554 1176         3243 $sim1->significance($self->{EVALUE});
1555 1176         3081 $sim1->bits($self->{BITS});
1556 1176         2749 $sim1->score($self->{SCORE});
1557 1176         2055 $sim1->strand($strand);
1558 1176         2684 $sim1->seq_id($self->{QUERY_NAME});
1559 1176         2371 $sim1->seqlength($self->{QUERY_LENGTH});
1560 1176         2312 $sim1->source_tag($self->{ALGORITHM});
1561 1176         2225 $sim1->seqdesc($self->{QUERY_DESC});
1562 1176 100       3921 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1563             # to determine frame from something like FASTXY which doesn't
1564             # report the frame
1565 1176         1347 my $qframe = $self->{QUERY_FRAME};
1566            
1567 1176 100 100     4888 if (defined $strand && !defined $qframe && $queryfactor) {
    100 66        
1568 603         682 $qframe = ( $qs % 3 ) * $strand;
1569             }
1570             elsif (!defined $strand) {
1571 263         291 $qframe = 0;
1572             }
1573            
1574 1176 50       4116 if( $qframe =~ /^([+-])?([0-3])/ ) {
1575 1176   100     4219 my $dir = $1 || '+';
1576 1176 50 33     4832 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1577 0         0 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1578             }
1579 1176 100       3540 $qframe = $2 != 0 ? $2 - 1 : $2;
1580             } else {
1581 0         0 $self->warn("Unknown query frame ($qframe)");
1582 0         0 $qframe = 0;
1583             }
1584            
1585 1176         2418 $sim1->frame($qframe);
1586 1176         2560 $self->SUPER::feature1($sim1);
1587              
1588 1176         1306 $self->{_created_qff} = 1;
1589 1176         1534 $self->{_making_qff} = 0;
1590             }
1591              
1592             # make subject seq feature
1593             sub _subject_seq_feature {
1594 562     562   507 my $self = shift;
1595 562         773 $self->{_making_sff} = 1;
1596 562         675 my $hs = $self->{HIT_START};
1597 562         609 my $he = $self->{HIT_END};
1598 562 100       985 unless (defined $self->{_hit_factor}) {
1599 44         99 $self->_pre_seq_feature;
1600             }
1601 562         572 my $hitfactor = $self->{_hit_factor};
1602              
1603 562 50 33     2006 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
  0         0  
1604              
1605 562         430 my $strand;
1606 562 100       1115 if ($he > $hs) { # normal subject
1607 455 100       570 if ($hitfactor) {
1608 199         231 $strand = 1;
1609             }
1610             else {
1611 256         279 $strand = undef;
1612             }
1613             }
1614             else {
1615 107 100       183 if ($hitfactor) {
1616 106         149 $strand = -1;
1617             }
1618             else {
1619 1         3 $strand = undef;
1620             }
1621 107         212 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1622             }
1623              
1624 562   33     1956 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1625 562         1136 $sim2->start($hs);
1626 562         1014 $sim2->end($he);
1627 562         1201 $sim2->significance($self->{EVALUE});
1628 562         1318 $sim2->bits($self->{BITS});
1629 562         1212 $sim2->score($self->{SCORE});
1630 562         968 $sim2->strand($strand);
1631 562         1160 $sim2->seq_id($self->{HIT_NAME});
1632 562         1025 $sim2->seqlength($self->{HIT_LENGTH});
1633 562         1194 $sim2->source_tag($self->{ALGORITHM});
1634 562         1024 $sim2->seqdesc($self->{HIT_DESC});
1635 562 100       1840 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1636 562         692 my $hframe = $self->{HIT_FRAME};
1637            
1638 562 100 100     2496 if (defined $strand && !defined $hframe && $hitfactor) {
    100 66        
1639 3         6 $hframe = ( $hs % 3 ) * $strand;
1640             }
1641             elsif (!defined $strand) {
1642 257         246 $hframe = 0;
1643             }
1644            
1645 562 50       2018 if( $hframe =~ /^([+-])?([0-3])/ ) {
1646 562   100     2009 my $dir = $1 || '+';
1647 562 50 33     1604 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1648 0         0 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1649             }
1650 562 100       1801 $hframe = $2 != 0 ? $2 - 1 : $2;
1651             } else {
1652 0         0 $self->warn("Unknown subject frame ($hframe)");
1653 0         0 $hframe = 0;
1654             }
1655            
1656 562         1276 $sim2->frame($hframe);
1657 562         1540 $self->SUPER::feature2($sim2);
1658              
1659 562         710 $self->{_created_sff} = 1;
1660 562         795 $self->{_making_sff} = 0;
1661             }
1662              
1663             # before calling the num_* methods
1664             sub _pre_similar_stats {
1665 438     438   417 my $self = shift;
1666 438         491 my $identical = $self->{IDENTICAL};
1667 438         412 my $conserved = $self->{CONSERVED};
1668 438         433 my $percent_id = $self->{PERCENT_IDENTITY};
1669              
1670 438 50       678 if (! defined $identical) {
1671 0 0       0 if (! defined $percent_id) {
1672 0         0 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1673 0         0 $identical = 0;
1674             }
1675             else {
1676 0         0 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1677             }
1678             }
1679              
1680 438 50       755 if (! defined $conserved) {
1681             $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1682 0 0       0 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1683 0         0 $conserved = $identical;
1684             }
1685 438         460 $self->{IDENTICAL} = $identical;
1686 438         414 $self->{CONSERVED} = $conserved;
1687 438         660 $self->{_did_presimilar} = 1;
1688             }
1689              
1690             # before calling the frac_* methods
1691              
1692             sub _pre_frac {
1693 105     105   137 my $self = shift;
1694            
1695 105         129 my $hsp_len = $self->{HSP_LENGTH};
1696 105         113 my $hit_len = $self->{HIT_LENGTH};
1697 105         122 my $query_len = $self->{QUERY_LENGTH};
1698              
1699 105         219 my $identical = $self->num_identical;
1700 105         200 my $conserved = $self->num_conserved;
1701              
1702 105         142 $self->{_did_prefrac} = 1;
1703 105         92 my $logical;
1704 105 50       184 if( $hsp_len ) {
1705 105         203 $self->length('total', $hsp_len);
1706 105         226 $logical = $self->_logical_length('total');
1707 105         270 $self->frac_identical( 'total', $identical / $hsp_len);
1708 105         275 $self->frac_conserved( 'total', $conserved / $hsp_len);
1709             }
1710 105 100       201 if( $hit_len ) {
1711 104         157 $logical = $self->_logical_length('hit');
1712 104         262 $self->frac_identical( 'hit', $identical / $logical);
1713 104         237 $self->frac_conserved( 'hit', $conserved / $logical);
1714             }
1715 105 100       213 if( $query_len ) {
1716 104         188 $logical = $self->_logical_length('query');
1717 104         269 $self->frac_identical( 'query', $identical / $logical) ;
1718 104         196 $self->frac_conserved( 'query', $conserved / $logical);
1719             }
1720             }
1721              
1722             # before calling gaps()
1723             # This relies first on passed parameters (parser-dependent), then on gaps
1724             # calculated by seq_inds() (if set), then falls back to directly checking
1725             # for '-' or '.' as a last resort
1726              
1727             sub _pre_gaps {
1728 378     378   369 my $self = shift;
1729 378         382 my $query_gaps = $self->{QUERY_GAPS};
1730 378         462 my $query_seq = $self->{QUERY_SEQ};
1731 378         372 my $hit_gaps = $self->{HIT_GAPS};
1732 378         445 my $hit_seq = $self->{HIT_SEQ};
1733 378         342 my $gaps = $self->{HSP_GAPS};
1734              
1735 378         467 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1736 378 50       849 if( defined $query_gaps ) {
    100          
1737 0         0 $self->gaps('query', $query_gaps);
1738             } elsif( defined $query_seq ) {
1739 363 50       947 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
    100          
1740             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1741             : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1742 363   100     1013 my $offset = $self->{'_query_offset'} || 1;
1743 363         880 $self->gaps('query', $qg/$offset);
1744             }
1745 378 50       925 if( defined $hit_gaps ) {
    100          
1746 0         0 $self->gaps('hit', $hit_gaps);
1747             } elsif( defined $hit_seq ) {
1748 358 100       813 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
    100          
1749             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1750             : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1751 358   100     949 my $offset = $self->{'_sbjct_offset'} || 1;
1752 358         605 $self->gaps('hit', $hg/$offset);
1753             }
1754 378 100       606 if( ! defined $gaps ) {
1755 329         482 $gaps = $self->gaps("query") + $self->gaps("hit");
1756             }
1757 378         659 $self->gaps('total', $gaps);
1758             }
1759              
1760             # before percent_identity
1761             sub _pre_pi {
1762 42     42   57 my $self = shift;
1763 42         81 $self->{_did_prepi} = 1;
1764 42 50 33     337 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );
1765             }
1766              
1767             1;