File Coverage

Bio/Search/HSP/GenericHSP.pm
Criterion Covered Total %
statement 508 564 90.0
branch 274 354 77.4
condition 127 178 71.3
subroutine 50 50 100.0
pod 35 35 100.0
total 994 1181 84.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Search::HSP::GenericHSP
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
17              
18             =head1 SYNOPSIS
19              
20             my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21             -evalue => '1e-30',
22             );
23              
24             $r_type = $hsp->algorithm;
25              
26             $pvalue = $hsp->p();
27              
28             $evalue = $hsp->evalue();
29              
30             $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
31              
32             $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
33              
34             $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
35              
36             $qseq = $hsp->query_string;
37              
38             $hseq = $hsp->hit_string;
39              
40             $homo_string = $hsp->homology_string;
41              
42             $len = $hsp->length( ['query'|'hit'|'total'] );
43              
44             $len = $hsp->length( ['query'|'hit'|'total'] );
45              
46             $rank = $hsp->rank;
47              
48             # TODO: Describe how to configure a SearchIO stream so that it generates
49             # GenericHSP objects.
50              
51             =head1 DESCRIPTION
52              
53             This implementation is "Generic", meaning it is is suitable for
54             holding information about High Scoring pairs from most Search reports
55             such as BLAST and FastA. Specialized objects can be derived from
56             this.
57              
58             Unless you're writing a parser, you won't ever need to create a
59             GenericHSP or any other HSPI-implementing object. If you use
60             the SearchIO system, HSPI objects are created automatically from
61             a SearchIO stream which returns Bio::Search::Result::ResultI objects
62             and you get the HSPI objects via the ResultI API.
63              
64             For documentation on what you can do with GenericHSP (and other HSPI
65             objects), please see the API documentation in
66             L.
67              
68             =head1 FEEDBACK
69              
70             =head2 Mailing Lists
71              
72             User feedback is an integral part of the evolution of this and other
73             Bioperl modules. Send your comments and suggestions preferably to
74             the Bioperl mailing list. Your participation is much appreciated.
75              
76             bioperl-l@bioperl.org - General discussion
77             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78              
79             =head2 Support
80              
81             Please direct usage questions or support issues to the mailing list:
82              
83             I
84              
85             rather than to the module maintainer directly. Many experienced and
86             reponsive experts will be able look at the problem and quickly
87             address it. Please include a thorough description of the problem
88             with code and data examples if at all possible.
89              
90             =head2 Reporting Bugs
91              
92             Report bugs to the Bioperl bug tracking system to help us keep track
93             of the bugs and their resolution. Bug reports can be submitted via the
94             web:
95              
96             https://github.com/bioperl/bioperl-live/issues
97              
98             =head1 AUTHOR - Jason Stajich and Steve Chervitz
99              
100             Email jason-at-bioperl.org
101             Email sac-at-bioperl.org
102              
103             =head1 CONTRIBUTORS
104              
105             Sendu Bala, bix@sendu.me.uk
106              
107             =head1 APPENDIX
108              
109             The rest of the documentation details each of the object methods.
110             Internal methods are usually preceded with a _
111              
112             =cut
113              
114             # Let the code begin...
115              
116             package Bio::Search::HSP::GenericHSP;
117 27     27   100 use strict;
  27         36  
  27         718  
118              
119 27     27   91 use Bio::Root::Root;
  27         31  
  27         515  
120 27     27   9397 use Bio::SeqFeature::Similarity;
  27         59  
  27         778  
121              
122 27     27   134 use base qw(Bio::Search::HSP::HSPI);
  27         28  
  27         12883  
123              
124             =head2 new
125              
126             Title : new
127             Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128             Function: Builds a new Bio::Search::HSP::GenericHSP object
129             Returns : Bio::Search::HSP::GenericHSP
130             Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
131             -evalue => evalue
132             -pvalue => pvalue
133             -bits => bit value for HSP
134             -score => score value for HSP (typically z-score but depends on
135             analysis)
136             -hsp_length=> Length of the HSP (including gaps)
137             -identical => # of residues that that matched identically
138             -percent_identity => (optional) percent identity
139             -conserved => # of residues that matched conservatively
140             (only protein comparisions;
141             conserved == identical in nucleotide comparisons)
142             -hsp_gaps => # of gaps in the HSP
143             -query_gaps => # of gaps in the query in the alignment
144             -hit_gaps => # of gaps in the subject in the alignment
145             -query_name => HSP Query sequence name (if available)
146             -query_start => HSP Query start (in original query sequence coords)
147             -query_end => HSP Query end (in original query sequence coords)
148             -query_length=> total length of the query sequence
149             -query_seq => query sequence portion of the HSP
150             -query_desc => textual description of the query
151             -hit_name => HSP Hit sequence name (if available)
152             -hit_start => HSP Hit start (in original hit sequence coords)
153             -hit_end => HSP Hit end (in original hit sequence coords)
154             -hit_length => total length of the hit sequence
155             -hit_seq => hit sequence portion of the HSP
156             -hit_desc => textual description of the hit
157             -homology_seq=> homology sequence for the HSP
158             -hit_frame => hit frame (only if hit is translated protein)
159             -query_frame => query frame (only if query is translated protein)
160             -rank => HSP rank
161             -links => HSP links information (WU-BLAST only)
162             -hsp_group => HSP Group informat (WU-BLAST only)
163             -gap_symbol => symbol representing a gap (default = '-')
164             -hit_features=> string of features found in or near HSP hit
165             region (reported in some BLAST text output,
166             v. 2.2.13 and up)
167             -stranded => If the algorithm isn't known (i.e. defaults to
168             'generic'), setting this will indicate start/end
169             coordinates are to be used to determine the strand
170             for 'query', 'subject', 'hit', 'both', or 'none'
171             (default = 'none')
172              
173             =cut
174              
175             sub new {
176 1331     1331 1 8203 my($class,%args) = @_;
177              
178             # don't pass anything to SUPER; complex hierarchy results in lots of work
179             # for nothing
180            
181 1331         3837 my $self = $class->SUPER::new();
182              
183             # for speed, don't use _rearrange and just store all input data directly
184             # with no method calls and no work done. work can be carried
185             # out just-in-time later if desired
186 1331         3731 while (my ($arg, $value) = each %args) {
187 26240         20105 $arg =~ tr/a-z\055/A-Z/d;
188 26240         53750 $self->{$arg} = $value;
189             }
190 1331         1569 my $bits = $self->{BITS};
191              
192 1331 100       3469 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
193 1331 50       1819 if (exists $self->{GAP_SYMBOL}) {
194             # not checking anything else but the length (must be 1 as only one gap
195             # symbol allowed currently); can add in support for symbol checks or
196             # multiple symbols later if needed
197             $self->throw("Gap symbol must be of length 1") if
198 0 0       0 CORE::length($self->{GAP_SYMBOL}) != 1;
199             } else {
200             # dafault
201 1331         2157 $self->{GAP_SYMBOL} = '-';
202             }
203 1331   100     2299 $self->{ALGORITHM} ||= 'GENERIC';
204 1331   100     3268 $self->{STRANDED} ||= 'NONE';
205            
206 1331 50 33     3857 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
207 0         0 $self->throw("Must define hit and query length");
208             }
209              
210 1331         1475 $self->{'_sequenceschanged'} = 1;
211            
212 1331         1377 $self->{_finished_new} = 1;
213 1331         6991 return $self;
214             }
215              
216             sub _logical_length {
217 313     313   322 my ($self, $type) = @_;
218 313 100 66     818 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
219 105         250 $self->_calculate_seq_offsets();
220             }
221 313 100       1011 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
222            
223 313         501 my $offset = $self->{"_${key}_offset"};
224 313         410 return $self->length($type) / $offset ;
225             }
226              
227             =head2 L methods
228              
229             Implementation of L methods follow
230              
231             =head2 algorithm
232              
233             Title : algorithm
234             Usage : my $r_type = $hsp->algorithm
235             Function: Obtain the name of the algorithm used to obtain the HSP
236             Returns : string (e.g., BLASTP)
237             Args : [optional] scalar string to set value
238              
239             =cut
240              
241             sub algorithm{
242 7397     7397 1 8917 my ($self,$value) = @_;
243 7397         8256 my $previous = $self->{'ALGORITHM'};
244 7397 50 33     20520 if( defined $value || ! defined $previous ) {
245 0 0       0 $value = $previous = '' unless defined $value;
246 0         0 $self->{'ALGORITHM'} = $value;
247             }
248              
249 7397         10654 return $previous;
250             }
251              
252             =head2 pvalue
253              
254             Title : pvalue
255             Usage : my $pvalue = $hsp->pvalue();
256             Function: Returns the P-value for this HSP or undef
257             Returns : float or exponential (2e-10)
258             P-value is not defined with NCBI Blast2 reports.
259             Args : [optional] numeric to set value
260              
261             =cut
262              
263             sub pvalue {
264 63     63 1 68 my ($self,$value) = @_;
265 63         73 my $previous = $self->{'PVALUE'};
266 63 50       123 if( defined $value ) {
267 0         0 $self->{'PVALUE'} = $value;
268             }
269 63         140 return $previous;
270             }
271              
272             =head2 evalue
273              
274             Title : evalue
275             Usage : my $evalue = $hsp->evalue();
276             Function: Returns the e-value for this HSP
277             Returns : float or exponential (2e-10)
278             Args : [optional] numeric to set value
279              
280             =cut
281              
282             sub evalue {
283 170     170 1 215 my ($self,$value) = @_;
284 170         239 my $previous = $self->{'EVALUE'};
285 170 50       324 if( defined $value ) {
286 0         0 $self->{'EVALUE'} = $value;
287             }
288 170         707 return $previous;
289             }
290              
291             =head2 frac_identical
292              
293             Title : frac_identical
294             Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295             Function: Returns the fraction of identitical positions for this HSP
296             Returns : Float in range 0.0 -> 1.0
297             Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298             'hit' = num identical / length of hit seq (without gaps)
299             synonyms: 'sbjct', 'subject'
300             'total' = num identical / length of alignment (with gaps)
301             synonyms: 'hsp'
302             default = 'total'
303             arg 2: [optional] frac identical value to set for the type requested
304             Note : for translated sequences, this adjusts the length accordingly
305              
306             =cut
307              
308             sub frac_identical {
309 602     602 1 664 my ($self, $type,$value) = @_;
310              
311 602 100       927 unless ($self->{_did_prefrac}) {
312 105         236 $self->_pre_frac;
313             }
314              
315 602 50       1020 $type = lc $type if defined $type;
316 602 50 33     1913 $type = 'hit' if( defined $type &&
317             $type =~ /subject|sbjct/);
318 602 100 66     3508 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
319             $type !~ /query|hit|subject|sbjct|total/);
320 602         677 my $previous = $self->{'_frac_identical'}->{$type};
321 602 100 66     1331 if( defined $value || ! defined $previous ) {
322 313 50       463 $value = $previous = '' unless defined $value;
323 313 100 100     812 if( $type eq 'hit' || $type eq 'query' ) {
324 208         390 $self->$type()->frac_identical( $value);
325             }
326 313         489 $self->{'_frac_identical'}->{$type} = $value;
327             }
328 602         1896 return $previous;
329              
330             }
331              
332             =head2 frac_conserved
333              
334             Title : frac_conserved
335             Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336             Function : Returns the fraction of conserved positions for this HSP.
337             This is the fraction of symbols in the alignment with a
338             positive score.
339             Returns : Float in range 0.0 -> 1.0
340             Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341             'hit' = num conserved / length of hit seq (without gaps)
342             synonyms: 'sbjct', 'subject'
343             'total' = num conserved / length of alignment (with gaps)
344             synonyms: 'hsp'
345             default = 'total'
346             arg 2: [optional] frac conserved value to set for the type requested
347              
348             =cut
349              
350             sub frac_conserved {
351 440     440 1 449 my ($self, $type,$value) = @_;
352              
353 440 50       673 unless ($self->{_did_prefrac}) {
354 0         0 $self->_pre_frac;
355             }
356              
357 440 100       757 $type = lc $type if defined $type;
358 440 50 66     1355 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 440 100 100     2609 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
360             $type !~ /query|hit|subject|sbjct|total/);
361 440         523 my $previous = $self->{'_frac_conserved'}->{$type};
362 440 100 66     912 if( defined $value || ! defined $previous ) {
363 313 50       408 $value = $previous = '' unless defined $value;
364 313         366 $self->{'_frac_conserved'}->{$type} = $value;
365             }
366 440         1143 return $previous;
367             }
368              
369             =head2 gaps
370              
371             Title : gaps
372             Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373             Function : Get the number of gap characters in the query, hit, or total alignment.
374             Returns : Integer, number of gaps or 0 if none
375             Args : arg 1: 'query' = num gap characters in query seq
376             'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377             'total' = num gap characters in whole alignment; synonyms: 'hsp'
378             default = 'total'
379             arg 2: [optional] integer gap value to set for the type requested
380              
381             =cut
382              
383             sub gaps {
384 3125     3125 1 2821 my ($self, $type, $value) = @_;
385              
386 3125 100       4425 unless ($self->{_did_pregaps}) {
387 378         778 $self->_pre_gaps;
388             }
389              
390 3125 100       4554 $type = lc $type if defined $type;
391 3125 50 66     13835 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
392             $type !~ /query|hit|subject|sbjct|total/);
393 3125 100       4472 $type = 'hit' if $type =~ /sbjct|subject/;
394 3125         3374 my $previous = $self->{'_gaps'}->{$type};
395 3125 100 100     6900 if( defined $value || ! defined $previous ) {
396 1110 100       1309 $value = $previous = '' unless defined $value;
397 1110         1680 $self->{'_gaps'}->{$type} = $value;
398             }
399 3125   100     7803 return $previous || 0;
400             }
401              
402             =head2 query_string
403              
404             Title : query_string
405             Usage : my $qseq = $hsp->query_string;
406             Function: Retrieves the query sequence of this HSP as a string
407             Returns : string
408             Args : [optional] string to set for query sequence
409              
410              
411             =cut
412              
413             sub query_string{
414 610     610 1 982 my ($self,$value) = @_;
415 610         670 my $previous = $self->{QUERY_SEQ};
416 610 100 66     1990 if( defined $value || ! defined $previous ) {
417 3 50       6 $value = $previous = '' unless defined $value;
418 3         5 $self->{QUERY_SEQ} = $value;
419             # do some housekeeping so we know when to
420             # re-run _calculate_seq_positions
421 3         3 $self->{'_sequenceschanged'} = 1;
422             }
423 610         1254 return $previous;
424             }
425              
426             =head2 hit_string
427              
428             Title : hit_string
429             Usage : my $hseq = $hsp->hit_string;
430             Function: Retrieves the hit sequence of this HSP as a string
431             Returns : string
432             Args : [optional] string to set for hit sequence
433              
434              
435             =cut
436              
437             sub hit_string{
438 618     618 1 2644 my ($self,$value) = @_;
439 618         710 my $previous = $self->{HIT_SEQ};
440 618 100 66     2044 if( defined $value || ! defined $previous ) {
441 1 50       3 $value = $previous = '' unless defined $value;
442 1         2 $self->{HIT_SEQ} = $value;
443             # do some housekeeping so we know when to
444             # re-run _calculate_seq_positions
445 1         2 $self->{'_sequenceschanged'} = 1;
446             }
447 618         1178 return $previous;
448             }
449              
450             =head2 homology_string
451              
452             Title : homology_string
453             Usage : my $homo_string = $hsp->homology_string;
454             Function: Retrieves the homology sequence for this HSP as a string.
455             : The homology sequence is the string of symbols in between the
456             : query and hit sequences in the alignment indicating the degree
457             : of conservation (e.g., identical, similar, not similar).
458             Returns : string
459             Args : [optional] string to set for homology sequence
460              
461             =cut
462              
463             sub homology_string{
464 6905     6905 1 6034 my ($self,$value) = @_;
465 6905         6938 my $previous = $self->{HOMOLOGY_SEQ};
466 6905 100 66     19071 if( defined $value || ! defined $previous ) {
467 3 50       6 $value = $previous = '' unless defined $value;
468 3         5 $self->{HOMOLOGY_SEQ} = $value;
469             # do some housekeeping so we know when to
470             # re-run _calculate_seq_positions
471 3         5 $self->{'_sequenceschanged'} = 1;
472             }
473 6905         18668 return $previous;
474             }
475              
476             =head2 consensus_string
477              
478             Title : consensus_string
479             Usage : my $cs_string = $hsp->consensus_string;
480             Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
481             : If the model had any consensus structure or reference line annotation
482             : that it inherited from a multiple alignment (#=GC SS cons,
483             : #=GC RF annotation in Stockholm files), that information is shown
484             : as CS or RF annotation line.
485             Returns : string
486             Args : [optional] string to set for consensus structure
487              
488             =cut
489              
490             sub consensus_string {
491 8     8 1 14 my ($self,$value) = @_;
492 8         19 my $previous = $self->{CS_SEQ};
493 8 50 33     44 if( defined $value || ! defined $previous ) {
494 0 0       0 $value = $previous = '' unless defined $value;
495 0         0 $self->{CS_SEQ} = $value;
496             # do some housekeeping so we know when to
497             # re-run _calculate_seq_positions
498 0         0 $self->{'_sequenceschanged'} = 1;
499             }
500 8         30 return $previous;
501             }
502              
503             =head2 posterior_string
504              
505             Title : posterior_string
506             Usage : my $pp_string = $hsp->posterior_string;
507             Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
508             : The posterior probability is the string of symbols at the bottom
509             : of the alignment indicating the expected accuracy of each aligned residue.
510             : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
511             : and a * means 95-100% posterior probability.
512             Returns : string
513             Args : [optional] string to set for posterior probability
514              
515             =cut
516              
517             sub posterior_string {
518 9     9 1 14 my ($self,$value) = @_;
519 9         15 my $previous = $self->{PP_SEQ};
520 9 100 66     58 if( defined $value || ! defined $previous ) {
521 2 50       6 $value = $previous = '' unless defined $value;
522 2         4 $self->{PP_SEQ} = $value;
523             # do some housekeeping so we know when to
524             # re-run _calculate_seq_positions
525 2         3 $self->{'_sequenceschanged'} = 1;
526             }
527 9         32 return $previous;
528             }
529              
530             =head2 length
531              
532             Title : length
533             Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
534             Function : Returns the length of the query or hit in the alignment
535             (without gaps)
536             or the aggregate length of the HSP (including gaps;
537             this may be greater than either hit or query )
538             Returns : integer
539             Args : arg 1: 'query' = length of query seq (without gaps)
540             'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
541             'total' = length of alignment (with gaps)
542             default = 'total'
543             arg 2: [optional] integer length value to set for specific type
544              
545             =cut
546              
547             sub length {
548              
549 3839     3839 1 2933 my $self = shift;
550 3839         2710 my $type = shift;
551              
552 3839 100       5380 $type = 'total' unless defined $type;
553 3839         3961 $type = lc $type;
554              
555 3839 100       9548 if( $type =~ /^q/i ) {
    100          
556 1615         2374 return $self->query()->length(shift);
557             } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
558 989         1536 return $self->hit()->length(shift);
559             } else {
560 1235         919 my $v = shift;
561 1235 100       1649 if( defined $v ) {
562 105         121 $self->{HSP_LENGTH} = $v;
563             }
564 1235         2656 return $self->{HSP_LENGTH};
565             }
566 0         0 return 0; # should never get here
567             }
568              
569             =head2 hsp_length
570              
571             Title : hsp_length
572             Usage : my $len = $hsp->hsp_length()
573             Function: shortcut length('hsp')
574             Returns : floating point between 0 and 100
575             Args : none
576              
577             =cut
578              
579 16     16 1 72 sub hsp_length { return shift->length('hsp', shift); }
580              
581             =head2 percent_identity
582              
583             Title : percent_identity
584             Usage : my $percentid = $hsp->percent_identity()
585             Function: Returns the calculated percent identity for an HSP
586             Returns : floating point between 0 and 100
587             Args : none
588              
589              
590             =cut
591              
592             sub percent_identity {
593 84     84 1 134 my $self = shift;
594              
595 84 100       206 unless ($self->{_did_prepi}) {
596 42         172 $self->_pre_pi;
597             }
598              
599 84         250 return $self->SUPER::percent_identity(@_);
600             }
601              
602             =head2 frame
603              
604             Title : frame
605             Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
606             Function: Set the Frame for both query and subject and insure that
607             they agree.
608             This overrides the frame() method implementation in
609             FeaturePair.
610             Returns : array of query and subject frame if return type wants an array
611             or query frame if defined or subject frame if not defined
612             Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
613             'query' to retrieve the query frame
614             'list' or 'array' to retrieve both query and hit frames together
615             Note : Frames are stored in the GFF way (0-2) not 1-3
616             as they are in BLAST (negative frames are deduced by checking
617             the strand of the query or hit)
618              
619             =cut
620              
621             # Note: changed 4/19/08 - bug 2485
622             #
623             # frame() is supposed to be a getter/setter (as is implied by the Function desc
624             # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
625             # the API is not consistent with the other HSP/SimilarityPair methods such as
626             # strand(), start(), end(), etc.
627             #
628             # In order to make this consistent with other methods all work is now done
629             # when the features are instantiated and not delayed. We compromise by
630             # defaulting back 'to hit' w/o passed args. Setting is now allowed.
631              
632             sub frame {
633 610     610 1 1825 my $self = shift;
634 610         505 my $val = shift;
635 610 50       1027 if (!defined $val) {
636             # unfortunately, w/o args we need to warn about API changes
637 0         0 $self->warn("API for frame() has changed.\n".
638             "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
639             "returning query frame");
640 0         0 $val = 'query';
641             }
642 610         1091 $val =~ s/^\s+//;
643              
644 610 100       1509 if( $val =~ /^q/i ) {
    50          
    0          
    0          
645 284         463 return $self->query->frame(@_);
646             } elsif( $val =~ /^hi|^s/i ) {
647 326         504 return $self->hit->frame(@_);
648             } elsif ( $val =~ /^list|array/i ) {
649 0         0 return ($self->query->frame($_[0]),
650             $self->hit->frame($_[1]) );
651             } elsif ( $val =~ /^\d+$/) {
652             # old API i.e. frame($query_frame, $hit_frame). This catches all but one
653             # case, where no arg is passed (so the hit is wanted).
654 0         0 $self->warn("API for frame() has changed.\n".
655             "Please refer to documentation for Bio::Search::HSP::GenericHSP");
656             wantarray ?
657 0 0       0 return ($self->query->frame($val),
658             $self->hit->frame(@_) ) :
659             return $self->hit->frame($val,@_);
660             } else {
661 0         0 $self->warn("unrecognized component '$val' requested\n");
662             }
663 0         0 return 0;
664             }
665              
666             =head2 get_aln
667              
668             Title : get_aln
669             Usage : my $aln = $hsp->gel_aln
670             Function: Returns a L object representing the HSP alignment
671             Returns : L
672             Args : none
673              
674             =cut
675              
676             sub get_aln {
677 19     19 1 25 my ($self) = @_;
678 19         110 require Bio::LocatableSeq;
679 19         4815 require Bio::SimpleAlign;
680              
681 19         80 my $aln = Bio::SimpleAlign->new();
682 19         52 my $hs = $self->hit_string();
683 19         49 my $qs = $self->query_string();
684             # FASTA specific stuff moved to the FastaHSP object
685 19         23 my $seqonly = $qs;
686 19         114 $seqonly =~ s/[\-\s]//g;
687 19         50 my ($q_nm,$s_nm) = ($self->query->seq_id(),
688             $self->hit->seq_id());
689             # Should we silently change the name of the query or hit if it isn't
690             # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
691            
692             #unless( defined $q_nm && CORE::length ($q_nm) ) {
693             # $q_nm = 'query';
694             #}
695             #unless( defined $s_nm && CORE::length ($s_nm) ) {
696             # $s_nm = 'hit';
697             #}
698            
699             # mapping: 1 residues for every x coordinate positions
700             my $query = Bio::LocatableSeq->new(
701             -seq => $qs,
702             -id => $q_nm,
703             -start => $self->query->start,
704             -end => $self->query->end,
705             -strand => $self->query->strand,
706             -force_nse => $q_nm ? 0 : 1,
707 19 100       64 -mapping => [ 1, $self->{_query_mapping} ]
708             );
709 19         43 $seqonly = $hs;
710 19         127 $seqonly =~ s/[\-\s]//g;
711             my $hit = Bio::LocatableSeq->new(
712             -seq => $hs,
713             -id => $s_nm,
714             -start => $self->hit->start,
715             -end => $self->hit->end,
716             -strand => $self->hit->strand,
717             -force_nse => $s_nm ? 0 : 1,
718 19 50       69 -mapping => [ 1, $self->{_hit_mapping} ]
719             );
720 19         86 $aln->add_seq($query);
721 19         34 $aln->add_seq($hit);
722 19         68 return $aln;
723             }
724              
725             =head2 num_conserved
726              
727             Title : num_conserved
728             Usage : $obj->num_conserved($newval)
729             Function: returns the number of conserved residues in the alignment
730             Returns : integer
731             Args : integer (optional)
732              
733              
734             =cut
735              
736             sub num_conserved{
737 1914     1914 1 1415 my ($self,$value) = @_;
738              
739 1914 100       2597 unless ($self->{_did_presimilar}) {
740 1         3 $self->_pre_similar_stats;
741             }
742              
743 1914 50       2567 if (defined $value) {
744 0         0 $self->{CONSERVED} = $value;
745             }
746 1914         3082 return $self->{CONSERVED};
747             }
748              
749             =head2 num_identical
750              
751             Title : num_identical
752             Usage : $obj->num_identical($newval)
753             Function: returns the number of identical residues in the alignment
754             Returns : integer
755             Args : integer (optional)
756              
757              
758             =cut
759              
760             sub num_identical{
761 1923     1923 1 1760 my ($self,$value) = @_;
762              
763 1923 100       2974 unless ($self->{_did_presimilar}) {
764 437         834 $self->_pre_similar_stats;
765             }
766              
767 1923 50       2501 if( defined $value) {
768 0         0 $self->{IDENTICAL} = $value;
769             }
770 1923         3601 return $self->{IDENTICAL};
771             }
772              
773             =head2 rank
774              
775             Usage : $hsp->rank( [string] );
776             Purpose : Get the rank of the HSP within a given Blast hit.
777             Example : $rank = $hsp->rank;
778             Returns : Integer (1..n) corresponding to the order in which the HSP
779             appears in the BLAST report.
780              
781             =cut
782              
783             sub rank {
784 4514     4514 1 2786 my ($self,$value) = @_;
785 4514 50       4933 if( defined $value) {
786 0         0 $self->{RANK} = $value;
787             }
788 4514         8682 return $self->{RANK};
789             }
790              
791             =head2 seq_inds
792              
793             Title : seq_inds
794             Purpose : Get a list of residue positions (indices) for all identical
795             : or conserved residues in the query or sbjct sequence.
796             Example : @s_ind = $hsp->seq_inds('query', 'identical');
797             : @h_ind = $hsp->seq_inds('hit', 'conserved');
798             : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
799             : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
800             Returns : List of integers
801             : May include ranges if collapse is true.
802             Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
803             : ('sbjct' is synonymous with 'hit')
804             : class = 'identical' - identical positions
805             : 'conserved' - conserved positions
806             : 'nomatch' - mismatched residue or gap positions
807             : 'mismatch' - mismatched residue positions (no gaps)
808             : 'gap' - gap positions only
809             : 'frameshift'- frameshift positions only
810             : 'conserved-not-identical' - conserved positions w/o
811             : identical residues
812             : The name can be shortened to 'id' or 'cons' unless
813             : the name is . The default value is
814             : 'identical'
815             :
816             : collapse = boolean, if true, consecutive positions are merged
817             : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
818             : collapses to "1-5 7 9-11". This is useful for
819             : consolidating long lists. Default = no collapse.
820             :
821             Throws : n/a.
822             Comments : For HSPs partially or completely derived from translated sequences
823             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
824             : cannot easily be attributed to a single position (i.e. the
825             : positional data is ambiguous). In these cases all three codon
826             : positions are reported. Under these conditions you can check
827             : ambiguous_seq_inds() to determine whether the query, subject,
828             : or both are ambiguous.
829             :
830             See Also : L,
831             L
832              
833             =cut
834              
835             sub seq_inds{
836 194     194 1 285 my ($self, $seqType, $class, $collapse) = @_;
837              
838             # prepare the internal structures - this is cached so
839             # if the strings have not changed we're okay
840 194         387 $self->_calculate_seq_positions();
841              
842 194   50     313 $seqType ||= 'query';
843 194   50     277 $class ||= 'identical';
844 194   100     411 $collapse ||= 0;
845 194 100       328 $seqType = 'sbjct' if $seqType eq 'hit';
846 194         308 my $t = lc(substr($seqType,0,1));
847 194 100 33     413 if( $t eq 'q' ) {
    50          
848 100         109 $seqType = 'query';
849             } elsif ( $t eq 's' || $t eq 'h' ) {
850 94         110 $seqType = 'sbjct';
851             } else {
852 0         0 $self->warn("unknown seqtype $seqType using 'query'");
853 0         0 $seqType = 'query';
854             }
855            
856 194         205 $t = lc(substr($class,0,1));
857              
858 194 100       591 if( $t eq 'c' ) {
    50          
    100          
    100          
    100          
    50          
859 8 50       13 if( $class =~ /conserved\-not\-identical/ ) {
860 0         0 $class = 'conserved';
861             } else {
862 8         8 $class = 'conservedall';
863             }
864             } elsif( $t eq 'i' ) {
865 0         0 $class = 'identical';
866             } elsif( $t eq 'n' ) {
867 18         18 $class = 'nomatch';
868             } elsif( $t eq 'm' ) {
869 10         11 $class = 'mismatch';
870             } elsif( $t eq 'g' ) {
871 150         131 $class = 'gap';
872             } elsif( $t eq 'f' ) {
873 8         7 $class = 'frameshift';
874             } else {
875 0         0 $self->warn("unknown sequence class $class using 'identical'");
876 0         0 $class = 'identical';
877             }
878              
879             ## Sensitive to member name changes.
880 194         231 $seqType = "_\L$seqType\E";
881 194         195 $class = "_\L$class\E";
882 194         139 my @ary;
883              
884 194 100       259 if( $class eq '_gap' ) {
    100          
885             # this means that we are remapping the gap length that is stored
886             # in the hash (for example $self->{'_gapRes_query'} )
887             # so we'll return an array which has the values of the position of the
888             # of the gap (the key in the hash) + the gap length (value in the
889             # hash for this key - 1.
890            
891             # changing this; since the index is the position prior to the insertion,
892             # repeat the position based on the number of gaps inserted
893 521         276 @ary = map { my @tmp;
894             # position holds number of gaps inserted
895 521         792 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
896 1148         954 push @tmp, $_ ;
897             }
898 521         700 @tmp}
899 150         126 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  1919         1260  
  150         763  
900             } elsif( $class eq '_conservedall' ) {
901 13462         7532 @ary = sort { $a <=> $b }
902 8         78 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
903 8         9 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
  8         170  
904             } else {
905 36         34 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  28287         16030  
  36         736  
906             }
907 194 100       4266 require Bio::Search::BlastUtils if $collapse;
908              
909 194 100       531 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
910             }
911              
912             =head2 ambiguous_seq_inds
913              
914             Title : ambiguous_seq_inds
915             Purpose : returns a string denoting whether sequence indices for query,
916             : subject, or both are ambiguous
917             Returns : String; 'query', 'subject', 'query/subject', or empty string ''
918             Argument : none
919             Comments : For HSPs partially or completely derived from translated sequences
920             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
921             : cannot easily be attributed to a single position (i.e. the
922             : positional data is ambiguous). In these cases all three codon
923             : positions are reported when using seq_inds(). Under these
924             : conditions you can check ambiguous_seq_inds() to determine whether
925             : the query, subject, or both are ambiguous.
926             See Also : L
927              
928             =cut
929              
930             sub ambiguous_seq_inds {
931 6     6 1 13 my $self = shift;
932 6         16 $self->_calculate_seq_positions();
933             my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
934             'query/subject' :
935             ($self->{_query_offset} == 3) ? 'query' :
936 6 100 100     53 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
    100          
    100          
937 6         24 return $type;
938             }
939              
940             =head2 Inherited from L
941              
942             These methods come from L
943              
944             =head2 query
945              
946             Title : query
947             Usage : my $query = $hsp->query
948             Function: Returns a SeqFeature representing the query in the HSP
949             Returns : L
950             Args : [optional] new value to set
951              
952             =cut
953              
954             sub query {
955 5590     5590 1 7036 my $self = shift;
956 5590 100       8280 unless ($self->{_created_qff}) {
957 1167         1954 $self->_query_seq_feature;
958             }
959 5590         10808 return $self->SUPER::query(@_);
960             }
961              
962             sub feature1 {
963 6939     6939 1 5280 my $self = shift;
964 6939 100 66     16737 if (! $self->{_finished_new} || $self->{_making_qff}) {
965 1331 50       1917 return $self->{_sim1} if $self->{_sim1};
966 1331         3192 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
967 1331         3219 return $self->{_sim1};
968             }
969 5608 100       7338 unless ($self->{_created_qff}) {
970 9         30 $self->_query_seq_feature;
971             }
972 5608         8455 return $self->SUPER::feature1(@_);
973             }
974              
975             =head2 hit
976              
977             Title : hit
978             Usage : my $hit = $hsp->hit
979             Function: Returns a SeqFeature representing the hit in the HSP
980             Returns : L
981             Args : [optional] new value to set
982              
983             =cut
984              
985             sub hit {
986 4390     4390 1 9202 my $self = shift;
987 4390 100       6241 unless ($self->{_created_sff}) {
988 550         997 $self->_subject_seq_feature;
989             }
990 4390         8350 return $self->SUPER::hit(@_);
991             }
992              
993             sub feature2 {
994 4408     4408 1 5797 my $self = shift;
995 4408 50 33     10709 if (! $self->{_finished_new} || $self->{_making_sff}) {
996 0 0       0 return $self->{_sim2} if $self->{_sim2};
997 0         0 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
998 0         0 return $self->{_sim2};
999             }
1000 4408 100       5514 unless ($self->{_created_sff}) {
1001 12         39 $self->_subject_seq_feature;
1002             }
1003 4408         7450 return $self->SUPER::feature2(@_);
1004             }
1005              
1006             =head2 significance
1007              
1008             Title : significance
1009             Usage : $evalue = $obj->significance();
1010             $obj->significance($evalue);
1011             Function: Get/Set the significance value
1012             Returns : numeric
1013             Args : [optional] new value to set
1014              
1015             =cut
1016              
1017             # Override significance to return the e-value or, if this is
1018             # not defined (WU-BLAST), return the p-value.
1019             sub significance {
1020 19     19 1 33 my ($self, $val) = @_;
1021 19 50 33     75 if (!defined $self->{SIGNIFICANCE} || defined $val) {
1022 19 50       70 $self->{SIGNIFICANCE} = defined $val ? $val :
    100          
    50          
1023             defined $self->evalue ? $self->evalue :
1024             defined $self->pvalue ? $$self->pvalue :
1025             undef;
1026 19         42 $self->query->significance($self->{SIGNIFICANCE});
1027             }
1028 19         76 return $self->{SIGNIFICANCE};
1029             }
1030              
1031             =head2 strand
1032              
1033             Title : strand
1034             Usage : $hsp->strand('query')
1035             Function: Retrieves the strand for the HSP component requested
1036             Returns : +1 or -1
1037             Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1038             'query' to retrieve the query strand (default)
1039              
1040             =cut
1041              
1042             sub strand {
1043 587     587 1 740 my ($self,$type) = @_;
1044              
1045 587 100 100     4422 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
    100 100        
1046 2         8 return $self->{'QUERY_STRAND'};
1047             } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1048 2         9 return $self->{'HIT_STRAND'};
1049             }
1050              
1051 583         1346 return $self->SUPER::strand($type)
1052             }
1053              
1054             =head2 score
1055              
1056             Title : score
1057             Usage : $score = $obj->score();
1058             $obj->score($value);
1059             Function: Get/Set the score value
1060             Returns : numeric
1061             Args : [optional] new value to set
1062              
1063             =head2 bits
1064              
1065             Title : bits
1066             Usage : $bits = $obj->bits();
1067             $obj->bits($value);
1068             Function: Get/Set the bits value
1069             Returns : numeric
1070             Args : [optional] new value to set
1071              
1072             =head1 Private methods
1073              
1074             =cut
1075              
1076             =head2 _calculate_seq_positions
1077              
1078             Title : _calculate_seq_positions
1079             Usage : $self->_calculate_seq_positions
1080             Function: Internal function
1081             Returns :
1082             Args :
1083              
1084              
1085             =cut
1086              
1087             sub _calculate_seq_positions {
1088 205     205   247 my ($self,@args) = @_;
1089 205 100       448 return unless ( $self->{'_sequenceschanged'} );
1090 73         92 $self->{'_sequenceschanged'} = 0;
1091 73         166 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1092             $self->query_string(),
1093             $self->hit_string() );
1094 73         138 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1095 73   100     117 my $qdir = $self->query->strand || 1;
1096 73   100     158 my $sdir = $self->hit->strand || 1;
1097 73 100       209 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1098             : ($self->query->start, $self->query->end);
1099 73 100       246 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1100             : ($self->hit->start, $self->hit->end);
1101            
1102 73         145 my $prog = $self->algorithm;
1103            
1104 73 100       354 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1105            
1106             # we infer the end of the regional sequence where the first and last
1107             # non spaces are in the homology string
1108             # then we use the HSP->length to tell us how far to read
1109             # to cut off the end of the sequence
1110            
1111 4         5 my ($start, $rest) = (0,0);
1112 4 50       102 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1113 4 100       28 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1114             }
1115            
1116 4         12 $seqString = substr($seqString, $start, $rest);
1117 4         6 $qseq = substr($qseq, $start, $rest);
1118 4         5 $sseq = substr($sseq, $start, $rest);
1119              
1120             # commented out 10/29/08
1121             # removing frameshift symbols doesn't take into account the following:
1122             # 1) does not remove the same point in the homology string (get
1123             # positional errors)
1124             # 2) adjustments to the overall position in the string due to the
1125             # frameshift must be taken into consideration (get balancing errors)
1126             #
1127             # Frameshifts will be handled directly in the main loop.
1128             # --chris
1129            
1130             # fasta reports some extra 'regional' sequence information
1131             # we need to clear out first
1132             # this seemed a bit insane to me at first, but it appears to
1133             # work --jason
1134            
1135             #$qseq =~ s![\\\/]!!g;
1136             #$sseq =~ s![\\\/]!!g;
1137             }
1138              
1139 73 100 66     297 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1140 2         4 $self->_calculate_seq_offsets();
1141             }
1142            
1143 73         102 my ($qfs, $sfs) = (0,0);
1144             CHAR_LOOP:
1145 73         177 for my $pos (0..CORE::length($seqString)-1) {
1146 30602         37373 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1147 30602         28227 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1148             # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1149 30602         19772 ($qfs, $sfs) = (0,0);
1150 30602 50 100     131063 my ($mchar, $qchar, $schar) = (
    50          
1151             unpack("x$pos A1",$seqString) || ' ',
1152             $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1153             $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1154             );
1155 30602         27552 my $matchtype = '';
1156 30602         18666 my ($qgap, $sgap) = (0,0);
1157 30602 100 100     122133 if( $mchar eq '+' || $mchar eq '.') { # conserved
    100 100        
    50          
1158 757         2125 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1159 757         1805 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1160 757         597 $matchtype = 'conserved';
1161             } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1162 25371         62511 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1163 25371         52227 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1164 25371         18981 $matchtype = 'identical';
1165             } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1166 4474 100       5766 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
    50          
1167             $qchar eq '\\' ? 1 : # base deleted to match frame
1168             0;
1169 4474 50       5669 $sfs = $schar eq '/' ? -1 :
    50          
1170             $schar eq '\\' ? 1 :
1171             0;
1172 4474 100       9574 if ($qfs) {
    50          
    100          
    100          
1173             # Frameshifts are tricky.
1174            
1175             # '/' indicates that the next residue is shifted back one
1176             # (-1) frame position and is a deletion of one base (so to
1177             # correctly align, a base is inserted). That residue should only
1178             # occupy two sequence positions instead of three.
1179            
1180             # '\' indicates that the next residue is shifted forward
1181             # one (+1) frame position and is an insertion of one base (so to
1182             # correctly align, a base is removed). That residue should
1183             # occupy four sequence positions instead of three.
1184            
1185             # Note that gaps are not counted across from frameshifts.
1186             # Frameshift indices are a range of positions starting in the
1187             # previous sequence position in which the frameshift occurs;
1188             # they are ambiguous by nature.
1189 2         9 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1190 2         3 $matchtype = "$qfs frameshift-query";
1191 2         2 $sgap = $qgap = 1;
1192             }
1193             elsif ($sfs) {
1194 0         0 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1195 0         0 $matchtype = "$sfs frameshift-sbcjt";
1196 0         0 $sgap = $qgap = 1;
1197             }
1198             elsif ($qchar eq $self->{GAP_SYMBOL}) {
1199             # gap are counted as being in the immediately preceeding residue
1200             # position; for translated sequences this is not the start of
1201             # the previous codon, but the third codon position
1202 512         1038 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1203 512         382 $matchtype = 'gap-query';
1204 512         368 $qgap++;
1205             }
1206             elsif ($schar eq $self->{GAP_SYMBOL}) {
1207 491         1030 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1208 491         397 $matchtype = 'gap-sbjct';
1209 491         334 $sgap++;
1210             }
1211             else {
1212             # if not a gap or frameshift in either seq, must be mismatch
1213 3469         9045 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1214 3469         8964 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1215 3469         2887 $matchtype = 'mismatch';
1216             }
1217             # always add a nomatch unless the current position in the seq is a gap
1218 4474 100       5405 if (!$sgap) {
1219 3981         7779 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1220             }
1221 4474 100       5118 if (!$qgap) {
1222 3960         7756 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1223             }
1224             } else {
1225 0         0 $self->warn("Unknown midline character: [$mchar]");
1226             }
1227             # leave in and uncomment for future debugging
1228             #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1229             # $resCount_query,
1230             # $qchar,
1231             # $mchar,
1232             # $schar,
1233             # $resCount_sbjct,
1234             # $matchtype,
1235             # ($self->{_query_offset} * $qdir),
1236             # ($self->{_sbjct_offset} * $sdir)));
1237 30602 100       40085 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1238 30602 100       49720 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1239             }
1240 73         157 return 1;
1241             }
1242              
1243             sub _calculate_seq_offsets {
1244 107     107   99 my $self = shift;
1245 107         229 my $prog = $self->algorithm;
1246 107         265 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1247 107 100       510 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
    100          
1248 68         74 $self->{_sbjct_offset} = 3;
1249 68 100 66     305 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1250 67         64 $self->{_query_offset} = 3;
1251             }
1252             } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1253 3         6 $self->{_query_offset} = 3;
1254             }
1255 107         123 1;
1256             }
1257              
1258             =head2 n
1259              
1260             See documentation in L
1261              
1262             =cut
1263              
1264             sub n {
1265 113     113 1 226 my $self = shift;
1266 113 50       221 if(@_) { $self->{'N'} = shift; }
  0         0  
1267             # note that returning 1 is completely an assumption
1268 113 100       390 defined $self->{'N'} ? $self->{'N'} : 1;
1269             }
1270              
1271             =head2 range
1272              
1273             See documentation in L
1274              
1275             =cut
1276              
1277             sub range {
1278 1659     1659 1 1748 my ($self, $seqType) = @_;
1279              
1280 1659   100     2177 $seqType ||= 'query';
1281 1659 100       2443 $seqType = 'sbjct' if $seqType eq 'hit';
1282              
1283 1659         1217 my ($start, $end);
1284 1659 100       2029 if( $seqType eq 'query' ) {
1285 840         1147 $start = $self->query->start;
1286 840         1193 $end = $self->query->end;
1287             }
1288             else {
1289 819         1170 $start = $self->hit->start;
1290 819         1239 $end = $self->hit->end;
1291             }
1292 1659         3749 return ($start, $end);
1293             }
1294              
1295              
1296             =head2 links
1297              
1298             Title : links
1299             Usage : $obj->links($newval)
1300             Function: Get/Set the Links value (from WU-BLAST)
1301             Indicates the placement of the alignment in the group of HSPs
1302             Returns : Value of links
1303             Args : On set, new value (a scalar or undef, optional)
1304              
1305              
1306             =cut
1307              
1308             sub links{
1309 54     54 1 53 my $self = shift;
1310              
1311 54 50       103 return $self->{LINKS} = shift if @_;
1312 54         159 return $self->{LINKS};
1313             }
1314              
1315             =head2 hsp_group
1316              
1317             Title : hsp_group
1318             Usage : $obj->hsp_group($newval)
1319             Function: Get/Set the Group value (from WU-BLAST)
1320             Indicates a grouping of HSPs
1321             Returns : Value of group
1322             Args : On set, new value (a scalar or undef, optional)
1323              
1324              
1325             =cut
1326              
1327             sub hsp_group {
1328 12     12 1 19 my $self = shift;
1329              
1330 12 50       38 return $self->{HSP_GROUP} = shift if @_;
1331 12         43 return $self->{HSP_GROUP};
1332             }
1333              
1334             =head2 hit_features
1335              
1336             Title : hit_features
1337             Usage : $obj->hit_features($newval)
1338             Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1339             output), which is a string of overlapping or nearby features in HSP
1340             hit
1341             Returns : Value of hit features, if present
1342             Args : On set, new value (a scalar or undef, optional)
1343              
1344              
1345             =cut
1346              
1347             sub hit_features {
1348 6     6 1 28 my $self = shift;
1349              
1350 6 50       24 return $self->{HIT_FEATURES} = shift if @_;
1351 6         27 return $self->{HIT_FEATURES};
1352             }
1353              
1354             # The cigar string code is written by Juguang Xiao
1355              
1356             =head1 Brief introduction on cigar string
1357              
1358             NOTE: the concept is originally from EnsEMBL docs at
1359             http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1360             Please append to these docs if you have a better definition.
1361              
1362             Sequence alignment hits can be stored in a database as ungapped alignments.
1363             This imposes 2 major constraints on alignments:
1364              
1365             a) alignments for a single hit record require multiple rows in the database,
1366             and
1367             b) it is not possible to accurately retrieve the exact original alignment.
1368              
1369             Alternatively, sequence alignments can be stored as gapped alignments using
1370             the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1371             Alignment Report).
1372              
1373             In the cigar line format alignments are stored as follows:
1374              
1375             M: Match
1376             D: Deletion
1377             I: Insertion
1378              
1379             An example of an alignment for a hypthetical protein match is shown below:
1380              
1381              
1382             Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1383              
1384             PG P G GP R PLGP
1385              
1386             Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1387              
1388              
1389             protein_align_feature table as the following cigar line:
1390              
1391             7M4D12M2I2MD7M
1392              
1393             =head2 cigar_string
1394              
1395             Name: cigar_string
1396             Usage: $cigar_string = $hsp->cigar_string
1397             Function: Generate and return cigar string for this HSP alignment
1398             Args: No input needed
1399             Return: a cigar string
1400              
1401             =cut
1402              
1403              
1404             sub cigar_string {
1405 8     8 1 18 my ($self, $arg) = @_;
1406 8 50       16 $self->warn("this is not a setter") if(defined $arg);
1407              
1408 8 100       16 unless(defined $self->{_cigar_string}){ # generate cigar string
1409 7         18 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1410 7         24 $self->{_cigar_string} = $cigar_string;
1411             } # end of unless
1412              
1413 8         31 return $self->{_cigar_string};
1414             }
1415              
1416             =head2 generate_cigar_string
1417              
1418             Name: generate_cigar_string
1419             Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1420             Function: generate cigar string from a simple sequence of alignment.
1421             Args: the string of query and subject
1422             Return: cigar string
1423              
1424             =cut
1425              
1426             sub generate_cigar_string {
1427 7     7 1 14 my ($self, $qstr, $hstr) = @_;
1428 7         203 my @qchars = split //, $qstr;
1429 7         226 my @hchars = split //, $hstr;
1430              
1431 7 50       20 unless(scalar(@qchars) == scalar(@hchars)){
1432 0         0 $self->throw("two sequences are not equal in lengths");
1433             }
1434              
1435 7         13 $self->{_count_for_cigar_string} = 0;
1436 7         10 $self->{_state_for_cigar_string} = 'M';
1437              
1438 7         7 my $cigar_string = '';
1439 7         18 for(my $i=0; $i <= $#qchars; $i++){
1440 1917         1315 my $qchar = $qchars[$i];
1441 1917         1158 my $hchar = $hchars[$i];
1442 1917 100 100     4522 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
    100          
    50          
1443 1837         1781 $cigar_string .= $self->_sub_cigar_string('M');
1444             }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1445 13         16 $cigar_string .= $self->_sub_cigar_string('D');
1446             }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1447 67         73 $cigar_string .= $self->_sub_cigar_string('I');
1448             }else{
1449 0         0 $self->throw("Impossible state that 2 gaps on each seq aligned");
1450             }
1451             }
1452 7         12 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1453 7         149 return $cigar_string;
1454             }
1455              
1456             # an internal method to help generate cigar string
1457              
1458             sub _sub_cigar_string {
1459 1924     1924   1414 my ($self, $new_state) = @_;
1460              
1461 1924         1093 my $sub_cigar_string = '';
1462 1924 100       1862 if($self->{_state_for_cigar_string} eq $new_state){
1463 1835         1338 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1464             }else{
1465             $sub_cigar_string .= $self->{_count_for_cigar_string}
1466 89 100       139 unless $self->{_count_for_cigar_string} == 1;
1467 89         71 $sub_cigar_string .= $self->{_state_for_cigar_string};
1468 89         73 $self->{_count_for_cigar_string} = 1;
1469 89         64 $self->{_state_for_cigar_string} = $new_state;
1470             }
1471 1924         3745 return $sub_cigar_string;
1472             }
1473              
1474             # needed before seqfeatures can be made
1475             sub _pre_seq_feature {
1476 1177     1177   1141 my $self = shift;
1477 1177         1246 my $algo = $self->{ALGORITHM};
1478              
1479 1177         1253 my ($queryfactor, $hitfactor) = (0,0);
1480 1177         1046 my ($hitmap, $querymap) = (1,1);
1481 1177 100 100     13012 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
    100 100        
    100 100        
    100          
1482 7         8 $hitfactor = 1;
1483 7         9 $hitmap = 3;
1484             }
1485             elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1486 25         30 $queryfactor = 1;
1487 25         26 $querymap = 3;
1488             }
1489             elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1490 286 100       750 if ($2) {
1491 126         175 $hitmap = $querymap = 3;
1492             }
1493 286         320 $hitfactor = 1;
1494 286         269 $queryfactor = 1;
1495             }
1496             elsif ($algo =~ /^RPS-BLAST/) {
1497 1 50       4 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1498 0         0 $queryfactor = 1;
1499 0         0 $querymap = 3;
1500             }
1501 1         2 $hitfactor = 0;
1502             }
1503             else {
1504 858         1585 my $stranded = lc substr($self->{STRANDED}, 0,1);
1505 858 100 66     2623 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1506 858 100 100     3639 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1507             }
1508 1177         1379 $self->{_query_factor} = $queryfactor;
1509 1177         1332 $self->{_hit_factor} = $hitfactor;
1510 1177         1262 $self->{_hit_mapping} = $hitmap;
1511 1177         1597 $self->{_query_mapping} = $querymap;
1512             }
1513              
1514             # make query seq feature
1515             sub _query_seq_feature {
1516 1176     1176   979 my $self = shift;
1517 1176         1355 $self->{_making_qff} = 1;
1518 1176         1280 my $qs = $self->{QUERY_START};
1519 1176         1208 my $qe = $self->{QUERY_END};
1520 1176 100       2168 unless (defined $self->{_query_factor}) {
1521 1133         1885 $self->_pre_seq_feature;
1522             }
1523 1176         1041 my $queryfactor = $self->{_query_factor};
1524              
1525 1176 50 33     3561 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
  0         0  
1526              
1527 1176         1077 my $strand;
1528 1176 100       1762 if ($qe > $qs) { # normal query: start < end
1529 1106 100       1431 if ($queryfactor) {
1530 843         825 $strand = 1;
1531             }
1532             else {
1533 263         287 $strand = undef;
1534             }
1535             }
1536             else {
1537 70 50       139 if ($queryfactor) {
1538 70         75 $strand = -1;
1539             }
1540             else {
1541 0         0 $strand = undef;
1542             }
1543 70         155 ($qs,$qe) = ($qe,$qs);
1544             }
1545              
1546             # Note: many of these data are not query- and hit-specific.
1547             # Only start, end, name, length are.
1548             # We could be more efficient by only storing this info once.
1549             # steve chervitz --- Sat Apr 5 00:55:07 2003
1550              
1551 1176   33     2169 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1552 1176         2447 $sim1->start($qs);
1553 1176         2249 $sim1->end($qe);
1554 1176         3222 $sim1->significance($self->{EVALUE});
1555 1176         2844 $sim1->bits($self->{BITS});
1556 1176         2683 $sim1->score($self->{SCORE});
1557 1176         2069 $sim1->strand($strand);
1558 1176         2825 $sim1->seq_id($self->{QUERY_NAME});
1559 1176         2318 $sim1->seqlength($self->{QUERY_LENGTH});
1560 1176         2403 $sim1->source_tag($self->{ALGORITHM});
1561 1176         2144 $sim1->seqdesc($self->{QUERY_DESC});
1562 1176 100       4183 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1563             # to determine frame from something like FASTXY which doesn't
1564             # report the frame
1565 1176         1327 my $qframe = $self->{QUERY_FRAME};
1566            
1567 1176 100 100     5077 if (defined $strand && !defined $qframe && $queryfactor) {
    100 66        
1568 603         738 $qframe = ( $qs % 3 ) * $strand;
1569             }
1570             elsif (!defined $strand) {
1571 263         279 $qframe = 0;
1572             }
1573            
1574 1176 50       4030 if( $qframe =~ /^([+-])?([0-3])/ ) {
1575 1176   100     3838 my $dir = $1 || '+';
1576 1176 50 33     4798 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1577 0         0 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1578             }
1579 1176 100       3395 $qframe = $2 != 0 ? $2 - 1 : $2;
1580             } else {
1581 0         0 $self->warn("Unknown query frame ($qframe)");
1582 0         0 $qframe = 0;
1583             }
1584            
1585 1176         2465 $sim1->frame($qframe);
1586 1176         2568 $self->SUPER::feature1($sim1);
1587              
1588 1176         1307 $self->{_created_qff} = 1;
1589 1176         1590 $self->{_making_qff} = 0;
1590             }
1591              
1592             # make subject seq feature
1593             sub _subject_seq_feature {
1594 562     562   517 my $self = shift;
1595 562         769 $self->{_making_sff} = 1;
1596 562         719 my $hs = $self->{HIT_START};
1597 562         664 my $he = $self->{HIT_END};
1598 562 100       992 unless (defined $self->{_hit_factor}) {
1599 44         95 $self->_pre_seq_feature;
1600             }
1601 562         530 my $hitfactor = $self->{_hit_factor};
1602              
1603 562 50 33     1861 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
  0         0  
1604              
1605 562         447 my $strand;
1606 562 100       1158 if ($he > $hs) { # normal subject
1607 455 100       610 if ($hitfactor) {
1608 199         287 $strand = 1;
1609             }
1610             else {
1611 256         275 $strand = undef;
1612             }
1613             }
1614             else {
1615 107 100       182 if ($hitfactor) {
1616 106         129 $strand = -1;
1617             }
1618             else {
1619 1         3 $strand = undef;
1620             }
1621 107         204 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1622             }
1623              
1624 562   33     2156 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1625 562         1063 $sim2->start($hs);
1626 562         996 $sim2->end($he);
1627 562         1238 $sim2->significance($self->{EVALUE});
1628 562         1282 $sim2->bits($self->{BITS});
1629 562         1185 $sim2->score($self->{SCORE});
1630 562         965 $sim2->strand($strand);
1631 562         1151 $sim2->seq_id($self->{HIT_NAME});
1632 562         1087 $sim2->seqlength($self->{HIT_LENGTH});
1633 562         1290 $sim2->source_tag($self->{ALGORITHM});
1634 562         940 $sim2->seqdesc($self->{HIT_DESC});
1635 562 100       1949 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1636 562         725 my $hframe = $self->{HIT_FRAME};
1637            
1638 562 100 100     2367 if (defined $strand && !defined $hframe && $hitfactor) {
    100 66        
1639 3         6 $hframe = ( $hs % 3 ) * $strand;
1640             }
1641             elsif (!defined $strand) {
1642 257         263 $hframe = 0;
1643             }
1644            
1645 562 50       2011 if( $hframe =~ /^([+-])?([0-3])/ ) {
1646 562   100     2091 my $dir = $1 || '+';
1647 562 50 33     1641 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1648 0         0 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1649             }
1650 562 100       2173 $hframe = $2 != 0 ? $2 - 1 : $2;
1651             } else {
1652 0         0 $self->warn("Unknown subject frame ($hframe)");
1653 0         0 $hframe = 0;
1654             }
1655            
1656 562         1305 $sim2->frame($hframe);
1657 562         1455 $self->SUPER::feature2($sim2);
1658              
1659 562         804 $self->{_created_sff} = 1;
1660 562         853 $self->{_making_sff} = 0;
1661             }
1662              
1663             # before calling the num_* methods
1664             sub _pre_similar_stats {
1665 438     438   397 my $self = shift;
1666 438         529 my $identical = $self->{IDENTICAL};
1667 438         460 my $conserved = $self->{CONSERVED};
1668 438         384 my $percent_id = $self->{PERCENT_IDENTITY};
1669              
1670 438 50       691 if (! defined $identical) {
1671 0 0       0 if (! defined $percent_id) {
1672 0         0 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1673 0         0 $identical = 0;
1674             }
1675             else {
1676 0         0 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1677             }
1678             }
1679              
1680 438 50       635 if (! defined $conserved) {
1681             $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1682 0 0       0 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1683 0         0 $conserved = $identical;
1684             }
1685 438         449 $self->{IDENTICAL} = $identical;
1686 438         433 $self->{CONSERVED} = $conserved;
1687 438         727 $self->{_did_presimilar} = 1;
1688             }
1689              
1690             # before calling the frac_* methods
1691              
1692             sub _pre_frac {
1693 105     105   96 my $self = shift;
1694            
1695 105         125 my $hsp_len = $self->{HSP_LENGTH};
1696 105         132 my $hit_len = $self->{HIT_LENGTH};
1697 105         115 my $query_len = $self->{QUERY_LENGTH};
1698              
1699 105         201 my $identical = $self->num_identical;
1700 105         206 my $conserved = $self->num_conserved;
1701              
1702 105         147 $self->{_did_prefrac} = 1;
1703 105         85 my $logical;
1704 105 50       183 if( $hsp_len ) {
1705 105         190 $self->length('total', $hsp_len);
1706 105         257 $logical = $self->_logical_length('total');
1707 105         304 $self->frac_identical( 'total', $identical / $hsp_len);
1708 105         267 $self->frac_conserved( 'total', $conserved / $hsp_len);
1709             }
1710 105 100       176 if( $hit_len ) {
1711 104         166 $logical = $self->_logical_length('hit');
1712 104         276 $self->frac_identical( 'hit', $identical / $logical);
1713 104         219 $self->frac_conserved( 'hit', $conserved / $logical);
1714             }
1715 105 100       191 if( $query_len ) {
1716 104         178 $logical = $self->_logical_length('query');
1717 104         255 $self->frac_identical( 'query', $identical / $logical) ;
1718 104         227 $self->frac_conserved( 'query', $conserved / $logical);
1719             }
1720             }
1721              
1722             # before calling gaps()
1723             # This relies first on passed parameters (parser-dependent), then on gaps
1724             # calculated by seq_inds() (if set), then falls back to directly checking
1725             # for '-' or '.' as a last resort
1726              
1727             sub _pre_gaps {
1728 378     378   428 my $self = shift;
1729 378         429 my $query_gaps = $self->{QUERY_GAPS};
1730 378         523 my $query_seq = $self->{QUERY_SEQ};
1731 378         360 my $hit_gaps = $self->{HIT_GAPS};
1732 378         478 my $hit_seq = $self->{HIT_SEQ};
1733 378         411 my $gaps = $self->{HSP_GAPS};
1734              
1735 378         474 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1736 378 50       893 if( defined $query_gaps ) {
    100          
1737 0         0 $self->gaps('query', $query_gaps);
1738             } elsif( defined $query_seq ) {
1739 363 50       931 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
    100          
1740             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1741             : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1742 363   100     973 my $offset = $self->{'_query_offset'} || 1;
1743 363         910 $self->gaps('query', $qg/$offset);
1744             }
1745 378 50       959 if( defined $hit_gaps ) {
    100          
1746 0         0 $self->gaps('hit', $hit_gaps);
1747             } elsif( defined $hit_seq ) {
1748 358 100       838 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
    100          
1749             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1750             : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1751 358   100     902 my $offset = $self->{'_sbjct_offset'} || 1;
1752 358         602 $self->gaps('hit', $hg/$offset);
1753             }
1754 378 100       633 if( ! defined $gaps ) {
1755 329         461 $gaps = $self->gaps("query") + $self->gaps("hit");
1756             }
1757 378         610 $self->gaps('total', $gaps);
1758             }
1759              
1760             # before percent_identity
1761             sub _pre_pi {
1762 42     42   68 my $self = shift;
1763 42         93 $self->{_did_prepi} = 1;
1764 42 50 33     306 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );
1765             }
1766              
1767             1;