File Coverage

blib/lib/Bio/Translator/Utils.pm
Criterion Covered Total %
statement 120 124 96.7
branch 42 52 80.7
condition 9 14 64.2
subroutine 17 17 100.0
pod 6 6 100.0
total 194 213 91.0


line stmt bran cond sub pod time code
1             package Bio::Translator::Utils;
2              
3 7     7   187716 use strict;
  7         17  
  7         267  
4 7     7   55 use warnings;
  7         16  
  7         248  
5              
6             =head1 NAME
7              
8             Bio::Translator::Utils - Utilities that requrie a translation table
9              
10             =head1 SYNOPSIS
11              
12             use Bio::Translator::Utils;
13              
14             # Same constructor as Bio::Translator
15             my $utils = new Bio::Translator::Utils();
16             my $utils = custom Bio::Translator( \$custom_table );
17              
18             my $codons = $utils->codons( $residue );
19             my $regex = $utils->regex( $residue );
20             my $indices = $utils->find( $residue );
21              
22             my $orf = $utils->getORF( $seq_ref );
23             my $cds = $utils->getCDS( $seq_ref );
24              
25             my $frames = $utils->nonstop( $seq_ref );
26              
27             =head1 DESCRIPTION
28              
29             See Bio::Translator for more info. Utils contains utilites that require
30             knowledge of the translation table.
31              
32             =cut
33              
34 7     7   67 use base qw(Bio::Translator);
  7         12  
  7         9731  
35             __PACKAGE__->mk_accessors(qw( _regexes ));
36              
37 7     7   47 use Params::Validate;
  7         10  
  7         419  
38              
39 7     7   40 use Bio::Translator::Validations qw(:validations :regexes);
  7         13  
  7         1446  
40              
41 7     7   37 use Bio::Util::DNA qw( cleanDNA );
  7         12  
  7         329  
42 7     7   44 use Bio::Util::AA qw( $aa_match );
  7         10  
  7         15543  
43              
44             # Default values
45             our $DEFAULT_STRAND = 1;
46             our $DEFAULT_SEARCH_STRAND = 0;
47              
48             # Precompiled regular expressions for SPOT rule and to save time
49             our $BOOLEAN_REGEX = qr/^[01]$/;
50             our $INTEGER_REGEX = qr/^\d+$/;
51             our $STRAND_REGEX = qr/^[+-]?1$/;
52             our $SEARCH_STRAND_REGEX = qr/^[+-]?[01]$/;
53             our $RESIDUE_REGEX = qr/^(?:$aa_match|\+|start|stop|lower|upper)$/;
54             our $STRICT_REGEX = qr/^[012]$/;
55              
56             sub _new {
57 6     6   105 my $self = shift->SUPER::_new(@_);
58 6         115 $self->_regexes( [ {}, {} ] );
59 6         126 return $self;
60             }
61              
62             =head1 METHODS
63              
64             =cut
65              
66             =head2 codons
67              
68             my $codon_array = $translator->codons( $residue);
69             my $codon_array = $translator->codons( $residue, \%params );
70              
71             Returns a list of codons for a particular residue or start codon. In addition
72             to the one-letter codes for amino acids, the following are valid inputs for the
73             residue:
74              
75             start: Start codons (you may also use "+" which is what the translator
76             uses as the 1-letter code for start codons)
77             stop: Stop codons (you may also use "*" which is the 1-letter code)
78             lower: Start or stop codons, depending up on strand
79             upper: Start or stop codons, depending up on strand
80              
81             "lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
82             on the positive strand, lower matches the start, and upper matches them stop).
83             Valid options for the params hash are:
84              
85             strand: 1 or -1; default = 1
86              
87             =cut
88              
89             sub codons {
90 27     27 1 7150 my $self = shift;
91              
92             # Get the residue and the optional validation hash
93 27         726 my ( $residue, @p ) = validate_pos(
94             @_,
95             {
96             type => Params::Validate::SCALAR,
97             regex => $RESIDUE_REGEX
98             },
99             { type => Params::Validate::HASHREF, default => {} }
100             );
101              
102 20         498 my %p = validate( @p, { strand => $VAL_STRAND } );
103              
104             # Set the reverse comlement variable
105 19 100       274 my $rc = $p{strand} == 1 ? 0 : 1;
106              
107             # Format start/stop to be '+' and '*' which is how translator stores them
108 19 50       107 if ( $residue eq 'stop' ) { $residue = '*' }
  0 100       0  
    100          
    100          
109 2         5 elsif ( $residue eq 'start' ) { $residue = '+' }
110              
111             # Lower bound is stop on the - strand, start on the + strand. Upper bound
112             # is the reverse
113 2 100       6 elsif ( $residue eq 'lower' ) { $residue = $rc ? '*' : '+' }
114 2 100       7 elsif ( $residue eq 'upper' ) { $residue = $rc ? '+' : '*' }
115              
116             # Capitalize all other residues
117 13         30 else { $residue = uc $residue }
118              
119             # Get the codons array or set it to the empty array
120 19   50     93 my $codons = $self->table->aa2codons->[$rc]->{$residue} || [];
121              
122             # Return a copy of the arrayref so that the internal array can't get
123             # modified
124 19         295 return [@$codons];
125             }
126              
127             =head2 regex
128              
129             my $regex = $translator->regex( $residue );
130             my $regex = $translator->regex( $residue, \%params );
131              
132             Returns a regular expression matching codons for a particular amino acid
133             residue. In addition to the one-letter codes for amino acids, the following are
134             valid inputs for the residue:
135              
136             start: Start codons (you may also use "+" which is what the translator
137             uses as the 1-letter code for start codons)
138             stop: Stop codons (you may also use "*" which is the 1 letter code)
139             lower: Start or stop codons, depending up on strand
140             upper: Start or stop codons, depending up on strand
141              
142             "lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
143             on the positive strand, lower matches the start, and upper matches the stop).
144             Valid options for the params hash are:
145              
146             strand: 1 or -1; default = 1
147              
148             =cut
149              
150             sub regex {
151 26     26 1 6257 my $self = shift;
152              
153 26         810 my ( $residue, @p ) = validate_pos(
154             @_,
155             { type => Params::Validate::SCALAR },
156             { type => Params::Validate::HASHREF, default => {} }
157             );
158              
159 24         366 my %p = validate(
160             @p,
161             {
162              
163             strand => $VAL_STRAND
164             }
165             );
166              
167             # Get the index for the regex array
168 23 100       363 my $rc = $p{strand} == 1 ? 0 : 1;
169              
170             # Get the regex, and if it is defined, return it
171 23         76 my $regex = $self->_regexes->[$rc]->{$residue};
172 23 100       158 if ( defined $regex ) {
173 7         30 return $regex;
174             }
175              
176             # If the regex wasn't defined, build it by calling codons
177 16         21 $regex = join '|', @{ $self->codons( $residue, \%p ) };
  16         48  
178 12         461 $regex = qr/$regex/;
179              
180             # Cache the regex and return it
181 12         62 $self->_regexes->[$rc]->{$residue} = $regex;
182              
183 12         100 return $regex;
184             }
185              
186             =head2 find
187              
188             my $locations = $translator->find( $seq_ref, $residue );
189             my $locations = $translator->find( $seq_ref, $residue, \%params );
190              
191             Find the indexes of a given residue in a sequence. In addition to the
192             one-letter codes for amino acids, the following are valid inputs for the
193             residue:
194              
195             start: Start codons (you may also use "+" which is what the translator
196             uses as the 1-letter code for start codons)
197             stop: Stop codons (you may also use "*" which is the 1 letter code)
198             lower: Start or stop codons, depending up on strand
199             upper: Start or stop codons, depending up on strand
200              
201             "lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
202             on the positive strand, lower matches the start, and upper matches the stop).
203             Valid options for the params hash are:
204              
205             strand: 1 or -1; default = 1
206              
207             =cut
208              
209             sub find {
210 8     8 1 4842 my $self = shift;
211              
212 8         618 my ( $seq_ref, $residue, @p ) = validate_pos(
213             @_,
214             { type => Params::Validate::SCALARREF | Params::Validate::SCALAR },
215             { type => Params::Validate::SCALAR },
216             { type => Params::Validate::HASHREF, default => {} }
217             );
218              
219 6 50       31 $seq_ref = \$seq_ref unless ( ref $seq_ref );
220              
221             # Strand is unnecessary for now. Uncomment this section when other options
222             # are added to find.
223             # my %p = validate(
224             # @p,
225             # {
226             # strand => {
227             # default => $DEFAULT_STRAND,
228             # regex => $STRAND_REGEX,
229             # type => Params::Validate::SCALAR
230             # }
231             # }
232             # );
233             #
234             # my $regex = $self->regex( $residue, \%p );
235              
236 6         14 my $regex = $self->regex( $residue, @p );
237              
238             # Use a look-ahead in the regular expression. For instance, if the amino
239             # acid has the codon AAA, and you have a poly-A region, the match will be
240             # at every single base, not every 3 bases.
241 4         6 my @positions;
242 4         133 while ( $$seq_ref =~ m/(?=$regex)/ig ) {
243 13         74 push @positions, pos($$seq_ref);
244             }
245              
246 4         16 return \@positions;
247             }
248              
249             =head2 getORF
250              
251             my $orf_arrayref = $translator->getORF( $seq_ref );
252             my $orf_arrayref = $translator->getORF( $seq_ref, \%params );
253              
254             This will get the longest region between stops and return lower and
255             upper bounds, and the strand. Valid options for the params hash are:
256              
257             strand: 0, 1 or -1; default = 0 (meaning search both strands)
258             lower: integer between 0 and length; default = 0
259             upper: integer between 0 and length; default = length
260              
261             Lower and upper are used to specify bounds between which you are searching.
262             Suppose the following was the longest ORF:
263              
264             0 1 2 3 4 5 6 7 8 9 10
265             T A A A T C T A A G
266             ***** *****
267             <--------->
268              
269             This will return:
270              
271             [ 3, 9, 1 ]
272              
273             You can also specify which strand you are looking for the ORF to be on.
274              
275             For ORFs starting at the very beginning of the strand or trailing off the end,
276             but not in phase with the start or ends, this method will cut at the last
277             complete codon. For example, if the following was the longest ORF:
278              
279             0 1 2 3 4 5 6 7 8 9 10
280             A C G T A G T T T A
281             *****
282             <--------------->
283              
284             getORF will return:
285              
286             [ 1, 10, 1 ]
287              
288             The distance between lower and upper will always be a multiple of 3. This is to
289             make it clear which frame the ORF is in. The resulting hash may be passed to
290             the translate method.
291              
292             Example:
293              
294             my $orf_ref = $translator->getORF( \'TAGAAATAG' );
295             my $orf_ref = $translator->getORF( \$seq, { strand => -1 } );
296             my $orf_ref = $translator->getORF(
297             \$seq,
298             {
299             lower => $lower,
300             upper => $upper
301             }
302             );
303              
304             =cut
305              
306             sub getORF {
307 1     1 1 197 my $self = shift;
308              
309 1         8 my ( $seq_ref, @p ) = validate_seq_params(@_);
310              
311 1         23 my %p = validate(
312             @p,
313             {
314             lower => $VAL_NON_NEG_INT,
315             upper => $VAL_NON_NEG_INT,
316             strand => $VAL_SEARCH_STRAND,
317             }
318             );
319 1         21 my ( $lower, $upper ) = validate_lower_upper( delete( @p{qw/ lower upper /} ), $seq_ref );
320            
321             # Initialize the longest ORF.
322 1         4 my @ORF = ( $lower, $lower, 0 );
323              
324             # Go through each strand which we are looking in
325 1 50       5 foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) {
326              
327             # Initialize lower bounds and regular expression for stop
328 2         5 my @lowers = map { $_ + $lower } ( 0 .. 2 );
  6         11  
329 2         10 my $stop_regex = $self->regex( '*', { strand => $strand } );
330              
331             # Look for all the stops in our sequence using a regular expression. A
332             # lookahead is used to cope with the possibility of overlapping stop
333             # codons
334              
335 2         14 pos($$seq_ref) = $lower;
336              
337 2         11 while ( $$seq_ref =~ /(?=stop_regex)/gx ) {
338              
339             # Get the location of the upper bound. Add 3 for the length of the
340             # stop codon if we are on the + strand.
341 0 0       0 my $cur_upper = pos($$seq_ref) + ( $strand == 1 ? 3 : 0 );
342              
343             # End the iteration if we are out of range
344 0 0       0 last if ( $cur_upper > $upper );
345              
346             # Call our helper function
347 0         0 $self->_getORF( $strand, \@lowers, $cur_upper, $lower, \@ORF );
348             }
349              
350             # Now evaluate for the last three ORFS
351 2         5 foreach my $i ( 0 .. 2 ) {
352 6         9 my $cur_upper = $upper - $i;
353 6         21 $self->_getORF( $strand, \@lowers, $cur_upper, $lower, \@ORF );
354             }
355              
356             # NOTE: Perl's regular expression engine could be faster than code
357             # execution, so it may be faster to find ORFS using regular expression
358             # matching an entire ORF.
359             # m/(?=(^|$stop)((.{3})*)($stop|$))/g
360             }
361              
362 1         8 return \@ORF;
363             }
364              
365             # Helper function for getORF above.
366             sub _getORF {
367 6     6   8 my $self = shift;
368 6         11 my ( $strand, $lowers, $upper, $offset, $longest ) = @_;
369              
370             # Calculate the frame relative to the starting offset
371 6         8 my $frame = ( $upper - $offset ) % 3;
372              
373             # Compare if this is better than the longest ORF
374 6         18 $self->_compare_regions( $longest, [ $lowers->[$frame], $upper, $strand ] );
375              
376             # Mark the lower bound for this frame
377 6         19 $lowers->[$frame] = $upper;
378             }
379              
380             =head2 getCDS
381              
382             my $cds_ref = $translator->getCDS( $seq_ref );
383             my $cds_ref = $translator->getCDS( $seq_ref, \%params );
384              
385             Return the strand and boundaries of the longest CDS similar to getORF.
386              
387             0 1 2 3 4 5 6 7 8 9 10
388             A T G A A A T A A G
389             >>>>> *****
390             <--------------->
391              
392             Will return:
393              
394             [ 0, 9, 1 ]
395              
396             Valid options for the params hash are:
397              
398             strand: 0, 1 or -1; default = 0 (meaning search both strands)
399             lower: integer between 0 and length; default = 0
400             upper: integer between 0 and length; default = length
401             strict: 0, 1 or 2; default = 1
402              
403             Strict controls how strictly getCDS functions. There are 3 levels of
404             strictness, enumerated 0, 1 and 2. 2 is the most strict, and in that mode, a
405             region will only be considered a CDS if both the start and stop is found. In
406             strict level 1, if a start is found, but no stop is present before the end of
407             the sequence, the CDS will run until the end of the sequence. Strict level 0
408             assumes that start codon is present in each frame just before the start of the
409             molecule. Level 1 is a pretty safe bet, so that is the default.
410              
411             Example:
412              
413             my $cds_ref = $translator->getCDS(\'ATGAAATAG');
414             my $cds_ref = $translator->getCDS(\$seq, { strand => -1 } );
415             my $cds_ref = $translator->getCDS(\$seq, { strict => 2 } );
416              
417             =cut
418              
419             sub getCDS {
420 1     1 1 161 my $self = shift;
421              
422 1         8 my ( $seq_ref, @p ) = validate_seq_params(@_);
423              
424 1         30 my %p = validate(
425             @p,
426             {
427             lower => $VAL_NON_NEG_INT,
428             upper => $VAL_NON_NEG_INT,
429             strand => $VAL_SEARCH_STRAND,
430             strict => {%$VAL_OFFSET, default => 1 },
431             }
432             );
433              
434 1         19 my ( $lower, $upper ) =
435             validate_lower_upper( delete( @p{qw/ lower upper /} ), $seq_ref );
436              
437             # Initialize the longest CDS. Length is -1.
438 1         4 my @CDS = ( 0, -1, 0 );
439              
440 1 50       6 foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) {
441 2         11 my $lower_regex = $self->regex( 'lower', { strand => $strand } );
442 2         10 my $upper_regex = $self->regex( 'upper', { strand => $strand } );
443              
444             # Initialize lowers. On the + strand, we don't set the lower bounds
445             # unless strict is 0. On the - strand, we don't set the lower bounds if
446             # strict is 2. Otherwise, set the lower boudns to be the first bases.
447 3         9 my @lowers =
448             ( ( ( $strand == 1 ) && ( $p{strict} != 0 ) )
449             || ( ( $strand == -1 ) && ( $p{strict} == 2 ) ) )
450             ? (undef) x 3
451 2 100 66     41 : map { $lower + $_ } ( 0 .. 2 );
452              
453             # Similar to getORF, rather than using a regular expression to find
454             # entire coding regions, instead find individual starts and stops and
455             # react accordingly.
456             # The regular expression captures the starts and stops separately
457             # ($1 vs $2) so that it is easy to tell if a start or a stop was
458             # matched.
459              
460 2         9 pos($$seq_ref) = $lower;
461              
462 2         154 while ( $$seq_ref =~ /(?=($lower_regex)|($upper_regex))/g ) {
463 20         97 my $position = pos $$seq_ref;
464 20 50       42 last if ( $position > $upper );
465              
466 20         30 my $frame = $position % 3;
467              
468             # If the lower regex matches:
469             #
470             # In the case that it is on the '-' strand, that means a stop was
471             # found. CDSs always end on stops, so update the lower bound.
472             #
473             # Otherwise, it is on the positive strand, meaning a start was
474             # found. Internal start codons are allowed, so only set the lower
475             # bound if it is not already set.
476 20 100       51 if ($1) {
477 7 100 100     40 if ( ( $strand == -1 )
478             || ( !defined $lowers[$frame] ) )
479              
480             {
481 6         59 $lowers[$frame] = $position;
482             }
483             }
484              
485             # If the lower regex wasn't matched, the the upper one was.
486             #
487             # If this is the positive strand, that means that this is a stop
488             # codon. Compute the CDS, update if necessary, and reset the lower
489             # bound in this case.
490             #
491             # On the negative strand, that means that a start was matched.
492             # Compute the CDS, update if necessary, but don't reset the lower
493             # bound.
494              
495             else {
496 13         17 $position += 3;
497 13 50       27 last if ( $position > $upper );
498              
499 13         35 $self->_getCDS( $strand, \@lowers, $position, $lower, \@CDS );
500             }
501             }
502              
503             # If strict mode is at level 2, we don't allow CDSs to trail off the
504             # end of the molecule. We also don't allow the end to trail off if we
505             # are on the - strand and strict isn't 0.
506              
507             next
508 2 100 66     36 if ( ( $p{strict} == 2 )
      33        
509             || ( ( $strand == -1 ) && ( $p{strict} != 0 ) ) );
510              
511 1         5 foreach my $i ( 0 .. 2 ) {
512 3         5 my $end_upper = $upper - $i;
513 3         9 $self->_getCDS( $strand, \@lowers, $end_upper, $lower, \@CDS );
514             }
515             }
516              
517 1         12 return \@CDS;
518             }
519              
520             # Helper function for getCDS above.
521             sub _getCDS {
522 16     16   22 my $self = shift;
523 16         27 my ( $strand, $lowers, $upper, $offset, $longest ) = @_;
524              
525             # Calculate the frame relative to the starting offset
526 16         22 my $frame = ( $upper - $offset ) % 3;
527              
528             # Do nothing if lower bound wasn't defined
529 16 100       112 return unless ( defined $lowers->[$frame] );
530              
531             # Compare if this is better than the longest ORF
532 6         21 $self->_compare_regions( $longest, [ $lowers->[$frame], $upper, $strand ] );
533              
534             # Mark the lower bound for this frame
535 6 100       73 undef $lowers->[$frame] if ( $strand == 1 );
536             }
537              
538             # If the current range is longer than the longest range, store the range
539             sub _compare_regions {
540 12     12   19 my $self = shift;
541 12         16 my ( $longest, $current ) = @_;
542 12 100       57 @$longest = @$current
543             if ( $longest->[1] - $longest->[0] < $current->[1] - $current->[0] );
544             }
545              
546             =head2 nonstop
547              
548             my $frames = $translator->nonstop( $seq_ref );
549             my $frames = $translator->nonstop( $seq_ref, \%params );
550              
551             Returns the frames that contain no stop codons for the sequence. Frames are
552             numbered -3, -2, -1, 1, 2 and 3.
553              
554             3 ---->
555             2 ----->
556             1 ------>
557             -------
558             -1 <------
559             -2 <-----
560             -3 <----
561              
562             Valid options for the params hash are:
563              
564             strand: 0, 1 or -1; default = 0 (meaning search both strands)
565              
566             Example:
567              
568             my $frames = $translator->nonstop(\'TACGTTGGTTAAGTT'); # [ 2, 3, -1, -3 ]
569             my $frames = $translator->nonstop(\$seq, { strand => 1 } ); # [ 2, 3 ]
570             my $frames = $translator->nonstop(\$seq, { strand => -1 } ); # [ -1, -3 ]
571              
572             =cut
573              
574             sub nonstop {
575 3     3 1 1798 my $self = shift;
576              
577 3         12 my ( $seq_ref, @p ) = validate_seq_params(@_);
578              
579 3         45 my %p = validate( @p, { strand => $VAL_SEARCH_STRAND } );
580              
581             # Go through both strands
582 3         32 my @frames;
583 3 100       11 foreach my $strand ( $p{strand} == 0 ? ( 1, -1 ) : $p{strand} ) {
584 4         17 my $stop = $self->regex( '*', { strand => $strand } );
585              
586             # Go through each frame
587 4         11 foreach my $frame ( 0 .. 2 ) {
588 12 100       428 my $regex =
589             $strand == 1
590             ? qr/^.{$frame}(?:.{3})*$stop/
591             : qr/$stop(?:.{3})*.{$frame}$/;
592              
593             # Convert strand = +/-1, frame = [0,1,2] into 1 value +/-[1,2,3]
594 12 100       130 push @frames, ( $frame + 1 ) * $strand
595             unless ( $$seq_ref =~ m/$regex/ );
596             }
597             }
598              
599 3         24 return \@frames;
600             }
601              
602             1;
603              
604             =head1 AUTHOR
605              
606             Kevin Galinsky,
607              
608             =cut