File Coverage

blib/lib/Bio/Translator/Utils.pm

Criterion	Covered	Total	%
statement	120	124	96.7
branch	42	52	80.7
condition	9	14	64.2
subroutine	17	17	100.0
pod	6	6	100.0
total	194	213	91.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Translator::Utils;
2
3	7			7		187716	use strict;
	7					17
	7					267
4	7			7		55	use warnings;
	7					16
	7					248
5
6							=head1 NAME
7
8							Bio::Translator::Utils - Utilities that requrie a translation table
9
10							=head1 SYNOPSIS
11
12							use Bio::Translator::Utils;
13
14							# Same constructor as Bio::Translator
15							my $utils = new Bio::Translator::Utils();
16							my $utils = custom Bio::Translator( \$custom_table );
17
18							my $codons = $utils->codons( $residue );
19							my $regex = $utils->regex( $residue );
20							my $indices = $utils->find( $residue );
21
22							my $orf = $utils->getORF( $seq_ref );
23							my $cds = $utils->getCDS( $seq_ref );
24
25							my $frames = $utils->nonstop( $seq_ref );
26
27							=head1 DESCRIPTION
28
29							See Bio::Translator for more info. Utils contains utilites that require
30							knowledge of the translation table.
31
32							=cut
33
34	7			7		67	use base qw(Bio::Translator);
	7					12
	7					9731
35							__PACKAGE__->mk_accessors(qw( _regexes ));
36
37	7			7		47	use Params::Validate;
	7					10
	7					419
38
39	7			7		40	use Bio::Translator::Validations qw(:validations :regexes);
	7					13
	7					1446
40
41	7			7		37	use Bio::Util::DNA qw( cleanDNA );
	7					12
	7					329
42	7			7		44	use Bio::Util::AA qw( $aa_match );
	7					10
	7					15543
43
44							# Default values
45							our $DEFAULT_STRAND = 1;
46							our $DEFAULT_SEARCH_STRAND = 0;
47
48							# Precompiled regular expressions for SPOT rule and to save time
49							our $BOOLEAN_REGEX = qr/^[01]$/;
50							our $INTEGER_REGEX = qr/^\d+$/;
51							our $STRAND_REGEX = qr/^[+-]?1$/;
52							our $SEARCH_STRAND_REGEX = qr/^[+-]?[01]$/;
53							our $RESIDUE_REGEX = qr/^(?:$aa_match\|\+\|start\|stop\|lower\|upper)$/;
54							our $STRICT_REGEX = qr/^[012]$/;
55
56							sub _new {
57	6			6		105	my $self = shift->SUPER::_new(@_);
58	6					115	$self->_regexes( [ {}, {} ] );
59	6					126	return $self;
60							}
61
62							=head1 METHODS
63
64							=cut
65
66							=head2 codons
67
68							my $codon_array = $translator->codons( $residue);
69							my $codon_array = $translator->codons( $residue, \%params );
70
71							Returns a list of codons for a particular residue or start codon. In addition
72							to the one-letter codes for amino acids, the following are valid inputs for the
73							residue:
74
75							start: Start codons (you may also use "+" which is what the translator
76							uses as the 1-letter code for start codons)
77							stop: Stop codons (you may also use "*" which is the 1-letter code)
78							lower: Start or stop codons, depending up on strand
79							upper: Start or stop codons, depending up on strand
80
81							"lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
82							on the positive strand, lower matches the start, and upper matches them stop).
83							Valid options for the params hash are:
84
85							strand: 1 or -1; default = 1
86
87							=cut
88
89							sub codons {
90	27			27	1	7150	my $self = shift;
91
92							# Get the residue and the optional validation hash
93	27					726	my ( $residue, @p ) = validate_pos(
94							@_,
95							{
96							type => Params::Validate::SCALAR,
97							regex => $RESIDUE_REGEX
98							},
99							{ type => Params::Validate::HASHREF, default => {} }
100							);
101
102	20					498	my %p = validate( @p, { strand => $VAL_STRAND } );
103
104							# Set the reverse comlement variable
105	19	100				274	my $rc = $p{strand} == 1 ? 0 : 1;
106
107							# Format start/stop to be '+' and '*' which is how translator stores them
108	19	50				107	if ( $residue eq 'stop' ) { $residue = '*' }
	0	100				0
		100
		100
109	2					5	elsif ( $residue eq 'start' ) { $residue = '+' }
110
111							# Lower bound is stop on the - strand, start on the + strand. Upper bound
112							# is the reverse
113	2	100				6	elsif ( $residue eq 'lower' ) { $residue = $rc ? '*' : '+' }
114	2	100				7	elsif ( $residue eq 'upper' ) { $residue = $rc ? '+' : '*' }
115
116							# Capitalize all other residues
117	13					30	else { $residue = uc $residue }
118
119							# Get the codons array or set it to the empty array
120	19		50			93	my $codons = $self->table->aa2codons->[$rc]->{$residue} \|\| [];
121
122							# Return a copy of the arrayref so that the internal array can't get
123							# modified
124	19					295	return [@$codons];
125							}
126
127							=head2 regex
128
129							my $regex = $translator->regex( $residue );
130							my $regex = $translator->regex( $residue, \%params );
131
132							Returns a regular expression matching codons for a particular amino acid
133							residue. In addition to the one-letter codes for amino acids, the following are
134							valid inputs for the residue:
135
136							start: Start codons (you may also use "+" which is what the translator
137							uses as the 1-letter code for start codons)
138							stop: Stop codons (you may also use "*" which is the 1 letter code)
139							lower: Start or stop codons, depending up on strand
140							upper: Start or stop codons, depending up on strand
141
142							"lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
143							on the positive strand, lower matches the start, and upper matches the stop).
144							Valid options for the params hash are:
145
146							strand: 1 or -1; default = 1
147
148							=cut
149
150							sub regex {
151	26			26	1	6257	my $self = shift;
152
153	26					810	my ( $residue, @p ) = validate_pos(
154							@_,
155							{ type => Params::Validate::SCALAR },
156							{ type => Params::Validate::HASHREF, default => {} }
157							);
158
159	24					366	my %p = validate(
160							@p,
161							{
162
163							strand => $VAL_STRAND
164							}
165							);
166
167							# Get the index for the regex array
168	23	100				363	my $rc = $p{strand} == 1 ? 0 : 1;
169
170							# Get the regex, and if it is defined, return it
171	23					76	my $regex = $self->_regexes->[$rc]->{$residue};
172	23	100				158	if ( defined $regex ) {
173	7					30	return $regex;
174							}
175
176							# If the regex wasn't defined, build it by calling codons
177	16					21	$regex = join '\|', @{ $self->codons( $residue, \%p ) };
	16					48
178	12					461	$regex = qr/$regex/;
179
180							# Cache the regex and return it
181	12					62	$self->_regexes->[$rc]->{$residue} = $regex;
182
183	12					100	return $regex;
184							}
185
186							=head2 find
187
188							my $locations = $translator->find( $seq_ref, $residue );
189							my $locations = $translator->find( $seq_ref, $residue, \%params );
190
191							Find the indexes of a given residue in a sequence. In addition to the
192							one-letter codes for amino acids, the following are valid inputs for the
193							residue:
194
195							start: Start codons (you may also use "+" which is what the translator
196							uses as the 1-letter code for start codons)
197							stop: Stop codons (you may also use "*" which is the 1 letter code)
198							lower: Start or stop codons, depending up on strand
199							upper: Start or stop codons, depending up on strand
200
201							"lower" and "upper" match the respective ends of a CDS for a given strand (i.e.
202							on the positive strand, lower matches the start, and upper matches the stop).
203							Valid options for the params hash are:
204
205							strand: 1 or -1; default = 1
206
207							=cut
208
209							sub find {
210	8			8	1	4842	my $self = shift;
211
212	8					618	my ( $seq_ref, $residue, @p ) = validate_pos(
213							@_,
214							{ type => Params::Validate::SCALARREF \| Params::Validate::SCALAR },
215							{ type => Params::Validate::SCALAR },
216							{ type => Params::Validate::HASHREF, default => {} }
217							);
218
219	6	50				31	$seq_ref = \$seq_ref unless ( ref $seq_ref );
220
221							# Strand is unnecessary for now. Uncomment this section when other options
222							# are added to find.
223							# my %p = validate(
224							# @p,
225							# {
226							# strand => {
227							# default => $DEFAULT_STRAND,
228							# regex => $STRAND_REGEX,
229							# type => Params::Validate::SCALAR
230							# }
231							# }
232							# );
233							#
234							# my $regex = $self->regex( $residue, \%p );
235
236	6					14	my $regex = $self->regex( $residue, @p );
237
238							# Use a look-ahead in the regular expression. For instance, if the amino
239							# acid has the codon AAA, and you have a poly-A region, the match will be
240							# at every single base, not every 3 bases.
241	4					6	my @positions;
242	4					133	while ( $$seq_ref =~ m/(?=$regex)/ig ) {
243	13					74	push @positions, pos($$seq_ref);
244							}
245
246	4					16	return \@positions;
247							}
248
249							=head2 getORF
250
251							my $orf_arrayref = $translator->getORF( $seq_ref );
252							my $orf_arrayref = $translator->getORF( $seq_ref, \%params );
253
254							This will get the longest region between stops and return lower and
255							upper bounds, and the strand. Valid options for the params hash are:
256
257							strand: 0, 1 or -1; default = 0 (meaning search both strands)
258							lower: integer between 0 and length; default = 0
259							upper: integer between 0 and length; default = length
260
261							Lower and upper are used to specify bounds between which you are searching.
262							Suppose the following was the longest ORF:
263
264							0 1 2 3 4 5 6 7 8 9 10
265							T A A A T C T A A G
266							*** ***
267							<--------->
268
269							This will return:
270
271							[ 3, 9, 1 ]
272
273							You can also specify which strand you are looking for the ORF to be on.
274
275							For ORFs starting at the very beginning of the strand or trailing off the end,
276							but not in phase with the start or ends, this method will cut at the last
277							complete codon. For example, if the following was the longest ORF:
278
279							0 1 2 3 4 5 6 7 8 9 10
280							A C G T A G T T T A
281							*****
282							<--------------->
283
284							getORF will return:
285
286							[ 1, 10, 1 ]
287
288							The distance between lower and upper will always be a multiple of 3. This is to
289							make it clear which frame the ORF is in. The resulting hash may be passed to
290							the translate method.
291
292							Example:
293
294							my $orf_ref = $translator->getORF( \'TAGAAATAG' );
295							my $orf_ref = $translator->getORF( \$seq, { strand => -1 } );
296							my $orf_ref = $translator->getORF(
297							\$seq,
298							{
299							lower => $lower,
300							upper => $upper
301							}
302							);
303
304							=cut
305
306							sub getORF {
307	1			1	1	197	my $self = shift;
308
309	1					8	my ( $seq_ref, @p ) = validate_seq_params(@_);
310
311	1					23	my %p = validate(
312							@p,
313							{
314							lower => $VAL_NON_NEG_INT,
315							upper => $VAL_NON_NEG_INT,
316							strand => $VAL_SEARCH_STRAND,
317							}
318							);
319	1					21	my ( $lower, $upper ) = validate_lower_upper( delete( @p{qw/ lower upper /} ), $seq_ref );
320
321							# Initialize the longest ORF.
322	1					4	my @ORF = ( $lower, $lower, 0 );
323
324							# Go through each strand which we are looking in
325	1	50				5	foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) {
326
327							# Initialize lower bounds and regular expression for stop
328	2					5	my @lowers = map { $_ + $lower } ( 0 .. 2 );
	6					11
329	2					10	my $stop_regex = $self->regex( '*', { strand => $strand } );
330
331							# Look for all the stops in our sequence using a regular expression. A
332							# lookahead is used to cope with the possibility of overlapping stop
333							# codons
334
335	2					14	pos($$seq_ref) = $lower;
336
337	2					11	while ( $$seq_ref =~ /(?=stop_regex)/gx ) {
338
339							# Get the location of the upper bound. Add 3 for the length of the
340							# stop codon if we are on the + strand.
341	0	0				0	my $cur_upper = pos($$seq_ref) + ( $strand == 1 ? 3 : 0 );
342
343							# End the iteration if we are out of range
344	0	0				0	last if ( $cur_upper > $upper );
345
346							# Call our helper function
347	0					0	$self->_getORF( $strand, \@lowers, $cur_upper, $lower, \@ORF );
348							}
349
350							# Now evaluate for the last three ORFS
351	2					5	foreach my $i ( 0 .. 2 ) {
352	6					9	my $cur_upper = $upper - $i;
353	6					21	$self->_getORF( $strand, \@lowers, $cur_upper, $lower, \@ORF );
354							}
355
356							# NOTE: Perl's regular expression engine could be faster than code
357							# execution, so it may be faster to find ORFS using regular expression
358							# matching an entire ORF.
359							# m/(?=(^\|$stop)((.{3})*)($stop\|$))/g
360							}
361
362	1					8	return \@ORF;
363							}
364
365							# Helper function for getORF above.
366							sub _getORF {
367	6			6		8	my $self = shift;
368	6					11	my ( $strand, $lowers, $upper, $offset, $longest ) = @_;
369
370							# Calculate the frame relative to the starting offset
371	6					8	my $frame = ( $upper - $offset ) % 3;
372
373							# Compare if this is better than the longest ORF
374	6					18	$self->_compare_regions( $longest, [ $lowers->[$frame], $upper, $strand ] );
375
376							# Mark the lower bound for this frame
377	6					19	$lowers->[$frame] = $upper;
378							}
379
380							=head2 getCDS
381
382							my $cds_ref = $translator->getCDS( $seq_ref );
383							my $cds_ref = $translator->getCDS( $seq_ref, \%params );
384
385							Return the strand and boundaries of the longest CDS similar to getORF.
386
387							0 1 2 3 4 5 6 7 8 9 10
388							A T G A A A T A A G
389							>>>>> *****
390							<--------------->
391
392							Will return:
393
394							[ 0, 9, 1 ]
395
396							Valid options for the params hash are:
397
398							strand: 0, 1 or -1; default = 0 (meaning search both strands)
399							lower: integer between 0 and length; default = 0
400							upper: integer between 0 and length; default = length
401							strict: 0, 1 or 2; default = 1
402
403							Strict controls how strictly getCDS functions. There are 3 levels of
404							strictness, enumerated 0, 1 and 2. 2 is the most strict, and in that mode, a
405							region will only be considered a CDS if both the start and stop is found. In
406							strict level 1, if a start is found, but no stop is present before the end of
407							the sequence, the CDS will run until the end of the sequence. Strict level 0
408							assumes that start codon is present in each frame just before the start of the
409							molecule. Level 1 is a pretty safe bet, so that is the default.
410
411							Example:
412
413							my $cds_ref = $translator->getCDS(\'ATGAAATAG');
414							my $cds_ref = $translator->getCDS(\$seq, { strand => -1 } );
415							my $cds_ref = $translator->getCDS(\$seq, { strict => 2 } );
416
417							=cut
418
419							sub getCDS {
420	1			1	1	161	my $self = shift;
421
422	1					8	my ( $seq_ref, @p ) = validate_seq_params(@_);
423
424	1					30	my %p = validate(
425							@p,
426							{
427							lower => $VAL_NON_NEG_INT,
428							upper => $VAL_NON_NEG_INT,
429							strand => $VAL_SEARCH_STRAND,
430							strict => {%$VAL_OFFSET, default => 1 },
431							}
432							);
433
434	1					19	my ( $lower, $upper ) =
435							validate_lower_upper( delete( @p{qw/ lower upper /} ), $seq_ref );
436
437							# Initialize the longest CDS. Length is -1.
438	1					4	my @CDS = ( 0, -1, 0 );
439
440	1	50				6	foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) {
441	2					11	my $lower_regex = $self->regex( 'lower', { strand => $strand } );
442	2					10	my $upper_regex = $self->regex( 'upper', { strand => $strand } );
443
444							# Initialize lowers. On the + strand, we don't set the lower bounds
445							# unless strict is 0. On the - strand, we don't set the lower bounds if
446							# strict is 2. Otherwise, set the lower boudns to be the first bases.
447	3					9	my @lowers =
448							( ( ( $strand == 1 ) && ( $p{strict} != 0 ) )
449							\|\| ( ( $strand == -1 ) && ( $p{strict} == 2 ) ) )
450							? (undef) x 3
451	2	100	66			41	: map { $lower + $_ } ( 0 .. 2 );
452
453							# Similar to getORF, rather than using a regular expression to find
454							# entire coding regions, instead find individual starts and stops and
455							# react accordingly.
456							# The regular expression captures the starts and stops separately
457							# ($1 vs $2) so that it is easy to tell if a start or a stop was
458							# matched.
459
460	2					9	pos($$seq_ref) = $lower;
461
462	2					154	while ( $$seq_ref =~ /(?=($lower_regex)\|($upper_regex))/g ) {
463	20					97	my $position = pos $$seq_ref;
464	20	50				42	last if ( $position > $upper );
465
466	20					30	my $frame = $position % 3;
467
468							# If the lower regex matches:
469							#
470							# In the case that it is on the '-' strand, that means a stop was
471							# found. CDSs always end on stops, so update the lower bound.
472							#
473							# Otherwise, it is on the positive strand, meaning a start was
474							# found. Internal start codons are allowed, so only set the lower
475							# bound if it is not already set.
476	20	100				51	if ($1) {
477	7	100	100			40	if ( ( $strand == -1 )
478							\|\| ( !defined $lowers[$frame] ) )
479
480							{
481	6					59	$lowers[$frame] = $position;
482							}
483							}
484
485							# If the lower regex wasn't matched, the the upper one was.
486							#
487							# If this is the positive strand, that means that this is a stop
488							# codon. Compute the CDS, update if necessary, and reset the lower
489							# bound in this case.
490							#
491							# On the negative strand, that means that a start was matched.
492							# Compute the CDS, update if necessary, but don't reset the lower
493							# bound.
494
495							else {
496	13					17	$position += 3;
497	13	50				27	last if ( $position > $upper );
498
499	13					35	$self->_getCDS( $strand, \@lowers, $position, $lower, \@CDS );
500							}
501							}
502
503							# If strict mode is at level 2, we don't allow CDSs to trail off the
504							# end of the molecule. We also don't allow the end to trail off if we
505							# are on the - strand and strict isn't 0.
506
507							next
508	2	100	66			36	if ( ( $p{strict} == 2 )
			33
509							\|\| ( ( $strand == -1 ) && ( $p{strict} != 0 ) ) );
510
511	1					5	foreach my $i ( 0 .. 2 ) {
512	3					5	my $end_upper = $upper - $i;
513	3					9	$self->_getCDS( $strand, \@lowers, $end_upper, $lower, \@CDS );
514							}
515							}
516
517	1					12	return \@CDS;
518							}
519
520							# Helper function for getCDS above.
521							sub _getCDS {
522	16			16		22	my $self = shift;
523	16					27	my ( $strand, $lowers, $upper, $offset, $longest ) = @_;
524
525							# Calculate the frame relative to the starting offset
526	16					22	my $frame = ( $upper - $offset ) % 3;
527
528							# Do nothing if lower bound wasn't defined
529	16	100				112	return unless ( defined $lowers->[$frame] );
530
531							# Compare if this is better than the longest ORF
532	6					21	$self->_compare_regions( $longest, [ $lowers->[$frame], $upper, $strand ] );
533
534							# Mark the lower bound for this frame
535	6	100				73	undef $lowers->[$frame] if ( $strand == 1 );
536							}
537
538							# If the current range is longer than the longest range, store the range
539							sub _compare_regions {
540	12			12		19	my $self = shift;
541	12					16	my ( $longest, $current ) = @_;
542	12	100				57	@$longest = @$current
543							if ( $longest->[1] - $longest->[0] < $current->[1] - $current->[0] );
544							}
545
546							=head2 nonstop
547
548							my $frames = $translator->nonstop( $seq_ref );
549							my $frames = $translator->nonstop( $seq_ref, \%params );
550
551							Returns the frames that contain no stop codons for the sequence. Frames are
552							numbered -3, -2, -1, 1, 2 and 3.
553
554							3 ---->
555							2 ----->
556							1 ------>
557							-------
558							-1 <------
559							-2 <-----
560							-3 <----
561
562							Valid options for the params hash are:
563
564							strand: 0, 1 or -1; default = 0 (meaning search both strands)
565
566							Example:
567
568							my $frames = $translator->nonstop(\'TACGTTGGTTAAGTT'); # [ 2, 3, -1, -3 ]
569							my $frames = $translator->nonstop(\$seq, { strand => 1 } ); # [ 2, 3 ]
570							my $frames = $translator->nonstop(\$seq, { strand => -1 } ); # [ -1, -3 ]
571
572							=cut
573
574							sub nonstop {
575	3			3	1	1798	my $self = shift;
576
577	3					12	my ( $seq_ref, @p ) = validate_seq_params(@_);
578
579	3					45	my %p = validate( @p, { strand => $VAL_SEARCH_STRAND } );
580
581							# Go through both strands
582	3					32	my @frames;
583	3	100				11	foreach my $strand ( $p{strand} == 0 ? ( 1, -1 ) : $p{strand} ) {
584	4					17	my $stop = $self->regex( '*', { strand => $strand } );
585
586							# Go through each frame
587	4					11	foreach my $frame ( 0 .. 2 ) {
588	12	100				428	my $regex =
589							$strand == 1
590							? qr/^.{$frame}(?:.{3})*$stop/
591							: qr/$stop(?:.{3})*.{$frame}$/;
592
593							# Convert strand = +/-1, frame = [0,1,2] into 1 value +/-[1,2,3]
594	12	100				130	push @frames, ( $frame + 1 ) * $strand
595							unless ( $$seq_ref =~ m/$regex/ );
596							}
597							}
598
599	3					24	return \@frames;
600							}
601
602							1;
603
604							=head1 AUTHOR
605
606							Kevin Galinsky,
607
608							=cut