File Coverage

blib/lib/Bio/Tools/SeqPattern/Backtranslate.pm
Criterion Covered Total %
statement 126 127 99.2
branch 26 28 92.8
condition 3 9 33.3
subroutine 17 17 100.0
pod n/a
total 172 181 95.0


line stmt bran cond sub pod time code
1             package Bio::Tools::SeqPattern::Backtranslate;
2 2     2   756 use strict;
  2         4  
  2         53  
3 2     2   8 use warnings;
  2         3  
  2         60  
4              
5 2     2   9 use base qw(Bio::Root::Root);
  2         3  
  2         382  
6 2     2   18 use base qw(Exporter);
  2         6  
  2         174  
7              
8             =head1 NAME
9              
10             Bio::Tools::SeqPattern::Backtranslate
11              
12             =head1 DESCRIPTION
13              
14             This module should not be used directly. It provides helper methods to
15             Bio::Tools::SeqPattern to reverse translate protein patterns.
16              
17             =cut
18              
19 2     2   1178 use Bio::Seq;
  2         11  
  2         145  
20 2     2   22 use Bio::Tools::CodonTable;
  2         6  
  2         70  
21              
22 2     2   17 use List::MoreUtils qw(uniq);
  2         5  
  2         48  
23 2     2   3585 use Carp qw(croak);
  2         6  
  2         3319  
24              
25             our @EXPORT_OK = qw(_reverse_translate_motif);
26              
27             our @EXPORT = @EXPORT_OK;
28              
29             sub _reverse_translate_motif {
30             # Main subroutine. It takes a Profam-like motif and returns its
31             # reverse translation using degenerate codons.
32              
33             # Steps:
34             # 1. Tokenize, then parse tokens.
35             # 2. Reverse translate each token type.
36             # 3. Join tokens in original order. Return the resulting string.
37              
38 12     12   5789 my $motif = shift;
39              
40 12         44 $motif =~ s/\./X/g;
41 12         39 $motif = uc $motif;
42              
43             ### 1. Tokenize, parse the motif.
44 12         41 my ( $ordered, $classified ) = _parse_motif($motif);
45              
46             ### 2. Reverse translate each token type.
47             # Reverse translate the plain (unambiguous) tokens.
48 11         72 my $ct = Bio::Tools::CodonTable->new;
49 11         19 foreach my $seq ( @{ $classified->{plain} } ) {
  11         32  
50 19         102 my $seqO
51             = Bio::Seq->new( -seq => $$seq, -alphabet => 'protein' );
52 19         77 $$seq = $ct->reverse_translate_all($seqO);
53             }
54              
55             # Reverse translate the ambiguous tokens.
56 11         21 foreach my $token ( @{ $classified->{ambiguous} } ) {
  11         32  
57 8         44 my ($aas) = $$token =~ m(([A-Za-z\.]+));
58 8         16 my @codons_to_contract;
59              
60 8         25 foreach my $residue ( split '', $aas ) {
61 32         73 push @codons_to_contract, $ct->revtranslate($residue);
62             }
63              
64 8         31 my $ambiguous_codon = _contract_codons(@codons_to_contract);
65 8         25 $$token = $ambiguous_codon;
66             }
67              
68             # Reverse translate the negated residues.
69 11         19 foreach my $token ( @{ $classified->{negated} } ) {
  11         29  
70 6         26 my ($aas) = $$token =~ m(([A-Za-z\.]+));
71 6         21 my $ambiguous_codon = _negated_aas_to_codon($aas);
72 6         33 $$token = $ambiguous_codon;
73             }
74              
75             ### 3. Join the profile back from its tokens.
76 11         24 return join '', map {$$_} @{$ordered};
  41         196  
  11         26  
77              
78             }
79              
80             sub _parse_motif {
81             # Profam-like motif parser. It takes the pattern as a string, and
82             # returns two data structures that contain the tokens, organized
83             # by order of appearance in the pattern (first return value) and by
84             # category (second return value).
85              
86 12     12   18 my $motif = shift;
87 12         44 my $parser = _tokenize_motif($motif);
88 12         30 my ( %tokens, @tokens );
89              
90 12         28 while ( my $token = $parser->() ) {
91 42 100       119 croak ("Unknown syntax token: <", $token->[1], ">")
92             if ( $token->[0] eq 'UNKNOWN' );
93 41         52 push @{ $tokens{ $token->[0] } }, \$token->[1];
  41         103  
94 41         109 push @tokens, \$token->[1];
95             }
96 11         67 return ( \@tokens, \%tokens );
97             }
98              
99             sub _tokenize_motif {
100              
101             # Return a tokenizer iterator that sequentially recognizes and
102             # returns each token in the input pattern.
103             # Examples of each token type:
104              
105             # ambiguous: a position with more than one possible residue.
106             # eg. [ALEP]
107             # negated: a position in which some residues are excluded.
108             # eg. [^WY]
109             # plain: a common sequence of residues. One position, one residue.
110             # eg. MAAEIK
111             # open_par, close_par: tags surrounding a motif that is repeated
112             # a certain number of times.
113             # eg. (...){3}
114              
115 12     12   25 my $target = shift;
116             return sub {
117 53 100   53   162 return [ 'ambiguous', $1 ]
118             if $target =~ /\G (\[[A-Za-z\.]+\]) /gcx;
119 45 100       123 return [ 'negated', $1 ]
120             if $target =~ /\G (\[\^[A-Za-z\.]+\]) /gcx;
121 39 100       166 return [ 'plain', $1 ]
122             if $target =~ /\G ([A-Za-z\.]+) /gcx;
123 20 100       68 return [ 'open_par', $1 ]
124             if $target =~ /\G (\() /gcx;
125 16 100       54 return [ 'close_par', $1 ]
126             if $target =~ /\G (\)[\{\d+[,\d+]*\}]*) /gcx;
127 12 100       42 return [ 'UNKNOWN', $1 ]
128             if $target =~ /\G (.) /gcx;
129 11         28 return;
130 12         70 };
131             }
132              
133             sub _contract_codons {
134              
135             # Take a list of codons, return an ambiguous codon.
136 8     8   21 my @codons = map { uc $_ } @_;
  56         100  
137              
138 8         25 my @by_letter = ( [], [], [], );
139 8         14 my $ambiguous_codon;
140 8         19 foreach my $codon (@codons) {
141 56         105 my @letters = split '', $codon;
142 56         91 for my $i ( 0 .. 2 ) {
143 168         204 push @{ $by_letter[$i] }, $letters[$i];
  168         374  
144             }
145             }
146 8         18 for my $i ( 0 .. 2 ) {
147             $ambiguous_codon
148 24         37 .= _convert( 'dna', _uniq_string( @{ $by_letter[$i] } ) );
  24         54  
149             }
150 8         35 return $ambiguous_codon;
151             }
152              
153             sub _expand_codon {
154              
155             # Given a degenerate codon, return a list with all its
156             # constituents. Takes a three-letter string (codon) as
157             # input, returns a list with three-letter scalars.
158              
159 300     300   508 my $codon = shift;
160 300 50       598 die "Wrong codon length!\n" if length $codon != 3;
161              
162              
163 300         473 my ( @codons, @return_bases );
164 300         718 my @orig_bases = split '', $codon;
165              
166 300         590 for my $i ( 0 .. 2 ) {
167              
168             # from each redundant base, create a list with all their
169             # components (e.g., N -> (A, C, G, T) );
170 900         1674 my @components = split '', _convert('dna', $orig_bases[$i] );
171 900         2915 $orig_bases[$i] = [@components];
172             }
173              
174             # Combine all the bases of each of the three positions of the
175             # codons, and build the return list.
176 300         437 for my $i ( @{ $orig_bases[0] } ) {
  300         578  
177 1050         1357 for my $j ( @{ $orig_bases[1] } ) {
  1050         1760  
178 3558         4610 for my $k ( @{ $orig_bases[2] } ) {
  3558         5806  
179 11832         22063 push @return_bases, $i . $j . $k;
180             }
181             }
182             }
183 300         3207 return @return_bases;
184             }
185              
186             {
187             my %convert;
188              
189             sub _convert {
190             # Interconvert between redundant and non-redundant protein and
191             # dna alphabets. Takes an alphabet (protein or dna) and a string
192             # with the letter, and returns its equivalent in
193             # redundant/non-redundant alphabet. Example ACTG -> N.
194              
195 924     924   1648 my ($alphabet, $letter) = @_;
196 924 50 33     5376 unless (
      33        
      33        
197             $alphabet and $alphabet =~ /^dna$|^protein$/i
198             and $letter and length $letter <= 4
199 0         0 ) { croak "Wrong arguments!\n"; }
200              
201 924 100       1679 unless (%convert) {
202 2         34 %convert = (
203             'dna' => {
204             qw(N ACGT B CGT D AGT H ACT V ACG K GT
205             M AC R AG S CG W AT Y CT A A C C T T G G)
206             },
207             'protein' => {
208             '.' => 'ACDEFGHIJKLMNOPQRSTUVWY',
209             X => 'ACDEFGHIJKLMNOPQRSTUVWY',
210             Z => 'QE',
211             B => 'ND',
212             },
213             );
214              
215             # Make %convert hash key/value agnostic.
216 2         7 foreach my $alphabet ( keys %convert ) {
217 38         66 map { $convert{$alphabet}->{ $convert{$alphabet}{$_} } = $_ }
218 4         5 keys %{ $convert{$alphabet} };
  4         15  
219             }
220             }
221              
222 924         2942 return $convert{$alphabet}{$letter};
223             }
224              
225             }
226              
227             sub _uniq_string {
228             # Takes a list of letters and returns an alphabetically sorted
229             # list with unique elements.
230              
231 24     24   67 my @letters = @_;
232 24         126 return join '', sort { $a cmp $b } uniq @letters;
  48         142  
233             }
234              
235             {
236             my ( @codon_library, $ct );
237              
238             sub _negated_aas_to_codon {
239              
240             # Given a string of residues, returns a degenerate codon that will
241             # not be translated into any of them, while maximizing degeneracy
242             # (ie, it tries to also translate into as many residues as possible).
243              
244             # This functionality is required for reverse translating profiles
245             # that contain negative patterns: [^X]. This means that the current
246             # position should not contain aminoacid X, but can have any of the
247             # others. The reverse translated nucleotide sequence should
248             # reflect this.
249              
250             # Approach: construct a list of all possible codons, incluiding all
251             # degenerate bases. This is an array of 15x15x15 = 3375 elements.
252             # Order them by descendent "degeneracy".
253             # Return the first one whose expansion in 4-lettered codons
254             # doesn't contain a codon that translates into any of the
255             # non-wanted residues.
256            
257             # * Since this takes some time, I presorted them and saved them.
258             # Reading them from a file takes a fraction of the time that it taes
259             # to re-sort them every time the application is launched.
260              
261 6     6   14 my $aas_to_avoid = shift;
262              
263             # Initialize reusable variables if it's the first time the sub
264             # is called.
265 6 100       20 unless (@codon_library) {
266 2         11 while () { chomp; push @codon_library, split ' ', $_ }
  356         572  
  356         2928  
267             }
268 6 100       30 unless ($ct) { $ct = Bio::Tools::CodonTable->new; }
  2         16  
269              
270             # Reverse translate the unwanted aminoacids to unwanted codons.
271 6         17 my @unwanted_codons;
272 6         26 foreach my $aa ( split '', $aas_to_avoid ) {
273 18         57 push @unwanted_codons, $ct->revtranslate($aa);
274             }
275              
276 6         21 foreach my $degenerate_codon (@codon_library) {
277 300         630 my @codons = _expand_codon($degenerate_codon);
278 300         581 my $success = 1;
279              
280 300         486 foreach my $unwanted (@unwanted_codons) {
281 3000 100       4723 if ( grep { uc $unwanted eq $_ } @codons ) {
  118320         207002  
282 1788         2968 $success = 0;
283             }
284             }
285              
286 300 100       1230 if ($success) { return $degenerate_codon }
  6         49  
287             }
288             }
289              
290             }
291              
292             1;
293              
294             =head1 COPYRIGHT & LICENSE
295              
296             Copyright 2009 Bruno Vecchi, all rights reserved.
297              
298             This program is free software; you can redistribute it and/or modify it
299             under the same terms as Perl itself.
300              
301             =cut
302              
303             __DATA__