File Coverage

blib/lib/Bio/Tools/SeqPattern/Backtranslate.pm
Criterion Covered Total %
statement 126 127 99.2
branch 26 28 92.8
condition 3 9 33.3
subroutine 17 17 100.0
pod n/a
total 172 181 95.0


line stmt bran cond sub pod time code
1             package Bio::Tools::SeqPattern::Backtranslate;
2 2     2   633 use strict;
  2         2  
  2         44  
3 2     2   5 use warnings;
  2         2  
  2         41  
4              
5 2     2   5 use base qw(Bio::Root::Root);
  2         3  
  2         463  
6 2     2   6 use base qw(Exporter);
  2         2  
  2         95  
7              
8             =head1 NAME
9              
10             Bio::Tools::SeqPattern::Backtranslate
11              
12             =head1 DESCRIPTION
13              
14             This module should not be used directly. It provides helper methods to
15             Bio::Tools::SeqPattern to reverse translate protein patterns.
16              
17             =cut
18              
19 2     2   862 use Bio::Seq;
  2         4  
  2         62  
20 2     2   9 use Bio::Tools::CodonTable;
  2         2  
  2         75  
21              
22 2     2   6 use List::MoreUtils qw(uniq);
  2         2  
  2         21  
23 2     2   1004 use Carp qw(croak);
  2         2  
  2         2072  
24              
25             our @EXPORT_OK = qw(_reverse_translate_motif);
26              
27             our @EXPORT = @EXPORT_OK;
28              
29             sub _reverse_translate_motif {
30             # Main subroutine. It takes a Profam-like motif and returns its
31             # reverse translation using degenerate codons.
32              
33             # Steps:
34             # 1. Tokenize, then parse tokens.
35             # 2. Reverse translate each token type.
36             # 3. Join tokens in original order. Return the resulting string.
37              
38 12     12   3316 my $motif = shift;
39              
40 12         21 $motif =~ s/\./X/g;
41 12         19 $motif = uc $motif;
42              
43             ### 1. Tokenize, parse the motif.
44 12         17 my ( $ordered, $classified ) = _parse_motif($motif);
45              
46             ### 2. Reverse translate each token type.
47             # Reverse translate the plain (unambiguous) tokens.
48 11         30 my $ct = Bio::Tools::CodonTable->new;
49 11         10 foreach my $seq ( @{ $classified->{plain} } ) {
  11         15  
50 19         59 my $seqO
51             = Bio::Seq->new( -seq => $$seq, -alphabet => 'protein' );
52 19         38 $$seq = $ct->reverse_translate_all($seqO);
53             }
54              
55             # Reverse translate the ambiguous tokens.
56 11         9 foreach my $token ( @{ $classified->{ambiguous} } ) {
  11         19  
57 8         31 my ($aas) = $$token =~ m(([A-Za-z\.]+));
58 8         8 my @codons_to_contract;
59              
60 8         16 foreach my $residue ( split '', $aas ) {
61 32         55 push @codons_to_contract, $ct->revtranslate($residue);
62             }
63              
64 8         15 my $ambiguous_codon = _contract_codons(@codons_to_contract);
65 8         13 $$token = $ambiguous_codon;
66             }
67              
68             # Reverse translate the negated residues.
69 11         11 foreach my $token ( @{ $classified->{negated} } ) {
  11         14  
70 6         18 my ($aas) = $$token =~ m(([A-Za-z\.]+));
71 6         9 my $ambiguous_codon = _negated_aas_to_codon($aas);
72 6         10 $$token = $ambiguous_codon;
73             }
74              
75             ### 3. Join the profile back from its tokens.
76 11         10 return join '', map {$$_} @{$ordered};
  41         71  
  11         14  
77              
78             }
79              
80             sub _parse_motif {
81             # Profam-like motif parser. It takes the pattern as a string, and
82             # returns two data structures that contain the tokens, organized
83             # by order of appearance in the pattern (first return value) and by
84             # category (second return value).
85              
86 12     12   632 my $motif = shift;
87 12         19 my $parser = _tokenize_motif($motif);
88 12         9 my ( %tokens, @tokens );
89              
90 12         14 while ( my $token = $parser->() ) {
91 42 100       80 croak ("Unknown syntax token: <", $token->[1], ">")
92             if ( $token->[0] eq 'UNKNOWN' );
93 41         21 push @{ $tokens{ $token->[0] } }, \$token->[1];
  41         70  
94 41         66 push @tokens, \$token->[1];
95             }
96 11         41 return ( \@tokens, \%tokens );
97             }
98              
99             sub _tokenize_motif {
100              
101             # Return a tokenizer iterator that sequentially recognizes and
102             # returns each token in the input pattern.
103             # Examples of each token type:
104              
105             # ambiguous: a position with more than one possible residue.
106             # eg. [ALEP]
107             # negated: a position in which some residues are excluded.
108             # eg. [^WY]
109             # plain: a common sequence of residues. One position, one residue.
110             # eg. MAAEIK
111             # open_par, close_par: tags surrounding a motif that is repeated
112             # a certain number of times.
113             # eg. (...){3}
114              
115 12     12   12 my $target = shift;
116             return sub {
117 53 100   53   107 return [ 'ambiguous', $1 ]
118             if $target =~ /\G (\[[A-Za-z\.]+\]) /gcx;
119 45 100       70 return [ 'negated', $1 ]
120             if $target =~ /\G (\[\^[A-Za-z\.]+\]) /gcx;
121 39 100       117 return [ 'plain', $1 ]
122             if $target =~ /\G ([A-Za-z\.]+) /gcx;
123 20 100       43 return [ 'open_par', $1 ]
124             if $target =~ /\G (\() /gcx;
125 16 100       33 return [ 'close_par', $1 ]
126             if $target =~ /\G (\)[\{\d+[,\d+]*\}]*) /gcx;
127 12 100       18 return [ 'UNKNOWN', $1 ]
128             if $target =~ /\G (.) /gcx;
129 11         20 return;
130 12         37 };
131             }
132              
133             sub _contract_codons {
134              
135             # Take a list of codons, return an ambiguous codon.
136 8     8   10 my @codons = map { uc $_ } @_;
  56         54  
137              
138 8         13 my @by_letter = ( [], [], [], );
139 8         8 my $ambiguous_codon;
140 8         8 foreach my $codon (@codons) {
141 56         56 my @letters = split '', $codon;
142 56         44 for my $i ( 0 .. 2 ) {
143 168         89 push @{ $by_letter[$i] }, $letters[$i];
  168         200  
144             }
145             }
146 8         7 for my $i ( 0 .. 2 ) {
147             $ambiguous_codon
148 24         19 .= _convert( 'dna', _uniq_string( @{ $by_letter[$i] } ) );
  24         33  
149             }
150 8         22 return $ambiguous_codon;
151             }
152              
153             sub _expand_codon {
154              
155             # Given a degenerate codon, return a list with all its
156             # constituents. Takes a three-letter string (codon) as
157             # input, returns a list with three-letter scalars.
158              
159 300     300   227 my $codon = shift;
160 300 50       372 die "Wrong codon length!\n" if length $codon != 3;
161              
162              
163 300         183 my ( @codons, @return_bases );
164 300         389 my @orig_bases = split '', $codon;
165              
166 300         300 for my $i ( 0 .. 2 ) {
167              
168             # from each redundant base, create a list with all their
169             # components (e.g., N -> (A, C, G, T) );
170 900         913 my @components = split '', _convert('dna', $orig_bases[$i] );
171 900         1519 $orig_bases[$i] = [@components];
172             }
173              
174             # Combine all the bases of each of the three positions of the
175             # codons, and build the return list.
176 300         208 for my $i ( @{ $orig_bases[0] } ) {
  300         335  
177 1050         608 for my $j ( @{ $orig_bases[1] } ) {
  1050         914  
178 3558         1895 for my $k ( @{ $orig_bases[2] } ) {
  3558         3021  
179 11832         10954 push @return_bases, $i . $j . $k;
180             }
181             }
182             }
183 300         2390 return @return_bases;
184             }
185              
186             {
187             my %convert;
188              
189             sub _convert {
190             # Interconvert between redundant and non-redundant protein and
191             # dna alphabets. Takes an alphabet (protein or dna) and a string
192             # with the letter, and returns its equivalent in
193             # redundant/non-redundant alphabet. Example ACTG -> N.
194              
195 924     924   668 my ($alphabet, $letter) = @_;
196 924 50 33     4998 unless (
      33        
      33        
197             $alphabet and $alphabet =~ /^dna$|^protein$/i
198             and $letter and length $letter <= 4
199 0         0 ) { croak "Wrong arguments!\n"; }
200              
201 924 100       1694 unless (%convert) {
202 2         19 %convert = (
203             'dna' => {
204             qw(N ACGT B CGT D AGT H ACT V ACG K GT
205             M AC R AG S CG W AT Y CT A A C C T T G G)
206             },
207             'protein' => {
208             '.' => 'ACDEFGHIJKLMNOPQRSTUVWY',
209             X => 'ACDEFGHIJKLMNOPQRSTUVWY',
210             Z => 'QE',
211             B => 'ND',
212             },
213             );
214              
215             # Make %convert hash key/value agnostic.
216 2         4 foreach my $alphabet ( keys %convert ) {
217 38         51 map { $convert{$alphabet}->{ $convert{$alphabet}{$_} } = $_ }
218 4         4 keys %{ $convert{$alphabet} };
  4         7  
219             }
220             }
221              
222 924         1644 return $convert{$alphabet}{$letter};
223             }
224              
225             }
226              
227             sub _uniq_string {
228             # Takes a list of letters and returns an alphabetically sorted
229             # list with unique elements.
230              
231 24     24   37 my @letters = @_;
232 24         89 return join '', sort { $a cmp $b } uniq @letters;
  48         77  
233             }
234              
235             {
236             my ( @codon_library, $ct );
237              
238             sub _negated_aas_to_codon {
239              
240             # Given a string of residues, returns a degenerate codon that will
241             # not be translated into any of them, while maximizing degeneracy
242             # (ie, it tries to also translate into as many residues as possible).
243              
244             # This functionality is required for reverse translating profiles
245             # that contain negative patterns: [^X]. This means that the current
246             # position should not contain aminoacid X, but can have any of the
247             # others. The reverse translated nucleotide sequence should
248             # reflect this.
249              
250             # Approach: construct a list of all possible codons, incluiding all
251             # degenerate bases. This is an array of 15x15x15 = 3375 elements.
252             # Order them by descendent "degeneracy".
253             # Return the first one whose expansion in 4-lettered codons
254             # doesn't contain a codon that translates into any of the
255             # non-wanted residues.
256            
257             # * Since this takes some time, I presorted them and saved them.
258             # Reading them from a file takes a fraction of the time that it taes
259             # to re-sort them every time the application is launched.
260              
261 6     6   7 my $aas_to_avoid = shift;
262              
263             # Initialize reusable variables if it's the first time the sub
264             # is called.
265 6 100       14 unless (@codon_library) {
266 2         6 while () { chomp; push @codon_library, split ' ', $_ }
  356         214  
  356         1863  
267             }
268 6 100       13 unless ($ct) { $ct = Bio::Tools::CodonTable->new; }
  2         8  
269              
270             # Reverse translate the unwanted aminoacids to unwanted codons.
271 6         4 my @unwanted_codons;
272 6         12 foreach my $aa ( split '', $aas_to_avoid ) {
273 18         29 push @unwanted_codons, $ct->revtranslate($aa);
274             }
275              
276 6         7 foreach my $degenerate_codon (@codon_library) {
277 300         336 my @codons = _expand_codon($degenerate_codon);
278 300         402 my $success = 1;
279              
280 300         233 foreach my $unwanted (@unwanted_codons) {
281 3000 100       2129 if ( grep { uc $unwanted eq $_ } @codons ) {
  118320         91757  
282 1788         1370 $success = 0;
283             }
284             }
285              
286 300 100       781 if ($success) { return $degenerate_codon }
  6         24  
287             }
288             }
289              
290             }
291              
292             1;
293              
294             =head1 COPYRIGHT & LICENSE
295              
296             Copyright 2009 Bruno Vecchi, all rights reserved.
297              
298             This program is free software; you can redistribute it and/or modify it
299             under the same terms as Perl itself.
300              
301             =cut
302              
303             __DATA__