File Coverage

blib/lib/Bio/Tools/SeqPattern/Backtranslate.pm

Criterion	Covered	Total	%
statement	126	127	99.2
branch	26	28	92.8
condition	3	9	33.3
subroutine	17	17	100.0
pod			n/a
total	172	181	95.0

line	stmt	bran	cond	sub	time	code
1						package Bio::Tools::SeqPattern::Backtranslate;
2	2			2	756	use strict;
	2				4
	2				53
3	2			2	8	use warnings;
	2				3
	2				60
4
5	2			2	9	use base qw(Bio::Root::Root);
	2				3
	2				382
6	2			2	18	use base qw(Exporter);
	2				6
	2				174
7
8						=head1 NAME
9
10						Bio::Tools::SeqPattern::Backtranslate
11
12						=head1 DESCRIPTION
13
14						This module should not be used directly. It provides helper methods to
15						Bio::Tools::SeqPattern to reverse translate protein patterns.
16
17						=cut
18
19	2			2	1178	use Bio::Seq;
	2				11
	2				145
20	2			2	22	use Bio::Tools::CodonTable;
	2				6
	2				70
21
22	2			2	17	use List::MoreUtils qw(uniq);
	2				5
	2				48
23	2			2	3585	use Carp qw(croak);
	2				6
	2				3319
24
25						our @EXPORT_OK = qw(_reverse_translate_motif);
26
27						our @EXPORT = @EXPORT_OK;
28
29						sub _reverse_translate_motif {
30						# Main subroutine. It takes a Profam-like motif and returns its
31						# reverse translation using degenerate codons.
32
33						# Steps:
34						# 1. Tokenize, then parse tokens.
35						# 2. Reverse translate each token type.
36						# 3. Join tokens in original order. Return the resulting string.
37
38	12			12	5789	my $motif = shift;
39
40	12				44	$motif =~ s/\./X/g;
41	12				39	$motif = uc $motif;
42
43						### 1. Tokenize, parse the motif.
44	12				41	my ( $ordered, $classified ) = _parse_motif($motif);
45
46						### 2. Reverse translate each token type.
47						# Reverse translate the plain (unambiguous) tokens.
48	11				72	my $ct = Bio::Tools::CodonTable->new;
49	11				19	foreach my $seq ( @{ $classified->{plain} } ) {
	11				32
50	19				102	my $seqO
51						= Bio::Seq->new( -seq => $$seq, -alphabet => 'protein' );
52	19				77	$$seq = $ct->reverse_translate_all($seqO);
53						}
54
55						# Reverse translate the ambiguous tokens.
56	11				21	foreach my $token ( @{ $classified->{ambiguous} } ) {
	11				32
57	8				44	my ($aas) = $$token =~ m(([A-Za-z\.]+));
58	8				16	my @codons_to_contract;
59
60	8				25	foreach my $residue ( split '', $aas ) {
61	32				73	push @codons_to_contract, $ct->revtranslate($residue);
62						}
63
64	8				31	my $ambiguous_codon = _contract_codons(@codons_to_contract);
65	8				25	$$token = $ambiguous_codon;
66						}
67
68						# Reverse translate the negated residues.
69	11				19	foreach my $token ( @{ $classified->{negated} } ) {
	11				29
70	6				26	my ($aas) = $$token =~ m(([A-Za-z\.]+));
71	6				21	my $ambiguous_codon = _negated_aas_to_codon($aas);
72	6				33	$$token = $ambiguous_codon;
73						}
74
75						### 3. Join the profile back from its tokens.
76	11				24	return join '', map {$$_} @{$ordered};
	41				196
	11				26
77
78						}
79
80						sub _parse_motif {
81						# Profam-like motif parser. It takes the pattern as a string, and
82						# returns two data structures that contain the tokens, organized
83						# by order of appearance in the pattern (first return value) and by
84						# category (second return value).
85
86	12			12	18	my $motif = shift;
87	12				44	my $parser = _tokenize_motif($motif);
88	12				30	my ( %tokens, @tokens );
89
90	12				28	while ( my $token = $parser->() ) {
91	42	100			119	croak ("Unknown syntax token: <", $token->[1], ">")
92						if ( $token->[0] eq 'UNKNOWN' );
93	41				52	push @{ $tokens{ $token->[0] } }, \$token->[1];
	41				103
94	41				109	push @tokens, \$token->[1];
95						}
96	11				67	return ( \@tokens, \%tokens );
97						}
98
99						sub _tokenize_motif {
100
101						# Return a tokenizer iterator that sequentially recognizes and
102						# returns each token in the input pattern.
103						# Examples of each token type:
104
105						# ambiguous: a position with more than one possible residue.
106						# eg. [ALEP]
107						# negated: a position in which some residues are excluded.
108						# eg. [^WY]
109						# plain: a common sequence of residues. One position, one residue.
110						# eg. MAAEIK
111						# open_par, close_par: tags surrounding a motif that is repeated
112						# a certain number of times.
113						# eg. (...){3}
114
115	12			12	25	my $target = shift;
116						return sub {
117	53	100		53	162	return [ 'ambiguous', $1 ]
118						if $target =~ /\G (\[[A-Za-z\.]+\]) /gcx;
119	45	100			123	return [ 'negated', $1 ]
120						if $target =~ /\G (\[\^[A-Za-z\.]+\]) /gcx;
121	39	100			166	return [ 'plain', $1 ]
122						if $target =~ /\G ([A-Za-z\.]+) /gcx;
123	20	100			68	return [ 'open_par', $1 ]
124						if $target =~ /\G (\() /gcx;
125	16	100			54	return [ 'close_par', $1 ]
126						if $target =~ /\G (\)[\{\d+[,\d+]\}]) /gcx;
127	12	100			42	return [ 'UNKNOWN', $1 ]
128						if $target =~ /\G (.) /gcx;
129	11				28	return;
130	12				70	};
131						}
132
133						sub _contract_codons {
134
135						# Take a list of codons, return an ambiguous codon.
136	8			8	21	my @codons = map { uc $_ } @_;
	56				100
137
138	8				25	my @by_letter = ( [], [], [], );
139	8				14	my $ambiguous_codon;
140	8				19	foreach my $codon (@codons) {
141	56				105	my @letters = split '', $codon;
142	56				91	for my $i ( 0 .. 2 ) {
143	168				204	push @{ $by_letter[$i] }, $letters[$i];
	168				374
144						}
145						}
146	8				18	for my $i ( 0 .. 2 ) {
147						$ambiguous_codon
148	24				37	.= _convert( 'dna', _uniq_string( @{ $by_letter[$i] } ) );
	24				54
149						}
150	8				35	return $ambiguous_codon;
151						}
152
153						sub _expand_codon {
154
155						# Given a degenerate codon, return a list with all its
156						# constituents. Takes a three-letter string (codon) as
157						# input, returns a list with three-letter scalars.
158
159	300			300	508	my $codon = shift;
160	300	50			598	die "Wrong codon length!\n" if length $codon != 3;
161
162
163	300				473	my ( @codons, @return_bases );
164	300				718	my @orig_bases = split '', $codon;
165
166	300				590	for my $i ( 0 .. 2 ) {
167
168						# from each redundant base, create a list with all their
169						# components (e.g., N -> (A, C, G, T) );
170	900				1674	my @components = split '', _convert('dna', $orig_bases[$i] );
171	900				2915	$orig_bases[$i] = [@components];
172						}
173
174						# Combine all the bases of each of the three positions of the
175						# codons, and build the return list.
176	300				437	for my $i ( @{ $orig_bases[0] } ) {
	300				578
177	1050				1357	for my $j ( @{ $orig_bases[1] } ) {
	1050				1760
178	3558				4610	for my $k ( @{ $orig_bases[2] } ) {
	3558				5806
179	11832				22063	push @return_bases, $i . $j . $k;
180						}
181						}
182						}
183	300				3207	return @return_bases;
184						}
185
186						{
187						my %convert;
188
189						sub _convert {
190						# Interconvert between redundant and non-redundant protein and
191						# dna alphabets. Takes an alphabet (protein or dna) and a string
192						# with the letter, and returns its equivalent in
193						# redundant/non-redundant alphabet. Example ACTG -> N.
194
195	924			924	1648	my ($alphabet, $letter) = @_;
196	924	50	33		5376	unless (
			33
			33
197						$alphabet and $alphabet =~ /^dna$\|^protein$/i
198						and $letter and length $letter <= 4
199	0				0	) { croak "Wrong arguments!\n"; }
200
201	924	100			1679	unless (%convert) {
202	2				34	%convert = (
203						'dna' => {
204						qw(N ACGT B CGT D AGT H ACT V ACG K GT
205						M AC R AG S CG W AT Y CT A A C C T T G G)
206						},
207						'protein' => {
208						'.' => 'ACDEFGHIJKLMNOPQRSTUVWY',
209						X => 'ACDEFGHIJKLMNOPQRSTUVWY',
210						Z => 'QE',
211						B => 'ND',
212						},
213						);
214
215						# Make %convert hash key/value agnostic.
216	2				7	foreach my $alphabet ( keys %convert ) {
217	38				66	map { $convert{$alphabet}->{ $convert{$alphabet}{$_} } = $_ }
218	4				5	keys %{ $convert{$alphabet} };
	4				15
219						}
220						}
221
222	924				2942	return $convert{$alphabet}{$letter};
223						}
224
225						}
226
227						sub _uniq_string {
228						# Takes a list of letters and returns an alphabetically sorted
229						# list with unique elements.
230
231	24			24	67	my @letters = @_;
232	24				126	return join '', sort { $a cmp $b } uniq @letters;
	48				142
233						}
234
235						{
236						my ( @codon_library, $ct );
237
238						sub _negated_aas_to_codon {
239
240						# Given a string of residues, returns a degenerate codon that will
241						# not be translated into any of them, while maximizing degeneracy
242						# (ie, it tries to also translate into as many residues as possible).
243
244						# This functionality is required for reverse translating profiles
245						# that contain negative patterns: [^X]. This means that the current
246						# position should not contain aminoacid X, but can have any of the
247						# others. The reverse translated nucleotide sequence should
248						# reflect this.
249
250						# Approach: construct a list of all possible codons, incluiding all
251						# degenerate bases. This is an array of 15x15x15 = 3375 elements.
252						# Order them by descendent "degeneracy".
253						# Return the first one whose expansion in 4-lettered codons
254						# doesn't contain a codon that translates into any of the
255						# non-wanted residues.
256
257						# * Since this takes some time, I presorted them and saved them.
258						# Reading them from a file takes a fraction of the time that it taes
259						# to re-sort them every time the application is launched.
260
261	6			6	14	my $aas_to_avoid = shift;
262
263						# Initialize reusable variables if it's the first time the sub
264						# is called.
265	6	100			20	unless (@codon_library) {
266	2				11	while () { chomp; push @codon_library, split ' ', $_ }
	356				572
	356				2928
267						}
268	6	100			30	unless ($ct) { $ct = Bio::Tools::CodonTable->new; }
	2				16
269
270						# Reverse translate the unwanted aminoacids to unwanted codons.
271	6				17	my @unwanted_codons;
272	6				26	foreach my $aa ( split '', $aas_to_avoid ) {
273	18				57	push @unwanted_codons, $ct->revtranslate($aa);
274						}
275
276	6				21	foreach my $degenerate_codon (@codon_library) {
277	300				630	my @codons = _expand_codon($degenerate_codon);
278	300				581	my $success = 1;
279
280	300				486	foreach my $unwanted (@unwanted_codons) {
281	3000	100			4723	if ( grep { uc $unwanted eq $_ } @codons ) {
	118320				207002
282	1788				2968	$success = 0;
283						}
284						}
285
286	300	100			1230	if ($success) { return $degenerate_codon }
	6				49
287						}
288						}
289
290						}
291
292						1;
293
294						=head1 COPYRIGHT & LICENSE
295
296						Copyright 2009 Bruno Vecchi, all rights reserved.
297
298						This program is free software; you can redistribute it and/or modify it
299						under the same terms as Perl itself.
300
301						=cut
302
303						__DATA__