File Coverage

Bio/Tools/CodonTable.pm

Criterion	Covered	Total	%
statement	225	239	94.1
branch	75	92	81.5
condition	25	41	60.9
subroutine	24	25	96.0
pod	14	14	100.0
total	363	411	88.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::Tools::CodonTable
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Heikki Lehvaslaiho
7							#
8							# Copyright Heikki Lehvaslaiho
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Tools::CodonTable - Codon table object
17
18							=head1 SYNOPSIS
19
20							# This is a read-only class for all known codon tables. The IDs are
21							# the ones used by nucleotide sequence databases. All common IUPAC
22							# ambiguity codes for DNA, RNA and amino acids are recognized.
23
24							use Bio::Tools::CodonTable;
25
26							# defaults to ID 1 "Standard"
27							$myCodonTable = Bio::Tools::CodonTable->new();
28							$myCodonTable2 = Bio::Tools::CodonTable->new( -id => 3 );
29
30							# change codon table
31							$myCodonTable->id(5);
32
33							# examine codon table
34							print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
35							"is:", $myCodonTable->name(), "\n");
36
37							# print possible codon tables
38							$tables = Bio::Tools::CodonTable->tables;
39							while ( ($id,$name) = each %{$tables} ) {
40							print "$id = $name\n";
41							}
42
43							# translate a codon
44							$aa = $myCodonTable->translate('ACU');
45							$aa = $myCodonTable->translate('act');
46							$aa = $myCodonTable->translate('ytr');
47
48							# reverse translate an amino acid
49							@codons = $myCodonTable->revtranslate('A');
50							@codons = $myCodonTable->revtranslate('Ser');
51							@codons = $myCodonTable->revtranslate('Glx');
52							@codons = $myCodonTable->revtranslate('cYS', 'rna');
53
54							# reverse translate an entire amino acid sequence into a IUPAC
55							# nucleotide string
56
57							my $seqobj = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
58							my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
59
60							# boolean tests
61							print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
62							print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
63							print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
64
65							=head1 DESCRIPTION
66
67							Codon tables are also called translation tables or genetic codes
68							since that is what they represent. A bit more complete picture
69							of the full complexity of codon usage in various taxonomic groups
70							is presented at the NCBI Genetic Codes Home page.
71
72							CodonTable is a BioPerl class that knows all current translation
73							tables that are used by primary nucleotide sequence databases
74							(GenBank, EMBL and DDBJ). It provides methods to output information
75							about tables and relationships between codons and amino acids.
76
77							This class and its methods recognized all common IUPAC ambiguity codes
78							for DNA, RNA and animo acids. The translation method follows the
79							conventions in EMBL and TREMBL databases.
80
81							It is a nuisance to separate RNA and cDNA representations of nucleic
82							acid transcripts. The CodonTable object accepts codons of both type as
83							input and allows the user to set the mode for output when reverse
84							translating. Its default for output is DNA.
85
86							Note:
87
88							This class deals primarily with individual codons and amino
89							acids. However in the interest of speed you can L
90							longer sequence, too. The full complexity of protein translation
91							is tackled by L.
92
93
94							The amino acid codes are IUPAC recommendations for common amino acids:
95
96							A Ala Alanine
97							R Arg Arginine
98							N Asn Asparagine
99							D Asp Aspartic acid
100							C Cys Cysteine
101							Q Gln Glutamine
102							E Glu Glutamic acid
103							G Gly Glycine
104							H His Histidine
105							I Ile Isoleucine
106							L Leu Leucine
107							K Lys Lysine
108							M Met Methionine
109							F Phe Phenylalanine
110							P Pro Proline
111							O Pyl Pyrrolysine (22nd amino acid)
112							U Sec Selenocysteine (21st amino acid)
113							S Ser Serine
114							T Thr Threonine
115							W Trp Tryptophan
116							Y Tyr Tyrosine
117							V Val Valine
118							B Asx Aspartic acid or Asparagine
119							Z Glx Glutamine or Glutamic acid
120							J Xle Isoleucine or Valine (mass spec ambiguity)
121							X Xaa Any or unknown amino acid
122
123
124							It is worth noting that, "Bacterial" codon table no. 11 produces an
125							polypeptide that is, confusingly, identical to the standard one. The
126							only differences are in available initiator codons.
127
128
129							NCBI Genetic Codes home page:
130							(Last update of the Genetic Codes: April 30, 2013)
131							https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
132
133							ASN.1 version with ids 1 to 25 is at:
134							ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
135
136							Thanks to Matteo diTomasso for the original Perl implementation
137							of these tables.
138
139							=head1 FEEDBACK
140
141							=head2 Mailing Lists
142
143							User feedback is an integral part of the evolution of this and other
144							Bioperl modules. Send your comments and suggestions preferably to the
145							Bioperl mailing lists Your participation is much appreciated.
146
147							bioperl-l@bioperl.org - General discussion
148							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
149
150							=head2 Support
151
152							Please direct usage questions or support issues to the mailing list:
153
154							I
155
156							rather than to the module maintainer directly. Many experienced and
157							reponsive experts will be able look at the problem and quickly
158							address it. Please include a thorough description of the problem
159							with code and data examples if at all possible.
160
161							=head2 Reporting Bugs
162
163							Report bugs to the Bioperl bug tracking system to help us keep track
164							the bugs and their resolution. Bug reports can be submitted via the
165							web:
166
167							https://github.com/bioperl/bioperl-live/issues
168
169							=head1 AUTHOR - Heikki Lehvaslaiho
170
171							Email: heikki-at-bioperl-dot-org
172
173							=head1 APPENDIX
174
175							The rest of the documentation details each of the object
176							methods. Internal methods are usually preceded with a _
177
178							=cut
179
180							# Let the code begin...
181
182							package Bio::Tools::CodonTable;
183	203					18989	use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
184	203			203		2213	%IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
	203					349
185	203			203		1116	use strict;
	203					318
	203					4289
186
187							# Object preamble - inherits from Bio::Root::Root
188	203			203		55182	use Bio::Tools::IUPAC;
	203					611
	203					5240
189	203			203		81909	use Bio::SeqUtils;
	203					538
	203					6928
190
191	203			203		1493	use base qw(Bio::Root::Root);
	203					350
	203					15445
192
193
194							# first set internal values for all translation tables
195
196							BEGIN {
197	203			203		1196	use constant CODONSIZE => 3;
	203					375
	203					61599
198	203			203		828	$GAP = '-';
199	203					687	$CODONGAP = $GAP x CODONSIZE;
200
201	203					1162	@NAMES = #id
202							(
203							'Strict', #0, special option for ATG-only start
204							'Standard', #1
205							'Vertebrate Mitochondrial',#2
206							'Yeast Mitochondrial',# 3
207							'Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma',#4
208							'Invertebrate Mitochondrial',#5
209							'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
210							'', '',
211							'Echinoderm and Flatworm Mitochondrial',#9
212							'Euplotid Nuclear',#10
213							'Bacterial, Archaeal and Plant Plastid',# 11
214							'Alternative Yeast Nuclear',# 12
215							'Ascidian Mitochondrial',# 13
216							'Alternative Flatworm Mitochondrial',# 14
217							'Blepharisma Nuclear',# 15
218							'Chlorophycean Mitochondrial',# 16
219							'', '', '', '',
220							'Trematode Mitochondrial',# 21
221							'Scenedesmus obliquus Mitochondrial', #22
222							'Thraustochytrium Mitochondrial', #23
223							'Pterobranchia Mitochondrial', #24
224							'Candidate Division SR1 and Gracilibacteria', #25
225							);
226
227	203					911	@TABLES =
228							qw(
229							FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
230							FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
231							FFLLSSSSYYCCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSVVVVAAAADDEEGGGG
232							FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
233							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
234							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
235							FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
236							'' ''
237							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
238							FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
239							FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
240							FFLLSSSSYY*CCWLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
241							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
242							FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
243							FFLLSSSSYYQCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
244							FFLLSSSSYYLCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
245							'' '' '' ''
246							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
247							FFLLSSSYYLCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
248							FFLSSSSYYCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
249							FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG
250							FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
251							);
252
253							# (bases used for these tables, for reference)
254							# 1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
255							# 2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
256							# 3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
257
258	203					880	@STARTS =
259							qw(
260							-----------------------------------M----------------------------
261							---M---------------M---------------M----------------------------
262							--------------------------------MMMM---------------M------------
263							----------------------------------MM----------------------------
264							--MM---------------M------------MMMM---------------M------------
265							---M----------------------------MMMM---------------M------------
266							-----------------------------------M----------------------------
267							'' ''
268							-----------------------------------M---------------M------------
269							-----------------------------------M----------------------------
270							---M---------------M------------MMMM---------------M------------
271							-------------------M---------------M----------------------------
272							---M------------------------------MM---------------M------------
273							-----------------------------------M----------------------------
274							-----------------------------------M----------------------------
275							-----------------------------------M----------------------------
276							'' '' '' ''
277							-----------------------------------M---------------M------------
278							-----------------------------------M----------------------------
279							--------------------------------M--M---------------M------------
280							---M---------------M---------------M---------------M------------
281							---M-------------------------------M---------------M------------
282							);
283
284	203					483	my @nucs = qw(t c a g);
285	203					362	my $x = 0;
286	203					448	($CODONS, $TRCOL) = ({}, {});
287	203					573	for my $i (@nucs) {
288	812					1013	for my $j (@nucs) {
289	3248					3252	for my $k (@nucs) {
290	12992					13127	my $codon = "$i$j$k";
291	12992					20760	$CODONS->{$codon} = $x;
292	12992					20312	$TRCOL->{$x} = $codon;
293	12992					13467	$x++;
294							}
295							}
296							}
297	203					1355	%IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
298	203					989	%IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
299	203					1041	%THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
300	203					1061	$VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
301	203					386791	$TERMINATOR = '*';
302							}
303
304							sub new {
305	505			505	1	1789	my($class,@args) = @_;
306	505					1422	my $self = $class->SUPER::new(@args);
307
308	505					1773	my($id) =
309							$self->_rearrange([qw(ID
310							)],
311							@args);
312
313	505	100				1401	$id = 1 if ( ! $id );
314	505	50				1743	$id && $self->id($id);
315	505					1017	return $self; # success - we hope!
316							}
317
318							=head2 id
319
320							Title : id
321							Usage : $obj->id(3); $id_integer = $obj->id();
322							Function: Sets or returns the id of the translation table. IDs are
323							integers from 0 (special ATG-only start) to 25, excluding
324							7-8 and 17-20 which have been removed. If an invalid ID is
325							given the method returns 1, the standard table.
326							Example :
327							Returns : value of id, a scalar, warn and fall back to 1 (standard table)
328							if specified id is not valid
329							Args : newvalue (optional)
330
331							=cut
332
333							sub id{
334	25860			25860	1	30290	my ($self,$value) = @_;
335	25860	100				32497	if( defined $value) {
336	644	100	66			2327	if ( not defined $TABLES[$value] or $TABLES[$value] eq '') {
337	1					12	$self->warn("Not a valid codon table ID [$value], using [1] instead ");
338	1					2	$value = 1;
339							}
340	644					1030	$self->{'id'} = $value;
341							}
342	25860					32088	return $self->{'id'};
343							}
344
345							=head2 name
346
347							Title : name
348							Usage : $obj->name()
349							Function: returns the descriptive name of the translation table
350							Example :
351							Returns : A string
352							Args : None
353
354
355							=cut
356
357							sub name{
358	1			1	1	3	my ($self) = @_;
359
360	1					3	my ($id) = $self->{'id'};
361	1					5	return $NAMES[$id];
362							}
363
364							=head2 tables
365
366							Title : tables
367							Usage : $obj->tables() or Bio::Tools::CodonTable->tables()
368							Function: returns a hash reference where each key is a valid codon
369							table id() number, and each value is the corresponding
370							codon table name() string
371							Example :
372							Returns : A hashref
373							Args : None
374
375
376							=cut
377
378							sub tables{
379	2			2	1	801	my %tables;
380	2					6	for my $id (0 .. $#NAMES) {
381	52					47	my $name = $NAMES[$id];
382	52	100				88	$tables{$id} = $name if $name;
383							}
384	2					5	return \%tables;
385							}
386
387							=head2 translate
388
389							Title : translate
390							Usage : $obj->translate('YTR')
391							Function: Returns a string of one letter amino acid codes from
392							nucleotide sequence input. The imput can be of any length.
393
394							Returns 'X' for unknown codons and codons that code for
395							more than one amino acid. Returns an empty string if input
396							is not three characters long. Exceptions for these are:
397
398							- IUPAC amino acid code B for Aspartic Acid and
399							Asparagine, is used.
400							- IUPAC amino acid code Z for Glutamic Acid, Glutamine is
401							used.
402							- if the codon is two nucleotides long and if by adding
403							an a third character 'N', it codes for a single amino
404							acid (with exceptions above), return that, otherwise
405							return empty string.
406
407							Returns empty string for other input strings that are not
408							three characters long.
409
410							Example :
411							Returns : a string of one letter ambiguous IUPAC amino acid codes
412							Args : ambiguous IUPAC nucleotide string
413
414
415							=cut
416
417							sub translate {
418	19902			19902	1	33549	my ($self, $seq, $complete_codon) = @_;
419	19902	100				27346	$self->throw("Calling translate without a seq argument!") unless defined $seq;
420	19901	100				24845	return '' unless $seq;
421
422	19900					24263	my $id = $self->id;
423	19900					21740	my ($partial) = 0;
424	19900	100				29891	$partial = 2 if length($seq) % CODONSIZE == 2;
425
426	19900					23630	$seq = lc $seq;
427	19900					22038	$seq =~ tr/u/t/;
428	19900					20072	my $protein = "";
429	19900	100				40211	if ($seq =~ /[^actg]/ ) { #ambiguous chars
430	5315					10086	for (my $i = 0; $i < (length($seq) - (CODONSIZE-1)); $i+= CODONSIZE) {
431	17779					19080	my $triplet = substr($seq, $i, CODONSIZE);
432	17779	100				25226	if( $triplet eq $CODONGAP ) {
		100
433	1527					2158	$protein .= $GAP;
434							} elsif (exists $CODONS->{$triplet}) {
435							$protein .= substr($TABLES[$id],
436	10946					17879	$CODONS->{$triplet},1);
437							} else {
438	5306					7625	$protein .= $self->_translate_ambiguous_codon($triplet);
439							}
440							}
441							} else { # simple, strict translation
442	14585					26710	for (my $i = 0; $i < (length($seq) - (CODONSIZE -1)); $i+=CODONSIZE) {
443	124418					126129	my $triplet = substr($seq, $i, CODONSIZE);
444	124418	50				143298	if( $triplet eq $CODONGAP ) {
445	0					0	$protein .= $GAP;
446							}
447	124418	50				139473	if (exists $CODONS->{$triplet}) {
448	124418					203514	$protein .= substr($TABLES[$id], $CODONS->{$triplet}, 1);
449							} else {
450	0					0	$protein .= 'X';
451							}
452							}
453							}
454	19900	100	100			32152	if ($partial == 2 && $complete_codon) { # 2 overhanging nucleotides
455	7					21	my $triplet = substr($seq, ($partial -4)). "n";
456	7	50				23	if( $triplet eq $CODONGAP ) {
		50
457	0					0	$protein .= $GAP;
458							} elsif (exists $CODONS->{$triplet}) {
459	0					0	my $aa = substr($TABLES[$id], $CODONS->{$triplet},1);
460	0					0	$protein .= $aa;
461							} else {
462	7					18	$protein .= $self->_translate_ambiguous_codon($triplet, $partial);
463							}
464							}
465	19900					37457	return $protein;
466							}
467
468							sub _translate_ambiguous_codon {
469	5313			5313		6707	my ($self, $triplet, $partial) = @_;
470	5313		100			12930	$partial \|\|= 0;
471	5313					5811	my $id = $self->id;
472	5313					4760	my $aa;
473	5313					7235	my @codons = $self->unambiguous_codons($triplet);
474	5313					5091	my %aas =();
475	5313					5709	foreach my $codon (@codons) {
476	401					662	$aas{substr($TABLES[$id],$CODONS->{$codon},1)} = 1;
477							}
478	5313					6141	my $count = scalar keys %aas;
479	5313	100				7359	if ( $count == 1 ) {
		100
480	141					216	$aa = (keys %aas)[0];
481							}
482							elsif ( $count == 2 ) {
483	13	100	66			55	if ($aas{'D'} and $aas{'N'}) {
		100	66
484	3					5	$aa = 'B';
485							}
486							elsif ($aas{'E'} and $aas{'Q'}) {
487	4					6	$aa = 'Z';
488							} else {
489	6	100				13	$partial ? ($aa = '') : ($aa = 'X');
490							}
491							} else {
492	5159	100				6512	$partial ? ($aa = '') : ($aa = 'X');
493							}
494	5313					13796	return $aa;
495							}
496
497							=head2 translate_strict
498
499							Title : translate_strict
500							Usage : $obj->translate_strict('ACT')
501							Function: returns one letter amino acid code for a codon input
502
503							Fast and simple translation. User is responsible to resolve
504							ambiguous nucleotide codes before calling this
505							method. Returns 'X' for unknown codons and an empty string
506							for input strings that are not three characters long.
507
508							It is not recommended to use this method in a production
509							environment. Use method translate, instead.
510
511							Example :
512							Returns : A string
513							Args : a codon = a three nucleotide character string
514
515
516							=cut
517
518							sub translate_strict{
519	3			3	1	9	my ($self, $value) = @_;
520	3					6	my $id = $self->{'id'};
521
522	3					8	$value = lc $value;
523	3					4	$value =~ tr/u/t/;
524
525	3	50				11	return '' unless length $value == 3;
526
527	3	50				10	return 'X' unless defined $CODONS->{$value};
528
529	3					13	return substr( $TABLES[$id], $CODONS->{$value}, 1 );
530							}
531
532							=head2 revtranslate
533
534							Title : revtranslate
535							Usage : $obj->revtranslate('G')
536							Function: returns codons for an amino acid
537
538							Returns an empty string for unknown amino acid
539							codes. Ambiguous IUPAC codes Asx,B, (Asp,D; Asn,N) and
540							Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
541							letter amino acid codes are accepted. '*' and 'Ter' are
542							used for terminator.
543
544							By default, the output codons are shown in DNA. If the
545							output is needed in RNA (tr/t/u/), add a second argument
546							'RNA'.
547
548							Example : $obj->revtranslate('Gly', 'RNA')
549							Returns : An array of three lower case letter strings i.e. codons
550							Args : amino acid, 'RNA'
551
552							=cut
553
554							sub revtranslate {
555	135			135	1	1131	my ($self, $value, $coding) = @_;
556	135					155	my @codons;
557
558	135	100				246	if (length($value) == 3 ) {
559	5					7	$value = lc $value;
560	5					7	$value = ucfirst $value;
561	5					9	$value = $THREELETTERSYMBOLS{$value};
562							}
563	135	100	100			829	if ( defined $value and $value =~ /$VALID_PROTEIN/
			66
564							and length($value) == 1
565							) {
566	133					228	my $id = $self->{'id'};
567
568	133					190	$value = uc $value;
569	133					151	my @aas = @{$IUPAC_AA{$value}};
	133					335
570	133					201	foreach my $aa (@aas) {
571							#print $aa, " -2\n";
572	145	100				302	$aa = '\' if $aa eq '';
573	145					1329	while ($TABLES[$id] =~ m/$aa/g) {
574	399					656	my $p = pos $TABLES[$id];
575	399					1348	push (@codons, $TRCOL->{--$p});
576							}
577							}
578							}
579
580	135	100	66			269	if ($coding and uc ($coding) eq 'RNA') {
581	1					5	for my $i (0..$#codons) {
582	1					2	$codons[$i] =~ tr/t/u/;
583							}
584							}
585
586	135					406	return @codons;
587							}
588
589							=head2 reverse_translate_all
590
591							Title : reverse_translate_all
592							Usage : my $iup_str = $cttable->reverse_translate_all($seq_object)
593							my $iup_str = $cttable->reverse_translate_all($seq_object,
594							$cutable,
595							15);
596							Function: reverse translates a protein sequence into IUPAC nucleotide
597							sequence. An 'X' in the protein sequence is converted to 'NNN'
598							in the nucleotide sequence.
599							Returns : a string
600							Args : a Bio::PrimarySeqI compatible object (mandatory)
601							a Bio::CodonUsage::Table object and a threshold if only
602							codons with a relative frequency above the threshold are
603							to be considered.
604							=cut
605
606							sub reverse_translate_all {
607	22			22	1	54	my ($self, $obj, $cut, $threshold) = @_;
608
609							## check args are OK
610
611	22	50	33			147	if (!$obj \|\| !$obj->isa('Bio::PrimarySeqI')){
612	0					0	$self->throw(" I need a Bio::PrimarySeqI object, not a [".
613							ref($obj) . "]");
614							}
615	22	50				69	if($obj->alphabet ne 'protein') {
616	0					0	$self->throw("Cannot reverse translate, need an amino acid sequence .".
617							"This sequence is of type [" . $obj->alphabet ."]");
618							}
619	22					40	my @data;
620	22					57	my @seq = split '', $obj->seq;
621
622							## if we're not supplying a codon usage table...
623	22	100	66			91	if( !$cut && !$threshold) {
624							## get lists of possible codons for each aa.
625	21					41	for my $aa (@seq) {
626	75	100				173	if ($aa =~ /x/i) {
627	7					21	push @data, (['NNN']);
628							}else {
629	68					130	my @cods = $self->revtranslate($aa);
630	68					158	push @data, \@cods;
631							}
632							}
633							}else{
634							#else we are supplying a codon usage table, we just want common codons
635							#check args first.
636	1	50				5	if(!$cut->isa('Bio::CodonUsage::Table')) {
637	0					0	$self->throw("I need a Bio::CodonUsage::Table object, not a [".
638							ref($cut). "].");
639							}
640	1					5	my $cod_ref = $cut->probable_codons($threshold);
641	1					2	for my $aa (@seq) {
642	21	100				37	if ($aa =~ /x/i) {
643	1					3	push @data, (['NNN']);
644	1					5	next;
645							}
646	20					25	push @data, $cod_ref->{$aa};
647							}
648							}
649
650	22					69	return $self->_make_iupac_string(\@data);
651							}
652
653							=head2 reverse_translate_best
654
655							Title : reverse_translate_best
656							Usage : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
657							Function: Reverse translates a protein sequence into plain nucleotide
658							sequence (GATC), uses the most common codon for each amino acid
659							Returns : A string
660							Args : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
661
662							=cut
663
664							sub reverse_translate_best {
665
666	1			1	1	4	my ($self, $obj, $cut) = @_;
667
668	1	50	33			8	if (!$obj \|\| !$obj->isa('Bio::PrimarySeqI')){
669	0					0	$self->throw(" I need a Bio::PrimarySeqI object, not a [".
670							ref($obj) . "]");
671							}
672	1	50				4	if ($obj->alphabet ne 'protein') {
673	0					0	$self->throw("Cannot reverse translate, need an amino acid sequence .".
674							"This sequence is of type [" . $obj->alphabet ."]");
675							}
676	1	50				10	if ( !$cut \| !$cut->isa('Bio::CodonUsage::Table')) {
677	0					0	$self->throw("I need a Bio::CodonUsage::Table object, not a [".
678							ref($cut). "].");
679							}
680
681	1					2	my $str = '';
682	1					4	my @seq = split '', $obj->seq;
683
684	1					6	my $cod_ref = $cut->most_common_codons();
685
686	1					2	for my $aa ( @seq ) {
687	21	100				29	if ($aa =~ /x/i) {
688	1					2	$str .= 'NNN';
689	1					3	next;
690							}
691	20	50				23	if ( defined $cod_ref->{$aa} ) {
692	20					24	$str .= $cod_ref->{$aa};
693							} else {
694	0					0	$self->throw("Input sequence contains invalid character: $aa");
695							}
696							}
697	1					7	return $str;
698							}
699
700							=head2 is_start_codon
701
702							Title : is_start_codon
703							Usage : $obj->is_start_codon('ATG')
704							Function: returns true (1) for all codons that can be used as a
705							translation start, false (0) for others.
706							Example : $myCodonTable->is_start_codon('ATG')
707							Returns : boolean
708							Args : codon
709
710							=cut
711
712							sub is_start_codon{
713	718			718	1	1029	shift->_codon_is( shift, \@STARTS, 'M' );
714							}
715
716							=head2 is_ter_codon
717
718							Title : is_ter_codon
719							Usage : $obj->is_ter_codon('GAA')
720							Function: returns true (1) for all codons that can be used as a
721							translation tarminator, false (0) for others.
722							Example : $myCodonTable->is_ter_codon('ATG')
723							Returns : boolean
724							Args : codon
725
726							=cut
727
728							sub is_ter_codon{
729	613			613	1	785	my ($self, $value) = @_;
730	613					697	my $id = $self->{'id'};
731
732							# We need to ensure U is mapped to T (ie. UAG)
733	613					647	$value = uc $value;
734	613					651	$value =~ tr/U/T/;
735
736	613	100				824	if (length $value != 3 ) {
737							# Incomplete codons are not stop codons
738	1					5	return 0;
739							} else {
740	612					578	my $result = 0;
741
742							# For all the possible codons, if any are not a stop
743							# codon, fail immediately
744	612					821	for my $c ( $self->unambiguous_codons($value) ) {
745	615					999	my $m = substr( $TABLES[$id], $CODONS->{$c}, 1 );
746	615	100				808	if($m eq $TERMINATOR) {
747	46					58	$result = 1;
748							} else {
749	569					2363	return 0;
750							}
751							}
752	43					149	return $result;
753							}
754							}
755
756							# desc: compares the passed value with a single entry in the given
757							# codon table
758							# args: a value (typically a three-char string like 'atg'),
759							# a reference to the appropriate set of codon tables,
760							# a single-character value to check for at the position in the
761							# given codon table
762							# ret: boolean, true if the given codon table contains the $key at the
763							# position corresponding to $value
764							sub _codon_is {
765	718			718		981	my ($self, $value, $table, $key ) = @_;
766
767	718	50				995	return 0 unless length $value == 3;
768
769	718					755	$value = lc $value;
770	718					750	$value =~ tr/u/t/;
771
772	718					851	my $id = $self->{'id'};
773	718					963	for my $c ( $self->unambiguous_codons($value) ) {
774	720					1125	my $m = substr( $table->[$id], $CODONS->{$c}, 1 );
775	720	100				1175	if ($m eq $key) { return 1; }
	55					145
776							}
777	663					1733	return 0;
778							}
779
780							=head2 is_unknown_codon
781
782							Title : is_unknown_codon
783							Usage : $obj->is_unknown_codon('GAJ')
784							Function: returns false (0) for all codons that are valid,
785							true (1) for others.
786							Example : $myCodonTable->is_unknown_codon('NTG')
787							Returns : boolean
788							Args : codon
789
790
791							=cut
792
793							sub is_unknown_codon{
794	3			3	1	6	my ($self, $value) = @_;
795	3					6	$value = lc $value;
796	3					5	$value =~ tr/u/t/;
797	3	100				6	return 1 unless $self->unambiguous_codons($value);
798	1					4	return 0;
799							}
800
801							=head2 unambiguous_codons
802
803							Title : unambiguous_codons
804							Usage : @codons = $self->unambiguous_codons('ACN')
805							Returns : array of strings (one-letter unambiguous amino acid codes)
806							Args : a codon = a three IUPAC nucleotide character string
807
808							=cut
809
810							sub unambiguous_codons{
811	6646			6646	1	7460	my ($self,$value) = @_;
812	6646					13263	my @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
	19937					28577
813
814	6646					8094	my @codons;
815	6646					5817	for my $i ( @{$nts[0]} ) {
	6646					10898
816	1658					1610	for my $j ( @{$nts[1]} ) {
	1658					1934
817	1633					1453	for my $k ( @{$nts[2]} ) {
	1633					1869
818	1804					3693	push @codons, lc "$i$j$k";
819							}}}
820	6646					9708	return @codons;
821							}
822
823							=head2 _unambiquous_codons
824
825							deprecated, now an alias for unambiguous_codons
826
827							=cut
828
829							sub _unambiquous_codons {
830	0			0		0	unambiguous_codons( undef, @_ );
831							}
832
833							=head2 add_table
834
835							Title : add_table
836							Usage : $newid = $ct->add_table($name, $table, $starts)
837							Function: Add a custom Codon Table into the object.
838							Know what you are doing, only the length of
839							the argument strings is checked!
840							Returns : the id of the new codon table
841							Args : name, a string, optional (can be empty)
842							table, a string of 64 characters
843							startcodons, a string of 64 characters, defaults to standard
844
845							=cut
846
847							sub add_table {
848	1			1	1	4	my ($self, $name, $table, $starts) = @_;
849
850	1		33			4	$name \|\|= 'Custom' . $#NAMES + 1;
851	1		33			31	$starts \|\|= $STARTS[1];
852	1	50	33			5	$self->throw('Suspect input!')
853							unless length($table) == 64 and length($starts) == 64;
854
855	1					4	push @NAMES, $name;
856	1					3	push @TABLES, $table;
857	1					3	push @STARTS, $starts;
858
859	1					5	return $#NAMES;
860							}
861
862							sub _make_iupac_string {
863	22			22		45	my ($self, $cod_ref) = @_;
864	22	50				60	if(ref($cod_ref) ne 'ARRAY') {
865	0					0	$self->throw(" I need a reference to a list of references to codons, ".
866							" not a [". ref($cod_ref) . "].");
867							}
868	22					100	my %iupac_hash = Bio::Tools::IUPAC->iupac_rev_iub();
869	22					59	my $iupac_string = ''; ## the string to be returned
870	22					45	for my $aa (@$cod_ref) {
871
872							## scan through codon positions, record the differing values,
873							# then look up in the iub hash
874	96					139	for my $index(0..2) {
875	288					675	my %h;
876	288					337	map { my $k = substr($_,$index,1);
	846					936
877	846					1089	$h{$k} = undef;} @$aa;
878	288					558	my $lookup_key = join '', sort{$a cmp $b}keys %h;
	226					320
879
880							## extend string
881	288					541	$iupac_string .= $iupac_hash{uc$lookup_key};
882							}
883							}
884	22					180	return $iupac_string;
885							}
886
887
888							1;