File Coverage

Bio/Seq/EncodedSeq.pm

Criterion	Covered	Total	%
statement	120	160	75.0
branch	64	110	58.1
condition	30	50	60.0
subroutine	9	11	81.8
pod	7	7	100.0
total	230	338	68.0

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Seq::EncodedSeq
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Aaron Mackey
7							#
8							# Copyright Aaron Mackey
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Seq::EncodedSeq - subtype of L to store DNA that encodes a protein
17
18							=head1 SYNOPSIS
19
20							$obj = Bio::Seq::EncodedSeq->new( -seq => $dna,
21							-encoding => "CCCCCCCIIIIICCCCC",
22							-start => 1,
23							-strand => 1,
24							-length => 17 );
25
26							# splice out (and possibly revcomp) the coding sequence
27							$cds = obj->cds;
28
29							# obtain the protein translation of the sequence
30							$prot = $obj->translate;
31
32							# other access/inspection routines as with Bio::LocatableSeq and
33							# Bio::SeqI; note that coordinates are relative only to the DNA
34							# sequence, not it's implicit encoded protein sequence.
35
36							=head1 DESCRIPTION
37
38							Bio::Seq::EncodedSeq is a L
39							object that holds a DNA sequence as well as information about the
40							coding potential of that DNA sequence. It is meant to be useful in an
41							alignment context, where the DNA may contain frameshifts, gaps and/or
42							introns, or in describing the transcript of a gene. An EncodedSeq
43							provides the ability to access the "spliced" coding sequence, meaning
44							that all introns and gaps are removed, and any frameshifts are
45							adjusted to provide a "clean" CDS.
46
47							In order to make simultaneous use of either the DNA or the implicit
48							encoded protein sequence coordinates, please see
49							L.
50
51							=head1 ENCODING
52
53							We use the term "encoding" here to refer to the series of symbols that
54							we use to identify which residues of a DNA sequence are protein-coding
55							(i.e. part of a codon), intronic, part of a 5' or 3', frameshift
56							"mutations", etc. From this information, a Bio::Seq::EncodedSeq is
57							able to "figure out" its translational CDS. There are two sets of
58							coding characters, one termed "implicit" and one termed "explicit".
59
60							The "implicit" encoding is a bit simpler than the "explicit" encoding:
61							'C' is used for any nucleotide that's part of a codon, 'U' for any
62							UTR, etc. The full list is shown below:
63
64							Code Meaning
65							---- -------
66							C coding
67							I intronic
68							U untranslated
69							G gapped (for use in alignments)
70							F a "forward", +1 frameshift
71							B a "backward", -1 frameshift
72
73							The "explicit" encoding is just an expansion of the "implicit"
74							encoding, to denote phase:
75
76							Code Meaning
77							---- -------
78							C coding, 1st codon position
79							D coding, 2nd codon position
80							E coding, 3rd codon position
81
82							I intronic, phase 0 (relative to intron begin)
83							J intronic, phase 1
84							K intronic, phase 2
85
86							U untranslated 3'UTR
87							V untranslated 5'UTR
88
89							G gapped (for use in alignments)
90							F a "forward", +1 frameshift
91							B a "backward", -1 frameshift
92
93							Note that the explicit coding is meant to provide easy access to
94							position/phase specific nucleotides:
95
96							$obj = Bio::Seq::EncodedSeq->new(-seq => "ACAATCAGACTACG...",
97							-encoding => "CCCCCCIII..."
98							);
99
100							# fetch arrays of nucleotides at each codon position:
101							my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1);
102							my @pos2 = $obj->dnaseq(encoding => 'D');
103							my @pos3 = $obj->dnaseq(encoding => 'E');
104
105							# fetch arrays of "3-1" codon dinucleotides, useful for genomic
106							# signature analyses without compounding influences of codon bias:
107							my @pairs = $obj->dnaseq(encoding => 'EC');
108
109							=head1 FEEDBACK
110
111							=head2 Mailing Lists
112
113							User feedback is an integral part of the evolution of this and other
114							Bioperl modules. Send your comments and suggestions preferably to one
115							of the Bioperl mailing lists. Your participation is much appreciated.
116
117							bioperl-l@bioperl.org - General discussion
118							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
119
120							=head2 Support
121
122							Please direct usage questions or support issues to the mailing list:
123
124							I
125
126							rather than to the module maintainer directly. Many experienced and
127							reponsive experts will be able look at the problem and quickly
128							address it. Please include a thorough description of the problem
129							with code and data examples if at all possible.
130
131							=head2 Reporting Bugs
132
133							Report bugs to the Bioperl bug tracking system to help us keep track
134							the bugs and their resolution. Bug reports can be submitted via the
135							web:
136
137							https://github.com/bioperl/bioperl-live/issues
138
139							=head1 AUTHOR - Aaron Mackey
140
141							Email amackey@virginia.edu
142
143							=head1 APPENDIX
144
145							The rest of the documentation details each of the object
146							methods. Internal methods are usually preceded with a _
147
148							=cut
149
150
151
152							package Bio::Seq::EncodedSeq;
153
154	1			1		465	use strict;
	1					1
	1					27
155
156	1			1		4	use base qw(Bio::LocatableSeq);
	1					2
	1					276
157
158
159							=head2 new
160
161							Title : new
162							Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG",
163							-encoding => "CCCCCCFCCCCCC",
164							-id => "myseq",
165							-start => 1,
166							-end => 13,
167							-strand => 1
168							);
169							Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA
170							sequence
171							Returns : a new Bio::Seq::EncodedSeq object
172
173							Args : seq - primary nucleotide sequence used to encode the
174							protein; note that any positions involved in a
175							gap ('G') or backward frameshift ('B') should
176							have one or more gap characters; if the encoding
177							specifies G or B, but no (or not enough) gap
178							characters exist, they'll be added; similarly,
179							if there are gap characters without a
180							corresponding G or B encoding, G's will be
181							inserted into the encoding. This allows some
182							flexibility in specifying your sequence and
183							coding without having to calculate a lot of the
184							encoding for yourself.
185
186							encoding - a string of characters (see Encoding Table)
187							describing backwards frameshifts implied by the
188							encoding but not present in the sequence will be
189							added (as '-'s) to the sequence. If not
190							supplied, it will be assumed that all positions
191							are coding (C). Encoding may include either
192							implicit phase encoding characters (i.e. "CCC")
193							and/or explicit encoding characters (i.e. "CDE").
194							Additionally, prefixed numbers may be used to
195							denote repetition (i.e. "27C3I28C").
196
197							Alternatively, encoding may be a hashref
198							datastructure, with encoding characters as keys
199							and Bio::LocationI objects (or arrayrefs of
200							Bio::LocationI objects) as values, e.g.:
201
202							{ C => [ Bio::Location::Simple->new(1,9),
203							Bio::Location::Simple->new(11,13) ],
204							F => Bio::Location::Simple->new(10,10),
205							} # same as "CCCCCCCCCFCCC"
206
207							Note that if the location ranges overlap, the
208							behavior of the encoding will be undefined
209							(well, it will be defined, but only according to
210							the order in which the hash keys are read, which
211							is basically undefined ... just don't do that).
212
213							id, start, end, strand - as with Bio::LocatableSeq; note
214							that the coordinates are relative to the
215							encoding DNA sequence, not the implicit protein
216							sequence. If strand is reversed, then the
217							encoding is assumed to be relative to the
218							reverse strand as well.
219
220							=cut
221
222							sub new {
223	4			4	1	149	my ($self, @args) = @_;
224	4					20	$self = $self->SUPER::new(@args, -alphabet => 'dna');
225	4					12	my ($enc) = $self->_rearrange([qw(ENCODING)], @args);
226							# set the real encoding:
227	4	50				11	if ($enc) {
228	0					0	$self->encoding($enc);
229							} else {
230	4					8	$self->_recheck_encoding;
231							}
232	4					26	return $self;
233							}
234
235
236							=head2 encoding
237
238							Title : encoding
239							Usage : $obj->encoding("CCCCCC");
240							$obj->encoding( -encoding => { I => $location } );
241							$enc = $obj->encoding(-explicit => 1);
242							$enc = $obj->encoding("CCCCCC", -explicit => 1);
243							$enc = $obj->encoding(-location => $location,
244							-explicit => 1,
245							-absolute => 1 );
246							Function: get/set the objects encoding, either globally or by location(s).
247							Returns : the (possibly new) encoding string.
248							Args : encoding - see the encoding argument to the new() function.
249
250							explicit - whether or not to return explicit phase
251							information in the coding (i.e. "CCC" becomes
252							"CDE", "III" becomes "IJK", etc); defaults to 0.
253
254							location - optional; location to get/set the encoding.
255							Defaults to the entire sequence.
256
257							absolute - whether or not the locational elements (either
258							in the encoding hashref or the location
259							argument) are relative to the absolute start/end
260							of the Bio::LocatableSeq, or to the internal,
261							relative coordinate system (beginning at 1);
262							defaults to 0 (i.e. relative)
263
264							=cut
265
266							sub encoding {
267	8			8	1	20	my ($self, @args) = @_;
268	8					27	my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args);
269
270	8	100				28	if (!$enc) {
		50
		50
271							# do nothing; _recheck_encoding will fix for us, if necessary
272							} elsif (ref $enc eq 'HASH') {
273	0					0	$self->throw( -class => 'Bio::Root::NotImplemented',
274							-text => "Hashref functionality not yet implemented;\nplease email me if you really need this.");
275							#TODO: finish all this
276	0					0	while (my ($char, $locs) = each %$enc) {
277	0	0				0	if (ref $locs eq 'ARRAY') {
		0
278							} elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) {
279							} else {
280	0					0	$self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
	0					0
281							}
282							}
283							} elsif (! ref $enc) {
284	4					5	$enc = uc $enc;
285	4	100	66			19	$exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o);
286
287	4	100				11	if ($enc =~ m/\d/o) { # numerically "enhanced" encoding
288	1					2	my $numenc = $enc;
289	1					2	$enc = '';
290	1					5	while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) {
291	4					8	my ($num, $char) = ($1, $2);
292	4	100				8	$num = 1 unless $num;
293	4					11	$enc .= $char x $num;
294							}
295							}
296
297	4	50	66			20	if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) {
		50	33
298	0					0	$self->throw("Unrecognized character '$1' in implicit encoding");
299							} elsif ($enc =~ m/[^CDEIJKUVGFB]/) {
300	0					0	$self->throw("Unrecognized character '$1' in explicit encoding");
301							}
302
303	4	100				8	if ($loc) { # a global location, over which to apply the specified encoding.
304
305							# balk if too many non-gap characters present in encoding:
306	1					3	my ($ct) = $enc =~ tr/GB/GB/;
307	1					3	$ct = length($enc) - $ct;
308	1	50	33			3	$self->throw("Location length doesn't match number of coding chars in encoding!")
309							if ($loc->location_type eq 'EXACT' && $loc->length != $ct);
310
311	1					2	my $start = $loc->start;
312	1					17	my $end = $loc->end;
313
314							# strip any encoding that hangs off the ends of the sequence:
315	1	50				3	if ($start < $self->start) {
316	0					0	my $diff = $self->start - $start;
317	0					0	$start = $self->start;
318	0					0	$enc = substr($enc, $diff);
319							}
320	1	50				4	if ($end > $self->end) {
321	0					0	my $diff = $end - $self->end;
322	0					0	$end = $self->end;
323	0					0	$enc = substr($enc, -$diff);
324							}
325
326	1					2	my $currenc = $self->{_encoding};
327	1					3	my $currseq = $self->seq;
328
329	1					3	my ($spanstart, $spanend) = ($self->column_from_residue_number($start),
330							$self->column_from_residue_number($end) );
331
332	1	50				3	if ($currseq) {
333							# strip any gaps in sequence spanned by this location:
334	1	50				3	($spanstart, $spanend) = ($spanend, $spanstart) if $self->strand < 0;
335	1	50				1	my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}})
	1					3
336	1	50				4	(.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}})
337							(.*)
338							/x;
339	1		50			6	$in \|\|= '';
340	1					2	$in =~ s/[\.\-]+//g;
341	1		50			4	$currseq = ($before\|\|'') . $in. ($after\|\|'');
			50
342							# change seq without changing the alphabet
343	1					4	$self->seq($currseq,$self->alphabet());
344							}
345
346	1	50				3	$currenc = reverse $currenc if $self->strand < 0;
347	1	50				3	substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1),
		50
348							$self->strand >= 0 ? $enc : reverse $enc);
349	1	50				4	$currenc = reverse $currenc if $self->strand < 0;
350
351	1					2	$self->{_encoding} = $currenc;
352	1					3	$self->_recheck_encoding;
353
354	1					2	$currenc = $self->{_encoding};
355	1	50				2	$currenc = reverse $currenc if $self->strand < 0;
356	1					3	$enc = substr($currenc, $spanstart, length $enc);
357	1	50				2	$enc = reverse $enc if $self->strand < 0;
358
359	1	50				4	return $exp ? $enc: $self->_convert2implicit($enc);
360
361							} else {
362							# presume a global redefinition; strip any current gap
363							# characters in the sequence so they don't corrupt the
364							# encoding
365	3					6	my $dna = $self->seq;
366	3					10	$dna =~ s/[\.\-]//g;
367	3					7	$self->seq($dna, 'dna');
368	3					4	$self->{_encoding} = $enc;
369							}
370							} else {
371	0					0	$self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
	0					0
372							}
373
374	7					18	$self->_recheck_encoding();
375
376	7	100				19	return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding});
377							}
378
379
380							sub _convert2implicit {
381	8			8		13	my ($self, $enc) = @_;
382
383	8					30	$enc =~ s/[DE]/C/g;
384	8					11	$enc =~ s/[JK]/I/g;
385	8					14	$enc =~ s/V/U/g;
386
387	8					29	return $enc;
388							}
389
390
391							sub _recheck_encoding {
392
393	12			12		15	my $self = shift;
394
395	12		100			46	my @enc = split //, ($self->{_encoding} \|\| '');
396
397	12					30	my @nt = split(//, $self->SUPER::seq);
398	12	100	66			27	@nt = reverse @nt if $self->strand && $self->strand < 0;
399
400							# make sure an encoding exists!
401	12	100				24	@enc = ('C') x scalar grep { !/[\.\-]/o } @nt
	31					52
402							unless @enc;
403
404							# check for gaps to be truly present in the sequence
405							# and vice versa
406	12					12	my $i;
407	12		100			41	for ($i = 0 ; $i < @nt && $i < @enc ; $i++) {
408	101	100	100			486	if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) {
		100	100
409	5					18	splice(@enc, $i, 0, 'G');
410							} elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) {
411	10					32	splice(@nt, $i, 0, '-');
412							}
413							}
414	12	50				29	if ($i < @enc) {
		100
415							# extra encoding; presumably all gaps?
416	0					0	for ( ; $i < @enc ; $i++) {
417	0	0				0	if ($enc[$i] =~ m/[GB]/o) {
418	0					0	push @nt, '-';
419							} else {
420	0					0	$self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc]));
421							}
422							}
423							} elsif ($i < @nt) {
424	3					4	for ( ; $i < @nt ; $i++) {
425	8	100				16	if ($nt[$i] =~ m/[\.\-]/o) {
426	2					4	push @enc, 'G';
427							} else {
428	6					9	push @enc, 'C';
429							}
430							}
431							}
432
433	12					21	my @cde_array = qw(C D E);
434	12					18	my @ijk_array = qw(I J K);
435							# convert any leftover implicit coding into explicit coding
436	12					16	my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0);
437	12					20	for ($i = 0 ; $i < @enc ; $i++) {
438	109	100				238	if ($enc[$i] =~ m/[CDE]/o) {
		50
		50
		100
		50
439	68					70	my $temp_index = $Cct %3;
440	68					60	$enc[$i] = $cde_array[$temp_index];
441	68					53	$Cct++; $Ict = 0; $Uct = 1;
	68					57
	68					59
442	68	50	33			119	$self->warn("3' untranslated encoding (V) seen prior to other coding symbols")
443							if ($Vct && !$Vwarned++);
444							} elsif ($enc[$i] =~ m/[IJK]/o) {
445	0					0	$enc[$i] = $ijk_array[$Ict % 3];
446	0					0	$Ict++; $Uct = 1;
	0					0
447	0	0	0			0	$self->warn("3' untranslated encoding (V) seen before other coding symbols")
448							if ($Vct && !$Vwarned++);
449							} elsif ($enc[$i] =~ m/[UV]/o) {
450	0	0				0	if ($Uct == 1) {
451	0					0	$enc[$i] = 'V';
452	0					0	$Vct = 1;
453							}
454							} elsif ($enc[$i] eq 'B') {
455	4					4	$Cct++; $Ict++
	4					5
456							} elsif ($enc[$i] eq 'G') {
457							# gap; leave alone
458							}
459							}
460
461	12	100	66			23	@nt = reverse @nt if $self->strand && $self->strand < 0;
462	12					45	$self->seq(join('', @nt), 'dna');
463
464	12					42	$self->{_encoding} = join '', @enc;
465							}
466
467
468							=head2 cds
469
470							Title : cds
471							Usage : $cds = $obj->cds(-nogaps => 1);
472							Function: obtain the "spliced" DNA sequence, by removing any
473							nucleotides that participate in an UTR, forward frameshift
474							or intron, and replacing any unknown nucleotide implied by
475							a backward frameshift or gap with N's.
476							Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only
477							of "CCCC..".
478							Args : nogaps - strip any gap characters (resulting from 'G' or 'B'
479							encodings), rather than replacing them with N's.
480
481							=cut
482
483							sub cds {
484	2			2	1	9	my ($self, @args) = @_;
485
486	2					11	my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args);
487	2	50				11	$nogaps = 0 unless defined $nogaps;
488
489	2	50				8	my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq;
490	2					7	my @enc = split //, $self->_convert2implicit($self->{_encoding});
491
492	2					4	my ($start, $end) = (0, scalar @nt);
493
494	2	50				5	if ($loc) {
495	0					0	$start = $loc->start;
496	0	0				0	$start++ if $loc->location_type eq 'IN-BETWEEN';
497	0					0	$start = $self->column_from_residue_number($start);
498	0					0	$start--;
499
500	0					0	$end = $loc->end;
501	0					0	$end = $self->column_from_residue_number($end);
502
503	0	0				0	($start, $end) = ($end, $start) if $self->strand < 0;
504	0					0	$start--;
505							}
506
507	2					6	for (my $i = $start ; $i < $end ; $i++) {
508	20	50	33			82	if ($enc[$i] eq 'I' \|\| $enc[$i] eq 'U' \|\| $enc[$i] eq 'F') {
		100	33
			100
509							# remove introns, untranslated and forward frameshift nucleotides
510	0					0	$nt[$i] = undef;
511							} elsif ($enc[$i] eq 'G' \|\| $enc[$i] eq 'B') {
512							# replace gaps and backward frameshifts with N's, unless asked not to.
513	8	50				15	$nt[$i] = $nogaps ? undef : 'N';
514							}
515							}
516
517							return ($self->can_call_new ? ref($self) : __PACKAGE__)->new(
518	2	50				7	-seq => join('', grep { defined } @nt[$start..--$end]),
	20					28
519							-start => $self->start,
520							-end => $self->end,
521							-strand => 1,
522							-alphabet => 'dna' );
523							}
524
525
526							=head2 translate
527
528							Title : translate
529							Usage : $prot = $obj->translate(@args);
530							Function: obtain the protein sequence encoded by the underlying DNA
531							sequence; same as $obj->cds()->translate(@args).
532							Returns : a Bio::PrimarySeq object.
533							Args : same as the translate() function of Bio::PrimarySeqI
534
535							=cut
536
537	1			1	1	5	sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) };
538
539
540							=head2 protseq
541
542							Title : seq
543							Usage : $protseq = $obj->protseq();
544							Function: obtain the raw protein sequence encoded by the underlying
545							DNA sequence; This is the same as calling
546							$obj->translate()->seq();
547							Returns : a string of single-letter amino acid codes
548							Args : same as the seq() function of Bio::PrimarySeq; note that this
549							function may not be used to set the protein sequence; see
550							the dnaseq() function for that.
551
552							=cut
553
554	0			0	1	0	sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq };
555
556
557							=head2 dnaseq
558
559							Title : dnaseq
560							Usage : $dnaseq = $obj->dnaseq();
561							$obj->dnaseq("ACGTGTCGT", "CCCCCCCCC");
562							$obj->dnaseq(-seq => "ATG",
563							-encoding => "CCC",
564							-location => $loc );
565							@introns = $obj->$dnaseq(-encoding => 'I')
566							Function: get/set the underlying DNA sequence; will overwrite any
567							current DNA and/or encoding information present.
568							Returns : a string of single-letter nucleotide codes, including any
569							gaps implied by the encoding.
570							Args : seq - the DNA sequence to be used as a replacement
571							encoding - the encoding of the DNA sequence (see the new()
572							constructor); defaults to all 'C' if setting a
573							new DNA sequence. If no new DNA sequence is
574							being provided, then the encoding is used as a
575							"filter" for which to return fragments of
576							non-overlapping DNA that match the encoding.
577							location - optional, the location of the DNA sequence to
578							get/set; defaults to the entire sequence.
579
580							=cut
581
582							sub dnaseq {
583	0			0	1	0	my ($self, @args) = @_;
584	0					0	my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args);
585	0					0	return $self;
586							}
587
588
589							# need to overload this so that we truncate both the seq and the encoding!
590							sub trunc {
591	1			1	1	5	my ($self, $start, $end) = @_;
592	1					11	my $new = $self->SUPER::trunc($start, $end);
593	1					2	$start--;
594	1					2	my $enc = $self->{_encoding};
595	1	50				2	$enc = reverse $enc if $self->strand < 0;
596	1					3	$enc = substr($enc, $start, $end - $start);
597	1	50				3	$enc = reverse $enc if $self->strand < 0;
598	1					3	$new->encoding($enc);
599	1					5	return $new;
600							}
601
602							1;