File Coverage

Bio/Align/Utilities.pm

Criterion	Covered	Total	%
statement	120	192	62.5
branch	20	52	38.4
condition	9	25	36.0
subroutine	10	12	83.3
pod	6	6	100.0
total	165	287	57.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Align::Utilities;
2	3			3		1703	use strict;
	3					4
	3					74
3	3			3		12	use warnings;
	3					3
	3					72
4	3			3		10	use Carp;
	3					4
	3					153
5	3			3		514	use Bio::Root::Version;
	3					22
	3					15
6
7	3			3		103	use Exporter 'import';
	3					4
	3					203
8							our @EXPORT_OK = qw(
9							aa_to_dna_aln
10							bootstrap_replicates
11							cat
12							bootstrap_replicates_codons
13							dna_to_aa_aln
14							most_common_sequences
15							);
16							our %EXPORT_TAGS = (all => \@EXPORT_OK);
17
18							#
19							# BioPerl module for Bio::Align::Utilities
20							#
21							# Please direct questions and support issues to <bioperl-l@bioperl.org>
22							#
23							# Cared for by Jason Stajich and Brian Osborne
24							#
25							# Copyright Jason Stajich
26							#
27							# You may distribute this module under the same terms as perl itself
28
29							# POD documentation - main docs before the code
30
31							=head1 NAME
32
33							Bio::Align::Utilities - A collection of utilities regarding converting
34							and manipulating alignment objects
35
36							=head1 SYNOPSIS
37
38							use Bio::Align::Utilities qw(:all);
39
40							# Even if the protein alignments are local make sure the start/end
41							# stored in the LocatableSeq objects are to the full length protein.
42							# The coding sequence that is passed in should still be the full
43							# length CDS as the nt alignment will be generated.
44							# %dnaseqs is a hash of CDS sequences (spliced)
45							my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
46
47							# The reverse, which is simpler. The input alignment has to be
48							# translate-able, with gap lengths and an overall length divisible by 3
49							my $aa_aln = dna_to_aa_aln($dna_aln);
50
51							# Generate bootstraps
52							my $replicates = bootstrap_replicates($aln,$count);
53
54							=head1 DESCRIPTION
55
56							This module contains utility methods for manipulating sequence
57							alignments (L<Bio::Align::AlignI>) objects.
58
59							The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
60							program by Bill Pearson available at
61							ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
62							is a pure-Perl implementation, but just to mention that if anything
63							seems odd you can check the alignments generated against Bill's
64							program.
65
66							=head1 FEEDBACK
67
68							=head2 Mailing Lists
69
70							User feedback is an integral part of the evolution of this and other
71							Bioperl modules. Send your comments and suggestions preferably to
72							the Bioperl mailing list. Your participation is much appreciated.
73
74							bioperl-l@bioperl.org - General discussion
75							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76
77							=head2 Support
78
79							Please direct usage questions or support issues to the mailing list:
80
81							I<bioperl-l@bioperl.org>
82
83							rather than to the module maintainer directly. Many experienced and
84							responsive experts will be able to look at the problem and quickly
85							address it. Please include a thorough description of the problem
86							with code and data examples if at all possible.
87
88							=head2 Reporting Bugs
89
90							Report bugs to the Bioperl bug tracking system to help us keep track
91							of the bugs and their resolution. Bug reports can be submitted via the
92							web:
93
94							https://github.com/bioperl/bioperl-live/issues
95
96							=head1 AUTHOR - Jason Stajich
97
98							Email jason-at-bioperl-dot-org
99
100							=head1 APPENDIX
101
102							The rest of the documentation details each of the object methods.
103							Internal methods are usually preceded with a _
104
105							=cut
106
107	3			3		13	use constant CODONSIZE => 3;
	3					3
	3					5200
108							our $GAP = '-';
109							our $CODONGAP = $GAP x CODONSIZE;
110
111							=head2 aa_to_dna_aln
112
113							Title : aa_to_dna_aln
114							Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
115							Function: Will convert an AA alignment to DNA space given the
116							corresponding DNA sequences. Note that this method expects
117							the DNA sequences to be in frame +1 (GFF frame 0) as it will
118							start to project into coordinates starting at the first base of
119							the DNA sequence, if this alignment represents a different
120							frame for the cDNA you will need to edit the DNA sequences
121							to remove the 1st or 2nd bases (and revcom if things should be).
122							Returns : Bio::Align::AlignI object
123							Args : 2 arguments, the alignment and a hashref.
124							Alignment is a Bio::Align::AlignI of amino acid sequences.
125							The hash reference should have keys which are
126							the display_ids for the aa
127							sequences in the alignment and the values are a
128							Bio::PrimarySeqI object for the corresponding
129							spliced cDNA sequence.
130
131							See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
132
133							=cut
134
135							sub aa_to_dna_aln {
136	2			2	1	19	my ( $aln, $dnaseqs ) = @_;
137	2	50	33			28	unless ( defined $aln
			33
138							&& ref($aln)
139							&& $aln->isa('Bio::Align::AlignI') )
140							{
141	0					0	croak(
142							'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
143							);
144							}
145	2					10	my $alnlen = $aln->length;
146	2					11	my $dnaalign = Bio::SimpleAlign->new();
147	2					12	$aln->map_chars( '\.', $GAP );
148
149	2					6	foreach my $seq ( $aln->each_seq ) {
150	17					42	my $aa_seqstr = $seq->seq();
151	17					34	my $pepid = $seq->display_id;
152	17		33			52	my $dnaseq = $dnaseqs->{$pepid} \|\| $aln->throw( "cannot find " . $seq->display_id );
153	17					38	my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
154	17					65	$dnaseq = $dnaseq->seq();
155	17					45	my $dnalen = $dnaseqs->{$pepid}->length;
156	17		33			38	my $dnaid = $dnaseqs->{$pepid}->display_id \|\| $pepid; # try to use DNAseq obj ID (issue #137)
157	17					24	my $nt_seqstr;
158	17					17	my $j = 0;
159	17					35	for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
160	5552					5420	my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
161	5552	100	66			9776	if ( $char eq $GAP \|\| $j >= $dnalen ) {
162	765					965	$nt_seqstr .= $CODONGAP;
163							}
164							else {
165	4787					4654	$nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
166	4787					5912	$j += CODONSIZE;
167							}
168							}
169	17					32	$nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
170
171	17					58	my $newdna = Bio::LocatableSeq->new(
172							-display_id => $dnaid,
173							-alphabet => 'dna',
174							-start => $start_offset + 1,
175							-end => ( $seq->end * CODONSIZE ),
176							-strand => 1,
177							-seq => $nt_seqstr
178							);
179	17					60	$dnaalign->add_seq($newdna);
180							}
181	2					14	return $dnaalign;
182							}
183
184							=head2 dna_to_aa_aln
185
186							Title : dna_to_aa_aln
187							Usage : my $aa_aln = dna_to_aa_aln($dna_aln);
188							Function: Convert a DNA alignment to an amino acid alignment where
189							the length of all alignment strings and the lengths of any
190							gaps must be divisible by 3
191							Returns : Bio::Align::AlignI object
192							Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences
193
194							See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
195
196							=cut
197
198							sub dna_to_aa_aln {
199	1			1	1	15	my $dna_aln = shift;
200	1	50	33			16	unless ( defined $dna_aln
			33
201							&& ref($dna_aln)
202							&& $dna_aln->isa('Bio::Align::AlignI') ) {
203	0					0	croak(
204							'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
205							);
206							}
207	1					8	my $codon_table = Bio::Tools::CodonTable->new;
208	1					9	my $aa_aln = Bio::SimpleAlign->new;
209
210	1					10	for my $seq ( $dna_aln->each_seq ) {
211	14					41	my ($aa_str, $aa_len);
212	14					53	my @aln_str = split '', $seq->seq;
213	14	50				76	croak("All lines in the alignment must have lengths divisible by 3")
214							if ( scalar(@aln_str) % CODONSIZE );
215
216	14					44	while ( @aln_str ) {
217	5516					11129	my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
218
219	5516	100				17733	if ( $triplet =~ /^[GATC]+$/i ) {
		50
220	4754					9798	$aa_str .= $codon_table->translate($triplet);
221	4754					10776	$aa_len++;
222							}
223							elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
224	762					1835	$aa_str .= $GAP;
225							}
226							else {
227	0					0	croak("The triplet '$triplet' is neither a valid codon nor all gaps");
228							}
229							}
230	14					137	my $new_aa = Bio::LocatableSeq->new(
231							-display_id => $seq->display_id,
232							-alphabet => 'protein',
233							-start => 1,
234							-end => $aa_len,
235							-strand => 1,
236							-seq => $aa_str
237							);
238
239	14					111	$aa_aln->add_seq($new_aa);
240							}
241
242	1					9	$aa_aln;
243							}
244
245							=head2 bootstrap_replicates
246
247							Title : bootstrap_replicates
248							Usage : my $alns = &bootstrap_replicates($aln,100);
249							Function: Generate a pseudo-replicate of the data by randomly
250							sampling, with replacement, the columns from an alignment for
251							the non-parametric bootstrap.
252							Returns : Arrayref of L<Bio::SimpleAlign> objects
253							Args : L<Bio::SimpleAlign> object
254							Number of replicates to generate
255
256							=cut
257
258							sub bootstrap_replicates {
259	3			3	1	15	my ( $aln, $count ) = @_;
260	3		50			12	$count \|\|= 1;
261	3					15	my $alen = $aln->length;
262	3					6	my ( @seqs, @nm );
263	3					16	$aln->set_displayname_flat(1);
264	3					9	for my $s ( $aln->each_seq ) {
265	31					44	push @seqs, $s->seq();
266	31					40	push @nm, $s->id;
267							}
268	3					15	my ( @alns, $i );
269	3					16	while ( $count-- > 0 ) {
270	23					34	my @newseqs;
271	23					61	for ( $i = 0 ; $i < $alen ; $i++ ) {
272	7988					9835	my $index = int( rand($alen) );
273	7988					7261	my $c = 0;
274	7988					8370	for (@seqs) {
275	110644					130344	$newseqs[ $c++ ] .= substr( $_, $index, 1 );
276							}
277							}
278	23					181	my $newaln = Bio::SimpleAlign->new();
279	23					36	my $i = 0;
280	23					49	for my $s (@newseqs) {
281	289					6274	( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
282	289					1334	$newaln->add_seq(
283							Bio::LocatableSeq->new(
284							-start => 1,
285							-end => length($tmp),
286							-display_id => $nm[ $i++ ],
287							-seq => $s
288							)
289							);
290							}
291	23					129	push @alns, $newaln;
292							}
293	3					24	return \@alns;
294							}
295
296							=head2 bootstrap_replicates_codons
297
298							Title : bootstrap_replicates_codons
299							Usage : my $alns = &bootstrap_replicates_codons($aln,100);
300							Function: Generate a pseudo-replicate of the data by randomly
301							sampling, with replacement, the columns from a codon alignment for
302							the non-parametric bootstrap. The alignment is assumed to start on
303							the first position of a codon.
304							Returns : Arrayref of L<Bio::SimpleAlign> objects
305							Args : L<Bio::SimpleAlign> object
306							Number of replicates to generate
307
308							=cut
309
310							sub bootstrap_replicates_codons {
311	0			0	1	0	my ( $aln, $count ) = @_;
312	0		0			0	$count \|\|= 1;
313	0					0	my $alen = $aln->length;
314	0					0	my $ncodon = int( $alen / 3 );
315	0					0	my ( @seqs, @nm );
316	0					0	$aln->set_displayname_flat(1);
317	0					0	for my $s ( $aln->each_seq ) {
318	0					0	push @seqs, $s->seq();
319	0					0	push @nm, $s->id;
320							}
321	0					0	my ( @alns, $i );
322	0					0	while ( $count-- > 0 ) {
323	0					0	my @newseqs;
324	0					0	for ( $i = 0 ; $i < $ncodon ; $i++ ) {
325	0					0	my $index = int( rand($ncodon) );
326	0					0	my $seqpos = $index * 3;
327	0					0	my $c = 0;
328	0					0	for (@seqs) {
329	0					0	$newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
330							}
331							}
332	0					0	my $newaln = Bio::SimpleAlign->new();
333	0					0	my $i = 0;
334	0					0	for my $s (@newseqs) {
335	0					0	( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
336	0					0	$newaln->add_seq(
337							Bio::LocatableSeq->new(
338							-start => 1,
339							-end => length($tmp),
340							-display_id => $nm[ $i++ ],
341							-seq => $s
342							)
343							);
344							}
345	0					0	push @alns, $newaln;
346							}
347	0					0	return \@alns;
348							}
349
350							=head2 cat
351
352							Title : cat
353							Usage : $aln123 = cat($aln1, $aln2, $aln3)
354							Function : Concatenates alignment objects. Sequences are identified by id.
355							An error will be thrown if the sequence ids are not unique in the
356							first alignment. If any ids are not present or not unique in any
357							of the additional alignments then those sequences are omitted from
358							the concatenated alignment, and a warning is issued. An error will
359							be thrown if any of the alignments are not flush, since
360							concatenating such alignments is unlikely to make biological
361							sense.
362							Returns : A new Bio::SimpleAlign object
363							Args : A list of Bio::SimpleAlign objects
364
365							=cut
366
367							sub cat {
368	1			1	1	9	my ( $self, @aln ) = @_;
369	1	50				3	$self->throw("cat method called with no arguments") unless $self;
370	1					3	for ( $self, @aln ) {
371	2	50				8	$self->throw( $_->id . " is not a Bio::Align::AlignI object" )
372							unless $_->isa('Bio::Align::AlignI');
373	2	50				5	$self->throw( $_->id . " is not flush" ) unless $_->is_flush;
374							}
375	1					5	my $aln = $self->new;
376	1					3	$aln->id( $self->id );
377	1					3	$aln->annotation( $self->annotation );
378	1					2	my %unique;
379	1					7	SEQ: foreach my $seq ( $self->each_seq() ) {
380							throw( "ID: ", $seq->id, " is not unique in initial alignment." )
381	14	50				24	if exists $unique{ $seq->id };
382	14					21	$unique{ $seq->id } = 1;
383
384							# Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
385	14					22	my $new_seq = $seq->new(
386							-id => $seq->id,
387							-strand => $seq->strand,
388							-verbose => $self->verbose
389							);
390	14					30	$new_seq->seq( $seq->seq );
391	14					24	$new_seq->start( $seq->start );
392	14					23	$new_seq->end( $seq->end );
393	14	50				58	if ( $new_seq->isa('Bio::Seq::MetaI') ) {
394	0					0	for my $meta_name ( $seq->meta_names ) {
395	0					0	$new_seq->named_submeta( $meta_name, $new_seq->start,
396							$new_seq->end, $seq->named_meta($meta_name) );
397							}
398							}
399	14					22	for my $cat_aln (@aln) {
400	14					21	my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
401	14	50				22	if ( @cat_seq == 0 ) {
402	0					0	$self->warn( $seq->id
403							. " not found in alignment "
404							. $cat_aln->id
405							. ", skipping this sequence." );
406	0					0	next SEQ;
407							}
408	14	50				28	if ( @cat_seq > 1 ) {
409	0					0	$self->warn( $seq->id
410							. " found multiple times in alignment "
411							. $cat_aln->id
412							. ", skipping this sequence." );
413	0					0	next SEQ;
414							}
415	14					17	my $cat_seq = $cat_seq[0];
416	14					20	my $old_end = $new_seq->end;
417	14					17	$new_seq->seq( $new_seq->seq . $cat_seq->seq );
418
419							# Not sure if this is a sensible way to deal with end coordinates
420	14					23	$new_seq->end(
421							$new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
422
423	14	50				92	if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
		50
424	0	0				0	unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
425	0					0	my $meta_seq = Bio::Seq::Meta::Array->new;
426	0					0	$meta_seq->seq( $new_seq->seq );
427	0					0	$meta_seq->start( $new_seq->start );
428	0					0	$meta_seq->end( $new_seq->end );
429	0	0				0	if ( $new_seq->isa('Bio::Seq::Meta') ) {
430	0					0	for my $meta_name ( $new_seq->meta_names ) {
431	0					0	$meta_seq->named_submeta(
432							$meta_name,
433							$new_seq->start,
434							$old_end,
435							[
436							split(
437							//, $new_seq->named_meta($meta_name)
438							)
439							]
440							);
441							}
442							}
443	0					0	$new_seq = $meta_seq;
444							}
445	0					0	for my $meta_name ( $cat_seq->meta_names ) {
446	0					0	$new_seq->named_submeta( $meta_name, $old_end + 1,
447							$new_seq->end, $cat_seq->named_meta($meta_name) );
448							}
449							}
450							elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
451	0	0				0	if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
452	0					0	for my $meta_name ( $cat_seq->meta_names ) {
453	0					0	$new_seq->named_submeta( $meta_name, $old_end + 1,
454							$new_seq->end,
455							[ split( //, $cat_seq->named_meta($meta_name) ) ] );
456							}
457							}
458							else {
459	0	0				0	unless ( $new_seq->isa('Bio::Seq::Meta') ) {
460	0					0	my $meta_seq = Bio::Seq::Meta::Array->new;
461	0					0	$meta_seq->seq( $new_seq->seq );
462	0					0	$meta_seq->start( $new_seq->start );
463	0					0	$meta_seq->end( $new_seq->end );
464	0					0	$new_seq = $meta_seq;
465							}
466	0					0	for my $meta_name ( $cat_seq->meta_names ) {
467	0					0	$new_seq->named_submeta( $meta_name, $old_end + 1,
468							$new_seq->end, $cat_seq->named_meta($meta_name) );
469							}
470							}
471							}
472							}
473	14					28	$aln->add_seq($new_seq);
474							}
475	1					28	my $cons_meta = $self->consensus_meta;
476	1					2	my $new_cons_meta;
477	1	50				3	if ($cons_meta) {
478	0					0	$new_cons_meta = Bio::Seq::Meta->new();
479	0					0	for my $meta_name ( $cons_meta->meta_names ) {
480	0					0	$new_cons_meta->named_submeta( $meta_name, 1, $self->length,
481							$cons_meta->$meta_name );
482							}
483							}
484	1					4	my $end = $self->length;
485	1					2	for my $cat_aln (@aln) {
486	1					3	my $cat_cons_meta = $cat_aln->consensus_meta;
487	1	50				3	if ($cat_cons_meta) {
488	0	0				0	$new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
489	0					0	for my $meta_name ( $cat_cons_meta->meta_names ) {
490	0					0	$new_cons_meta->named_submeta(
491							$meta_name, $end + 1,
492							$end + $cat_aln->length,
493							$cat_cons_meta->$meta_name
494							);
495							}
496							}
497	1					5	$end += $cat_aln->length;
498							}
499	1	50				4	$aln->consensus_meta($new_cons_meta) if $new_cons_meta;
500	1					10	return $aln;
501							}
502
503
504							=head2 most_common_sequences
505
506							Title : most_common_sequences
507							Usage : @common = most_common_sequences ($align, $case_sensitivity)
508							Function : Returns an array of the sequences that appear most often in the
509							alignment (although this probably makes more sense when there is
510							only a single most common sequence). Sequences are compared after
511							removing any "-" (gap characters), and ambiguous units (e.g., R
512							for purines) are only compared to themselves. The returned
513							sequence is also missing the "-" since they are not actually
514							part of the sequence.
515							Returns : Array of text strings.
516							Arguments : Optional argument defining whether the comparison between sequences
517							to find the most common should be case sensitive. Defaults to
518							false, i.e, not case sensitive.
519
520							=cut
521
522							sub most_common_sequences {
523	0	0		0	1		my $align = shift
524							or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
525	0						my $case_sensitive = shift; # defaults to false (we get undef if nothing)
526
527							## We keep track of the max on this loop. Saves us having to
528							## traverse the hash table later to find the maximum value.
529	0						my $max = 0;
530	0						my %counts;
531	0						foreach ($align->each_seq) {
532	0						(my $seq = $_->seq) =~ tr/-//d;
533	0	0					$seq = uc ($seq) unless $case_sensitive;
534	0	0					$max++ if (++$counts{$seq} > $max);
535							}
536	0						my @common = grep ($counts{$_} == $max, keys %counts);
537	0						return @common;
538							}
539
540							1;