File Coverage

Bio/Align/Utilities.pm
Criterion Covered Total %
statement 120 192 62.5
branch 20 52 38.4
condition 9 25 36.0
subroutine 10 12 83.3
pod 6 6 100.0
total 165 287 57.4


line stmt bran cond sub pod time code
1             package Bio::Align::Utilities;
2 3     3   1703 use strict;
  3         4  
  3         74  
3 3     3   12 use warnings;
  3         3  
  3         72  
4 3     3   10 use Carp;
  3         4  
  3         153  
5 3     3   514 use Bio::Root::Version;
  3         22  
  3         15  
6              
7 3     3   103 use Exporter 'import';
  3         4  
  3         203  
8             our @EXPORT_OK = qw(
9             aa_to_dna_aln
10             bootstrap_replicates
11             cat
12             bootstrap_replicates_codons
13             dna_to_aa_aln
14             most_common_sequences
15             );
16             our %EXPORT_TAGS = (all => \@EXPORT_OK);
17              
18             #
19             # BioPerl module for Bio::Align::Utilities
20             #
21             # Please direct questions and support issues to <bioperl-l@bioperl.org>
22             #
23             # Cared for by Jason Stajich and Brian Osborne
24             #
25             # Copyright Jason Stajich
26             #
27             # You may distribute this module under the same terms as perl itself
28              
29             # POD documentation - main docs before the code
30              
31             =head1 NAME
32              
33             Bio::Align::Utilities - A collection of utilities regarding converting
34             and manipulating alignment objects
35              
36             =head1 SYNOPSIS
37              
38             use Bio::Align::Utilities qw(:all);
39              
40             # Even if the protein alignments are local make sure the start/end
41             # stored in the LocatableSeq objects are to the full length protein.
42             # The coding sequence that is passed in should still be the full
43             # length CDS as the nt alignment will be generated.
44             # %dnaseqs is a hash of CDS sequences (spliced)
45             my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
46              
47             # The reverse, which is simpler. The input alignment has to be
48             # translate-able, with gap lengths and an overall length divisible by 3
49             my $aa_aln = dna_to_aa_aln($dna_aln);
50              
51             # Generate bootstraps
52             my $replicates = bootstrap_replicates($aln,$count);
53              
54             =head1 DESCRIPTION
55              
56             This module contains utility methods for manipulating sequence
57             alignments (L<Bio::Align::AlignI>) objects.
58              
59             The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
60             program by Bill Pearson available at
61             ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
62             is a pure-Perl implementation, but just to mention that if anything
63             seems odd you can check the alignments generated against Bill's
64             program.
65              
66             =head1 FEEDBACK
67              
68             =head2 Mailing Lists
69              
70             User feedback is an integral part of the evolution of this and other
71             Bioperl modules. Send your comments and suggestions preferably to
72             the Bioperl mailing list. Your participation is much appreciated.
73              
74             bioperl-l@bioperl.org - General discussion
75             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76              
77             =head2 Support
78              
79             Please direct usage questions or support issues to the mailing list:
80              
81             I<bioperl-l@bioperl.org>
82              
83             rather than to the module maintainer directly. Many experienced and
84             responsive experts will be able to look at the problem and quickly
85             address it. Please include a thorough description of the problem
86             with code and data examples if at all possible.
87              
88             =head2 Reporting Bugs
89              
90             Report bugs to the Bioperl bug tracking system to help us keep track
91             of the bugs and their resolution. Bug reports can be submitted via the
92             web:
93              
94             https://github.com/bioperl/bioperl-live/issues
95              
96             =head1 AUTHOR - Jason Stajich
97              
98             Email jason-at-bioperl-dot-org
99              
100             =head1 APPENDIX
101              
102             The rest of the documentation details each of the object methods.
103             Internal methods are usually preceded with a _
104              
105             =cut
106              
107 3     3   13 use constant CODONSIZE => 3;
  3         3  
  3         5200  
108             our $GAP = '-';
109             our $CODONGAP = $GAP x CODONSIZE;
110              
111             =head2 aa_to_dna_aln
112              
113             Title : aa_to_dna_aln
114             Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
115             Function: Will convert an AA alignment to DNA space given the
116             corresponding DNA sequences. Note that this method expects
117             the DNA sequences to be in frame +1 (GFF frame 0) as it will
118             start to project into coordinates starting at the first base of
119             the DNA sequence, if this alignment represents a different
120             frame for the cDNA you will need to edit the DNA sequences
121             to remove the 1st or 2nd bases (and revcom if things should be).
122             Returns : Bio::Align::AlignI object
123             Args : 2 arguments, the alignment and a hashref.
124             Alignment is a Bio::Align::AlignI of amino acid sequences.
125             The hash reference should have keys which are
126             the display_ids for the aa
127             sequences in the alignment and the values are a
128             Bio::PrimarySeqI object for the corresponding
129             spliced cDNA sequence.
130              
131             See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
132              
133             =cut
134              
135             sub aa_to_dna_aln {
136 2     2 1 19 my ( $aln, $dnaseqs ) = @_;
137 2 50 33     28 unless ( defined $aln
      33        
138             && ref($aln)
139             && $aln->isa('Bio::Align::AlignI') )
140             {
141 0         0 croak(
142             'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
143             );
144             }
145 2         10 my $alnlen = $aln->length;
146 2         11 my $dnaalign = Bio::SimpleAlign->new();
147 2         12 $aln->map_chars( '\.', $GAP );
148              
149 2         6 foreach my $seq ( $aln->each_seq ) {
150 17         42 my $aa_seqstr = $seq->seq();
151 17         34 my $pepid = $seq->display_id;
152 17   33     52 my $dnaseq = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id );
153 17         38 my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
154 17         65 $dnaseq = $dnaseq->seq();
155 17         45 my $dnalen = $dnaseqs->{$pepid}->length;
156 17   33     38 my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137)
157 17         24 my $nt_seqstr;
158 17         17 my $j = 0;
159 17         35 for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
160 5552         5420 my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
161 5552 100 66     9776 if ( $char eq $GAP || $j >= $dnalen ) {
162 765         965 $nt_seqstr .= $CODONGAP;
163             }
164             else {
165 4787         4654 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
166 4787         5912 $j += CODONSIZE;
167             }
168             }
169 17         32 $nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
170              
171 17         58 my $newdna = Bio::LocatableSeq->new(
172             -display_id => $dnaid,
173             -alphabet => 'dna',
174             -start => $start_offset + 1,
175             -end => ( $seq->end * CODONSIZE ),
176             -strand => 1,
177             -seq => $nt_seqstr
178             );
179 17         60 $dnaalign->add_seq($newdna);
180             }
181 2         14 return $dnaalign;
182             }
183              
184             =head2 dna_to_aa_aln
185              
186             Title : dna_to_aa_aln
187             Usage : my $aa_aln = dna_to_aa_aln($dna_aln);
188             Function: Convert a DNA alignment to an amino acid alignment where
189             the length of all alignment strings and the lengths of any
190             gaps must be divisible by 3
191             Returns : Bio::Align::AlignI object
192             Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences
193              
194             See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
195              
196             =cut
197              
198             sub dna_to_aa_aln {
199 1     1 1 15 my $dna_aln = shift;
200 1 50 33     16 unless ( defined $dna_aln
      33        
201             && ref($dna_aln)
202             && $dna_aln->isa('Bio::Align::AlignI') ) {
203 0         0 croak(
204             'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
205             );
206             }
207 1         8 my $codon_table = Bio::Tools::CodonTable->new;
208 1         9 my $aa_aln = Bio::SimpleAlign->new;
209              
210 1         10 for my $seq ( $dna_aln->each_seq ) {
211 14         41 my ($aa_str, $aa_len);
212 14         53 my @aln_str = split '', $seq->seq;
213 14 50       76 croak("All lines in the alignment must have lengths divisible by 3")
214             if ( scalar(@aln_str) % CODONSIZE );
215              
216 14         44 while ( @aln_str ) {
217 5516         11129 my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
218              
219 5516 100       17733 if ( $triplet =~ /^[GATC]+$/i ) {
    50          
220 4754         9798 $aa_str .= $codon_table->translate($triplet);
221 4754         10776 $aa_len++;
222             }
223             elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
224 762         1835 $aa_str .= $GAP;
225             }
226             else {
227 0         0 croak("The triplet '$triplet' is neither a valid codon nor all gaps");
228             }
229             }
230 14         137 my $new_aa = Bio::LocatableSeq->new(
231             -display_id => $seq->display_id,
232             -alphabet => 'protein',
233             -start => 1,
234             -end => $aa_len,
235             -strand => 1,
236             -seq => $aa_str
237             );
238              
239 14         111 $aa_aln->add_seq($new_aa);
240             }
241              
242 1         9 $aa_aln;
243             }
244              
245             =head2 bootstrap_replicates
246              
247             Title : bootstrap_replicates
248             Usage : my $alns = &bootstrap_replicates($aln,100);
249             Function: Generate a pseudo-replicate of the data by randomly
250             sampling, with replacement, the columns from an alignment for
251             the non-parametric bootstrap.
252             Returns : Arrayref of L<Bio::SimpleAlign> objects
253             Args : L<Bio::SimpleAlign> object
254             Number of replicates to generate
255              
256             =cut
257              
258             sub bootstrap_replicates {
259 3     3 1 15 my ( $aln, $count ) = @_;
260 3   50     12 $count ||= 1;
261 3         15 my $alen = $aln->length;
262 3         6 my ( @seqs, @nm );
263 3         16 $aln->set_displayname_flat(1);
264 3         9 for my $s ( $aln->each_seq ) {
265 31         44 push @seqs, $s->seq();
266 31         40 push @nm, $s->id;
267             }
268 3         15 my ( @alns, $i );
269 3         16 while ( $count-- > 0 ) {
270 23         34 my @newseqs;
271 23         61 for ( $i = 0 ; $i < $alen ; $i++ ) {
272 7988         9835 my $index = int( rand($alen) );
273 7988         7261 my $c = 0;
274 7988         8370 for (@seqs) {
275 110644         130344 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
276             }
277             }
278 23         181 my $newaln = Bio::SimpleAlign->new();
279 23         36 my $i = 0;
280 23         49 for my $s (@newseqs) {
281 289         6274 ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
282 289         1334 $newaln->add_seq(
283             Bio::LocatableSeq->new(
284             -start => 1,
285             -end => length($tmp),
286             -display_id => $nm[ $i++ ],
287             -seq => $s
288             )
289             );
290             }
291 23         129 push @alns, $newaln;
292             }
293 3         24 return \@alns;
294             }
295              
296             =head2 bootstrap_replicates_codons
297              
298             Title : bootstrap_replicates_codons
299             Usage : my $alns = &bootstrap_replicates_codons($aln,100);
300             Function: Generate a pseudo-replicate of the data by randomly
301             sampling, with replacement, the columns from a codon alignment for
302             the non-parametric bootstrap. The alignment is assumed to start on
303             the first position of a codon.
304             Returns : Arrayref of L<Bio::SimpleAlign> objects
305             Args : L<Bio::SimpleAlign> object
306             Number of replicates to generate
307              
308             =cut
309              
310             sub bootstrap_replicates_codons {
311 0     0 1 0 my ( $aln, $count ) = @_;
312 0   0     0 $count ||= 1;
313 0         0 my $alen = $aln->length;
314 0         0 my $ncodon = int( $alen / 3 );
315 0         0 my ( @seqs, @nm );
316 0         0 $aln->set_displayname_flat(1);
317 0         0 for my $s ( $aln->each_seq ) {
318 0         0 push @seqs, $s->seq();
319 0         0 push @nm, $s->id;
320             }
321 0         0 my ( @alns, $i );
322 0         0 while ( $count-- > 0 ) {
323 0         0 my @newseqs;
324 0         0 for ( $i = 0 ; $i < $ncodon ; $i++ ) {
325 0         0 my $index = int( rand($ncodon) );
326 0         0 my $seqpos = $index * 3;
327 0         0 my $c = 0;
328 0         0 for (@seqs) {
329 0         0 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
330             }
331             }
332 0         0 my $newaln = Bio::SimpleAlign->new();
333 0         0 my $i = 0;
334 0         0 for my $s (@newseqs) {
335 0         0 ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
336 0         0 $newaln->add_seq(
337             Bio::LocatableSeq->new(
338             -start => 1,
339             -end => length($tmp),
340             -display_id => $nm[ $i++ ],
341             -seq => $s
342             )
343             );
344             }
345 0         0 push @alns, $newaln;
346             }
347 0         0 return \@alns;
348             }
349              
350             =head2 cat
351              
352             Title : cat
353             Usage : $aln123 = cat($aln1, $aln2, $aln3)
354             Function : Concatenates alignment objects. Sequences are identified by id.
355             An error will be thrown if the sequence ids are not unique in the
356             first alignment. If any ids are not present or not unique in any
357             of the additional alignments then those sequences are omitted from
358             the concatenated alignment, and a warning is issued. An error will
359             be thrown if any of the alignments are not flush, since
360             concatenating such alignments is unlikely to make biological
361             sense.
362             Returns : A new Bio::SimpleAlign object
363             Args : A list of Bio::SimpleAlign objects
364              
365             =cut
366              
367             sub cat {
368 1     1 1 9 my ( $self, @aln ) = @_;
369 1 50       3 $self->throw("cat method called with no arguments") unless $self;
370 1         3 for ( $self, @aln ) {
371 2 50       8 $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
372             unless $_->isa('Bio::Align::AlignI');
373 2 50       5 $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
374             }
375 1         5 my $aln = $self->new;
376 1         3 $aln->id( $self->id );
377 1         3 $aln->annotation( $self->annotation );
378 1         2 my %unique;
379 1         7 SEQ: foreach my $seq ( $self->each_seq() ) {
380             throw( "ID: ", $seq->id, " is not unique in initial alignment." )
381 14 50       24 if exists $unique{ $seq->id };
382 14         21 $unique{ $seq->id } = 1;
383              
384             # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
385 14         22 my $new_seq = $seq->new(
386             -id => $seq->id,
387             -strand => $seq->strand,
388             -verbose => $self->verbose
389             );
390 14         30 $new_seq->seq( $seq->seq );
391 14         24 $new_seq->start( $seq->start );
392 14         23 $new_seq->end( $seq->end );
393 14 50       58 if ( $new_seq->isa('Bio::Seq::MetaI') ) {
394 0         0 for my $meta_name ( $seq->meta_names ) {
395 0         0 $new_seq->named_submeta( $meta_name, $new_seq->start,
396             $new_seq->end, $seq->named_meta($meta_name) );
397             }
398             }
399 14         22 for my $cat_aln (@aln) {
400 14         21 my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
401 14 50       22 if ( @cat_seq == 0 ) {
402 0         0 $self->warn( $seq->id
403             . " not found in alignment "
404             . $cat_aln->id
405             . ", skipping this sequence." );
406 0         0 next SEQ;
407             }
408 14 50       28 if ( @cat_seq > 1 ) {
409 0         0 $self->warn( $seq->id
410             . " found multiple times in alignment "
411             . $cat_aln->id
412             . ", skipping this sequence." );
413 0         0 next SEQ;
414             }
415 14         17 my $cat_seq = $cat_seq[0];
416 14         20 my $old_end = $new_seq->end;
417 14         17 $new_seq->seq( $new_seq->seq . $cat_seq->seq );
418              
419             # Not sure if this is a sensible way to deal with end coordinates
420 14         23 $new_seq->end(
421             $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
422              
423 14 50       92 if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
    50          
424 0 0       0 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
425 0         0 my $meta_seq = Bio::Seq::Meta::Array->new;
426 0         0 $meta_seq->seq( $new_seq->seq );
427 0         0 $meta_seq->start( $new_seq->start );
428 0         0 $meta_seq->end( $new_seq->end );
429 0 0       0 if ( $new_seq->isa('Bio::Seq::Meta') ) {
430 0         0 for my $meta_name ( $new_seq->meta_names ) {
431 0         0 $meta_seq->named_submeta(
432             $meta_name,
433             $new_seq->start,
434             $old_end,
435             [
436             split(
437             //, $new_seq->named_meta($meta_name)
438             )
439             ]
440             );
441             }
442             }
443 0         0 $new_seq = $meta_seq;
444             }
445 0         0 for my $meta_name ( $cat_seq->meta_names ) {
446 0         0 $new_seq->named_submeta( $meta_name, $old_end + 1,
447             $new_seq->end, $cat_seq->named_meta($meta_name) );
448             }
449             }
450             elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
451 0 0       0 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
452 0         0 for my $meta_name ( $cat_seq->meta_names ) {
453 0         0 $new_seq->named_submeta( $meta_name, $old_end + 1,
454             $new_seq->end,
455             [ split( //, $cat_seq->named_meta($meta_name) ) ] );
456             }
457             }
458             else {
459 0 0       0 unless ( $new_seq->isa('Bio::Seq::Meta') ) {
460 0         0 my $meta_seq = Bio::Seq::Meta::Array->new;
461 0         0 $meta_seq->seq( $new_seq->seq );
462 0         0 $meta_seq->start( $new_seq->start );
463 0         0 $meta_seq->end( $new_seq->end );
464 0         0 $new_seq = $meta_seq;
465             }
466 0         0 for my $meta_name ( $cat_seq->meta_names ) {
467 0         0 $new_seq->named_submeta( $meta_name, $old_end + 1,
468             $new_seq->end, $cat_seq->named_meta($meta_name) );
469             }
470             }
471             }
472             }
473 14         28 $aln->add_seq($new_seq);
474             }
475 1         28 my $cons_meta = $self->consensus_meta;
476 1         2 my $new_cons_meta;
477 1 50       3 if ($cons_meta) {
478 0         0 $new_cons_meta = Bio::Seq::Meta->new();
479 0         0 for my $meta_name ( $cons_meta->meta_names ) {
480 0         0 $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
481             $cons_meta->$meta_name );
482             }
483             }
484 1         4 my $end = $self->length;
485 1         2 for my $cat_aln (@aln) {
486 1         3 my $cat_cons_meta = $cat_aln->consensus_meta;
487 1 50       3 if ($cat_cons_meta) {
488 0 0       0 $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
489 0         0 for my $meta_name ( $cat_cons_meta->meta_names ) {
490 0         0 $new_cons_meta->named_submeta(
491             $meta_name, $end + 1,
492             $end + $cat_aln->length,
493             $cat_cons_meta->$meta_name
494             );
495             }
496             }
497 1         5 $end += $cat_aln->length;
498             }
499 1 50       4 $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
500 1         10 return $aln;
501             }
502              
503              
504             =head2 most_common_sequences
505              
506             Title : most_common_sequences
507             Usage : @common = most_common_sequences ($align, $case_sensitivity)
508             Function : Returns an array of the sequences that appear most often in the
509             alignment (although this probably makes more sense when there is
510             only a single most common sequence). Sequences are compared after
511             removing any "-" (gap characters), and ambiguous units (e.g., R
512             for purines) are only compared to themselves. The returned
513             sequences also omit the "-" characters, since gaps are not
514             actually part of the sequence.
515             Returns : Array of text strings.
516             Arguments : Optional argument defining whether the comparison between sequences
517             to find the most common should be case sensitive. Defaults to
518             false, i.e., not case sensitive.
519              
520             =cut
521              
522             sub most_common_sequences {
523 0 0   0 1   my $align = shift
524             or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
525 0           my $case_sensitive = shift; # defaults to false (we get undef if nothing)
526              
527             ## We keep track of the max on this loop. Saves us having to
528             ## traverse the hash table later to find the maximum value.
529 0           my $max = 0;
530 0           my %counts;
531 0           foreach ($align->each_seq) {
532 0           (my $seq = $_->seq) =~ tr/-//d;
533 0 0         $seq = uc ($seq) unless $case_sensitive;
534 0 0         $max++ if (++$counts{$seq} > $max);
535             }
536 0           my @common = grep ($counts{$_} == $max, keys %counts);
537 0           return @common;
538             }
539              
540             1;