File Coverage

Bio/SimpleAlign.pm
Criterion Covered Total %
statement 1013 1175 86.2
branch 315 456 69.0
condition 135 218 61.9
subroutine 80 96 83.3
pod 70 78 89.7
total 1613 2023 79.7


line stmt bran cond sub pod time code
1             package Bio::SimpleAlign;
2 48     48   1751 use strict;
  48         58  
  48         1204  
3 48     48   195 use warnings;
  48         50  
  48         1154  
4 48     48   2893 use Bio::LocatableSeq; # uses Seq's as list
  48         66  
  48         866  
5 48     48   4520 use Bio::Seq;
  48         51  
  48         809  
6 48     48   11583 use Bio::SeqFeature::Generic;
  48         953  
  48         1371  
7              
8 48     48   218 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
  48         62  
  48         336  
9              
10             # BioPerl module for SimpleAlign
11             #
12             # Please direct questions and support issues to
13             #
14             # Cared for by Heikki Lehvaslaiho
15             #
16             # Copyright Ewan Birney
17             #
18             # You may distribute this module under the same terms as perl itself
19              
20             # POD documentation - main docs before the code
21             #
22             # History:
23             # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
24             # May 2001 major rewrite - Heikki Lehvaslaiho
25              
26             =head1 NAME
27              
28             Bio::SimpleAlign - Multiple alignments held as a set of sequences
29              
30             =head1 SYNOPSIS
31              
32             # Use Bio::AlignIO to read in the alignment
33             $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
34             $aln = $str->next_aln();
35              
36             # Describe
37             print $aln->length;
38             print $aln->num_residues;
39             print $aln->is_flush;
40             print $aln->num_sequences;
41             print $aln->score;
42             print $aln->percentage_identity;
43             print $aln->consensus_string(50);
44              
45             # Find the position in the alignment for a sequence location
46             $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
47              
48             # Extract sequences and check values for the alignment column $pos
49             foreach $seq ($aln->each_seq) {
50             $res = $seq->subseq($pos, $pos);
51             $count{$res}++;
52             }
53             foreach $res (keys %count) {
54             printf "Res: %s Count: %2d\n", $res, $count{$res};
55             }
56              
57             # Manipulate
58             $aln->remove_seq($seq);
59             $mini_aln = $aln->slice(20,30); # get a block of columns
60             $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
61             $new_aln = $aln->remove_columns([20,30]); # remove by position
62             $new_aln = $aln->remove_columns(['mismatch']); # remove by property
63              
64             # Analyze
65             $str = $aln->consensus_string($threshold_percent);
66             $str = $aln->match_line();
67             $str = $aln->cigar_line();
68             $id = $aln->percentage_identity;
69              
70             # See the module documentation for details and more methods.
71              
72             =head1 DESCRIPTION
73              
74             SimpleAlign is an object that handles a multiple sequence alignment
75             (MSA). It is very permissive of types (it does not insist on sequences
76             being all same length, for example). Think of it as a set of sequences
77             with a whole series of built-in manipulations and methods for reading and
78             writing alignments.
79              
80             SimpleAlign uses L, a subclass of L,
81             to store its sequences. These are subsequences with a start and end
82             positions in the parent reference sequence. Each sequence in the
83             SimpleAlign object is a Bio::LocatableSeq.
84              
85             SimpleAlign expects the combination of name, start, and end for a
86             given sequence to be unique in the alignment, and this is the key for the
87             internal hashes (name, start, end are abbreviated C in the code).
88             However, in some cases people do not want the name/start-end to be displayed:
89             either multiple names in an alignment or names specific to the alignment
90             (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
91             C, and generally is what is used to print out the
92             alignment. They default to name/start-end.
93              
94             The SimpleAlign Module is derived from the Align module by Ewan Birney.
95              
96             =head1 FEEDBACK
97              
98             =head2 Mailing Lists
99              
100             User feedback is an integral part of the evolution of this and other
101             Bioperl modules. Send your comments and suggestions preferably to one
102             of the Bioperl mailing lists. Your participation is much appreciated.
103              
104             bioperl-l@bioperl.org - General discussion
105             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
106              
107             =head2 Support
108              
109             Please direct usage questions or support issues to the mailing list:
110              
111             I
112              
113             rather than to the module maintainer directly. Many experienced and
114             reponsive experts will be able look at the problem and quickly
115             address it. Please include a thorough description of the problem
116             with code and data examples if at all possible.
117              
118             =head2 Reporting Bugs
119              
120             Report bugs to the Bioperl bug tracking system to help us keep track
121             the bugs and their resolution. Bug reports can be submitted via the
122             web:
123              
124             https://github.com/bioperl/bioperl-live/issues
125              
126             =head1 AUTHOR
127              
128             Ewan Birney, birney@ebi.ac.uk
129              
130             =head1 CONTRIBUTORS
131              
132             Allen Day, allenday-at-ucla.edu,
133             Richard Adams, Richard.Adams-at-ed.ac.uk,
134             David J. Evans, David.Evans-at-vir.gla.ac.uk,
135             Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
136             Allen Smith, allens-at-cpan.org,
137             Jason Stajich, jason-at-bioperl.org,
138             Anthony Underwood, aunderwood-at-phls.org.uk,
139             Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
140             Brian Osborne, bosborne at alum.mit.edu
141             Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
142             Hongyu Zhang, forward at hongyu.org
143             Jay Hannah, jay at jays.net
144             Alexandr Bezginov, albezg at gmail.com
145              
146             =head1 SEE ALSO
147              
148             L
149              
150             =head1 APPENDIX
151              
152             The rest of the documentation details each of the object
153             methods. Internal methods are usually preceded with a _
154              
155             =cut
156              
157             ## This data should probably be in a more centralized module...
158             ## it is taken from Clustalw documentation.
159             ## These are all the positively scoring groups that occur in the
160             ## Gonnet Pam250 matrix. The strong and weak groups are
161             ## defined as strong score >0.5 and weak score =<0.5 respectively.
162             our %CONSERVATION_GROUPS = (
163             'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
164             'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
165             );
166              
167              
168             =head2 new
169              
170             Title : new
171             Usage : my $aln = Bio::SimpleAlign->new();
172             Function : Creates a new simple align object
173             Returns : Bio::SimpleAlign
174             Args : -source => string representing the source program
175             where this alignment came from
176             -annotation => Bio::AnnotationCollectionI
177             -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
178             -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
179             -consensus => consensus string
180             -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
181              
182             =cut
183              
184              
185             sub new {
186 291     291 1 513 my($class,@args) = @_;
187              
188 291         860 my $self = $class->SUPER::new(@args);
189              
190 291         1360 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
191             SOURCE
192             SCORE
193             ID
194             ACCESSION
195             DESCRIPTION
196             SEQS
197             FEATURES
198             ANNOTATION
199             SEQ_ANNOTATION
200             CONSENSUS
201             CONSENSUS_META
202             )], @args);
203 291 100       980 $src && $self->source($src);
204 291 100       516 defined $score && $self->score($score);
205             # we need to set up internal hashs first!
206              
207 291         692 $self->{'_seq'} = {};
208 291         415 $self->{'_order'} = {};
209 291         369 $self->{'_start_end_lists'} = {};
210 291         392 $self->{'_dis_name'} = {};
211 291         371 $self->{'_id'} = 'NoName';
212             # maybe we should automatically read in from args. Hmmm...
213 291 100       511 $id && $self->id($id);
214 291 100       477 $acc && $self->accession($acc);
215 291 100       465 $desc && $self->description($desc);
216 291 100       483 $coll && $self->annotation($coll);
217             # sequence annotation is layered into a provided annotation collection (or dies)
218 291 50       457 if ($sa) {
219 0 0       0 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
220             "with a sequence annotation collection")
221             if !$coll;
222 0         0 $coll->add_Annotation('seq_annotation', $sa);
223             }
224 291 100 66     711 if ($feats && ref $feats eq 'ARRAY') {
225 1         4 for my $feat (@$feats) {
226 6         8 $self->add_SeqFeature($feat);
227             }
228             }
229 291 50       489 $con && $self->consensus($con);
230 291 100       485 $cmeta && $self->consensus_meta($cmeta);
231             # assumes these are in correct alignment order
232 291 100 66     771 if ($seqs && ref($seqs) eq 'ARRAY') {
233 9         19 for my $seq (@$seqs) {
234 168         187 $self->add_seq($seq);
235             }
236             }
237              
238 291         547 return $self; # success - we hope!
239             }
240              
241             =head1 Modifier methods
242              
243             These methods modify the MSA by adding, removing or shuffling complete
244             sequences.
245              
246             =head2 add_seq
247              
248             Title : add_seq
249             Usage : $myalign->add_seq($newseq);
250             $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
251             Function : Adds another sequence to the alignment. *Does not* align
252             it - just adds it to the hashes.
253             If -ORDER is specified, the sequence is inserted at the
254             the position spec'd by -ORDER, and existing sequences
255             are pushed down the storage array.
256             Returns : nothing
257             Args : A Bio::LocatableSeq object
258             Positive integer for the sequence position (optional)
259              
260             See L for more information
261              
262             =cut
263              
264             sub addSeq {
265 0     0 0 0 my $self = shift;
266 0         0 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
267 0         0 $self->add_seq(@_);
268             }
269              
270             sub add_seq {
271 2420     2420 1 2400 my $self = shift;
272 2420         2632 my @args = @_;
273 2420         5593 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
274 2420         2430 my ($name,$id,$start,$end);
275              
276 2420 50       3507 unless ($seq) {
277 0         0 $self->throw("LocatableSeq argument required");
278             }
279 2420 50 33     8643 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
280 0         0 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
281             }
282 2420 100       3482 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
  2417         3597  
283 2420         1922 $order--; # jay's patch (user-specified order is 1-origin)
284            
285 2420 100       3329 if ($order < 0) {
286 1         6 $self->throw("User-specified value for ORDER must be >= 1");
287             }
288              
289 2419   66     3545 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
290              
291             # build the symbol list for this sequence,
292             # will prune out the gap and missing/match chars
293             # when actually asked for the symbol list in the
294             # symbol_chars
295             # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296              
297 2419         3891 $name = $seq->get_nse;
298              
299 2419 50       3737 if( $self->{'_seq'}->{$name} ) {
300 0 0       0 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
301             }
302             else {
303 2419         6459 $self->debug( "Assigning $name to $order\n");
304              
305 2419         2629 my $ordh = $self->{'_order'};
306 2419 50       3589 if ($ordh->{$order}) {
307             # make space to insert
308             # $c->() returns (in reverse order) the first subsequence
309             # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
310             # (3,2,1), and $c->(2,4,5) returns (2).
311 0         0 my $c;
312 0 0   0   0 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
  0         0  
313             map {
314 0         0 $ordh->{$_+1} = $ordh->{$_}
315 0         0 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
  0         0  
  0         0  
  0         0  
316              
317             }
318 2419         3383 $ordh->{$order} = $name;
319              
320 2419 100       4041 unless( exists( $self->{'_start_end_lists'}->{$id})) {
321 2400         3950 $self->{'_start_end_lists'}->{$id} = [];
322             }
323 2419         1852 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
  2419         4161  
324             }
325              
326 2419         6869 $self->{'_seq'}->{$name} = $seq;
327              
328             }
329              
330              
331             =head2 remove_seq
332              
333             Title : remove_seq
334             Usage : $aln->remove_seq($seq);
335             Function : Removes a single sequence from an alignment
336             Returns :
337             Argument : a Bio::LocatableSeq object
338              
339             =cut
340              
341             sub removeSeq {
342 0     0 0 0 my $self = shift;
343 0         0 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
344 0         0 $self->remove_seq(@_);
345             }
346              
347             sub remove_seq {
348 13     13 1 11 my $self = shift;
349 13         10 my $seq = shift;
350 13         11 my ($name,$id);
351              
352 13 50 33     73 $self->throw("Need Bio::Locatable seq argument ")
353             unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
354              
355 13         25 $id = $seq->id();
356 13         26 $name = $seq->get_nse;
357              
358 13 50       24 if( !exists $self->{'_seq'}->{$name} ) {
359 0         0 $self->throw("Sequence $name does not exist in the alignment to remove!");
360             }
361              
362 13         16 delete $self->{'_seq'}->{$name};
363              
364             # we need to remove this seq from the start_end_lists hash
365              
366 13 50       25 if (exists $self->{'_start_end_lists'}->{$id}) {
367             # we need to find the sequence in the array.
368              
369 13         9 my ($i, $found);;
370 13         13 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
  13         25  
371 13 50       8 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
  13         36  
372 13         10 $found = 1;
373 13         12 last;
374             }
375             }
376 13 50       14 if ($found) {
377 13         11 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
  13         20  
378             }
379             else {
380 0         0 $self->throw("Could not find the sequence to remoce from the start-end list");
381             }
382             }
383             else {
384 0         0 $self->throw("There is no seq list for the name $id");
385             }
386             # we need to shift order hash
387 13         11 my %rev_order = reverse %{$self->{'_order'}};
  13         74  
388 13         19 my $no = $rev_order{$name};
389 13         27 my $num_sequences = $self->num_sequences;
390 13         22 for (; $no < $num_sequences; $no++) {
391 94         145 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
392             }
393 13         15 delete $self->{'_order'}->{$no};
394 13         31 return 1;
395             }
396              
397              
398             =head2 purge
399              
400             Title : purge
401             Usage : $aln->purge(0.7);
402             Function: Removes sequences above given sequence similarity
403             This function will grind on large alignments. Beware!
404             Example :
405             Returns : An array of the removed sequences
406             Args : float, threshold for similarity
407              
408             =cut
409              
410             sub purge {
411 1     1 1 2 my ($self,$perc) = @_;
412 1         2 my (%duplicate, @dups);
413              
414 1         2 my @seqs = $self->each_seq();
415              
416 1         5 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
417 15         13 my $seq = $seqs[$i];
418              
419             #skip if already in duplicate hash
420 15 100       28 next if exists $duplicate{$seq->display_id} ;
421 4         11 my $one = $seq->seq();
422              
423 4         69 my @one = split '', $one; #split to get 1aa per array element
424              
425 4         13 for (my $j=$i+1;$j < @seqs;$j++) {
426 28         33 my $seq2 = $seqs[$j];
427              
428             #skip if already in duplicate hash
429 28 100       71 next if exists $duplicate{$seq2->display_id} ;
430              
431 27         48 my $two = $seq2->seq();
432 27         449 my @two = split '', $two;
433              
434 27         23 my $count = 0;
435 27         16 my $res = 0;
436 27         46 for (my $k=0;$k<@one;$k++) {
437 6534 100 66     31424 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
      66        
      100        
438             $one[$k] eq $two[$k]) {
439 4329         2521 $count++;
440             }
441 6534 100 66     38589 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
      66        
      66        
      66        
442             $two[$k] ne '.' && $two[$k] ne '-' ) {
443 6320         7758 $res++;
444             }
445             }
446              
447 27         28 my $ratio = 0;
448 27 50       50 $ratio = $count/$res unless $res == 0;
449              
450             # if above threshold put in duplicate hash and push onto
451             # duplicate array for returning to get_unique
452 27 100       205 if ( $ratio > $perc ) {
453 12 50       54 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
454 12         32 $duplicate{$seq2->display_id} = 1;
455 12         148 push @dups, $seq2;
456             }
457             }
458             }
459 1         3 foreach my $seq (@dups) {
460 12         17 $self->remove_seq($seq);
461             }
462 1         13 return @dups;
463             }
464              
465             =head2 sort_alphabetically
466              
467             Title : sort_alphabetically
468             Usage : $ali->sort_alphabetically
469             Function : Changes the order of the alignment to alphabetical on name
470             followed by numerical by number.
471             Returns :
472             Argument :
473              
474             =cut
475              
476             sub sort_alphabetically {
477 1     1 1 3 my $self = shift;
478 1         1 my ($seq,$nse,@arr,%hash,$count);
479              
480 1         3 foreach $seq ( $self->each_seq() ) {
481 3         7 $nse = $seq->get_nse;
482 3         4 $hash{$nse} = $seq;
483             }
484              
485 1         2 $count = 0;
486              
487 1         1 %{$self->{'_order'}} = (); # reset the hash;
  1         3  
488              
489 1         5 foreach $nse ( sort _alpha_startend keys %hash) {
490 3         5 $self->{'_order'}->{$count} = $nse;
491              
492 3         3 $count++;
493             }
494 1         5 1;
495             }
496              
497             =head2 sort_by_list
498              
499             Title : sort_by_list
500             Usage : $aln_ordered=$aln->sort_by_list($list_file)
501             Function : Arbitrarily order sequences in an alignment
502             Returns : A new Bio::SimpleAlign object
503             Argument : a file listing sequence names in intended order (one name per line)
504              
505             =cut
506              
507             sub sort_by_list {
508 1     1 1 3 my ($self, $list) = @_;
509 1         4 my (@seq, @ids, %order);
510              
511 1         4 foreach my $seq ( $self->each_seq() ) {
512 6         9 push @seq, $seq;
513 6         14 push @ids, $seq->display_id;
514             }
515              
516 1         3 my $ct=1;
517 1 50       61 open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
518 1         31 while (<$listfh>) {
519 6         8 chomp;
520 6         7 my $name=$_;
521 6 50       15 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
522 6         28 $order{$name}=$ct++;
523             }
524 1         11 close($listfh);
525            
526             # use the map-sort-map idiom:
527 1         4 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
  6         10  
  7         11  
  6         16  
528 1         9 my $aln = $self->new;
529 1         3 foreach (@sorted) { $aln->add_seq($_) }
  6         15  
530 1         8 return $aln;
531             }
532              
533             =head2 set_new_reference
534              
535             Title : set_new_reference
536             Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
537             the sequence whoes name is "B31" (full, exact, and case-sensitive),
538             as the reference (1st) sequence
539             Function : Change/Set a new reference (i.e., the first) sequence
540             Returns : a new Bio::SimpleAlign object.
541             Throws an exception if designated sequence not found
542             Argument : a positive integer of sequence order, or a sequence name
543             in the original alignment
544              
545             =cut
546              
547             sub set_new_reference {
548 2     2 1 3 my ($self, $seqid) = @_;
549 2         4 my $aln = $self->new;
550 2         2 my (@seq, @ids, @new_seq);
551 2         2 my $is_num=0;
552 2         4 foreach my $seq ( $self->each_seq() ) {
553 12         11 push @seq, $seq;
554 12         13 push @ids, $seq->display_id;
555             }
556              
557 2 100       10 if ($seqid =~ /^\d+$/) { # argument is seq position
558 1         2 $is_num=1;
559 1 50 33     6 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
560             } else { # argument is a seq name
561 1 50       4 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
562             }
563              
564 2         5 for (my $i=0; $i<=$#seq; $i++) {
565 12         6 my $pos=$i+1;
566 12 100 100     37 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
      100        
567 2         4 unshift @new_seq, $seq[$i];
568             } else {
569 10         19 push @new_seq, $seq[$i];
570             }
571             }
572 2         3 foreach (@new_seq) { $aln->add_seq($_); }
  12         16  
573 2         7 return $aln;
574             }
575              
576             sub _in_aln { # check if input name exists in the alignment
577 7     7   10 my ($str, $ref) = @_;
578 7         11 foreach (@$ref) {
579 24 100       53 return 1 if $str eq $_;
580             }
581 0         0 return 0;
582             }
583              
584              
585             =head2 uniq_seq
586              
587             Title : uniq_seq
588             Usage : $aln->uniq_seq(): Remove identical sequences in
589             in the alignment. Ambiguous base ("N", "n") and
590             leading and ending gaps ("-") are NOT counted as
591             differences.
592             Function : Make a new alignment of unique sequence types (STs)
593             Returns : 1a. if called in a scalar context,
594             a new Bio::SimpleAlign object (all sequences renamed as "ST")
595             1b. if called in an array context,
596             a new Bio::SimpleAlign object, and a hashref whose keys
597             are sequence types, and whose values are arrayrefs to
598             lists of sequence ids within the corresponding sequence type
599             2. if $aln->verbose > 0, ST of each sequence is sent to
600             STDERR (in a tabular format)
601             Argument : None
602              
603             =cut
604              
605             sub uniq_seq {
606 1     1 1 3 my ($self, $seqid) = @_;
607 1         3 my $aln = $self->new;
608 1         2 my (%member, %order, @seq, @uniq_str, $st);
609 1         2 my $order=0;
610 1         3 my $len = $self->length();
611 1         3 $st = {};
612 1         3 foreach my $seq ( $self->each_seq() ) {
613 15         27 my $str = $seq->seq();
614              
615             # it's necessary to ignore "n", "N", leading gaps and ending gaps in
616             # comparing two sequence strings
617              
618             # 1st, convert "n", "N" to "?" (for DNA sequence only):
619 15 50       47 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
620             # 2nd, convert leading and ending gaps to "?":
621 15         22 $str = &_convert_leading_ending_gaps($str, '-', '?');
622             # Note that '?' also can mean unknown residue.
623             # I don't like making global class member changes like this, too
624             # prone to errors... -- cjfields 08-11-18
625 15         16 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
626 15         30 my $new = Bio::LocatableSeq->new(
627             -id => $seq->id(),
628             -alphabet=> $seq->alphabet,
629             -seq => $str,
630             -start => $seq->start,
631             -end => $seq->end
632             );
633 15         30 push @seq, $new;
634             }
635              
636 1         4 foreach my $seq (@seq) {
637 15         46 my $str = $seq->seq();
638 15         33 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
639 15 100       28 if ($seen) { # seen before
640 4         4 my @memb = @{$member{$key}};
  4         15  
641 4         4 push @memb, $seq;
642 4         14 $member{$key} = \@memb;
643             } else { # not seen
644 11         20 push @uniq_str, $key;
645 11         11 $order++;
646 11         62 $member{$key} = [ ($seq) ];
647 11         35 $order{$key} = $order;
648             }
649             }
650              
651 1         10 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
  30         23  
652             # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
653 11         17 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
654             # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
655 11 50       42 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
656 11         11 my $gap='-';
657 11         13 my $end= CORE::length($str2);
658 11         334 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
659 11         48 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
660             -seq =>$str2,
661             -start=>1,
662             -end =>$end
663             );
664 11         25 $aln->add_seq($new);
665 11         6 foreach (@{$member{$str}}) {
  11         20  
666 15         10 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
  15         37  
667 15         18 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
668             }
669             }
670 1 50       14 return wantarray ? ($aln, $st) : $aln;
671             }
672              
673             sub _check_uniq { # check if same seq exists in the alignment
674 15     15   15 my ($str1, $ref, $length) = @_;
675 15         389 my @char1=split //, $str1;
676 15         28 my @array=@$ref;
677              
678 15 100       43 return (0, $str1) if @array==0; # not seen (1st sequence)
679              
680 14         13 foreach my $str2 (@array) {
681 59         43 my $diff=0;
682 59         1435 my @char2=split //, $str2;
683 59         84 for (my $i=0; $i<=$length-1; $i++) {
684 24249 100       25413 next if $char1[$i] eq '?';
685 22382 100       21629 next if $char2[$i] eq '?';
686 21740 100       34125 $diff++ if $char1[$i] ne $char2[$i];
687             }
688 59 100       932 return (1, $str2) if $diff == 0; # seen before
689             }
690              
691 10         164 return (0, $str1); # not seen
692             }
693              
694             sub _convert_leading_ending_gaps {
695 26     26   21 my $s=shift;
696 26         21 my $sym1=shift;
697 26         21 my $sym2=shift;
698 26         684 my @array=split //, $s;
699             # convert leading char:
700 26         44 for (my $i=0; $i<=$#array; $i++) {
701 369 100       537 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
702             }
703             # convert ending char:
704 26         47 for (my $i = $#array; $i>= 0; $i--) {
705 552 100       718 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
706             }
707 26         247 my $s_new=join '', @array;
708 26         374 return $s_new;
709             }
710              
711             =head1 Sequence selection methods
712              
713             Methods returning one or more sequences objects.
714              
715             =head2 each_seq
716              
717             Title : each_seq
718             Usage : foreach $seq ( $align->each_seq() )
719             Function : Gets a Seq object from the alignment
720             Returns : Seq object
721             Argument :
722              
723             =cut
724              
725             sub eachSeq {
726 0     0 0 0 my $self = shift;
727 0         0 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
728 0         0 $self->each_seq();
729             }
730              
731             sub each_seq {
732 7623     7623 1 5247 my $self = shift;
733 7623         4703 my (@arr,$order);
734              
735 7623         4767 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
  127205         80460  
  7623         16839  
736 56096 100       73162 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
737 56083         56620 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
738             }
739             }
740 7623         15331 return @arr;
741             }
742              
743              
744             =head2 each_alphabetically
745              
746             Title : each_alphabetically
747             Usage : foreach $seq ( $ali->each_alphabetically() )
748             Function : Returns a sequence object, but the objects are returned
749             in alphabetically sorted order.
750             Does not change the order of the alignment.
751             Returns : Seq object
752             Argument :
753              
754             =cut
755              
756             sub each_alphabetically {
757 2     2 1 5 my $self = shift;
758 2         4 my ($seq,$nse,@arr,%hash,$count);
759              
760 2         7 foreach $seq ( $self->each_seq() ) {
761 32         48 $nse = $seq->get_nse;
762 32         44 $hash{$nse} = $seq;
763             }
764              
765 2         14 foreach $nse ( sort _alpha_startend keys %hash) {
766 32         26 push(@arr,$hash{$nse});
767             }
768 2         11 return @arr;
769             }
770              
771             sub _alpha_startend {
772 94     94   50 my ($aname,$astart,$bname,$bstart);
773 94         88 ($aname,$astart) = split (/-/,$a);
774 94         89 ($bname,$bstart) = split (/-/,$b);
775              
776 94 50       94 if( $aname eq $bname ) {
777 0         0 return $astart <=> $bstart;
778             }
779             else {
780 94         71 return $aname cmp $bname;
781             }
782             }
783              
784             =head2 each_seq_with_id
785              
786             Title : each_seq_with_id
787             Usage : foreach $seq ( $align->each_seq_with_id() )
788             Function : Gets a Seq objects from the alignment, the contents
789             being those sequences with the given name (there may be
790             more than one)
791             Returns : Seq object
792             Argument : a seq name
793              
794             =cut
795              
796             sub eachSeqWithId {
797 0     0 0 0 my $self = shift;
798 0         0 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
799 0         0 $self->each_seq_with_id(@_);
800             }
801              
802             sub each_seq_with_id {
803 89     89 1 81 my $self = shift;
804 89         82 my $id = shift;
805              
806 89 50       145 $self->throw("Method each_seq_with_id needs a sequence name argument")
807             unless defined $id;
808              
809 89         69 my (@arr, $seq);
810              
811 89 50       164 if (exists($self->{'_start_end_lists'}->{$id})) {
812 89         69 @arr = @{$self->{'_start_end_lists'}->{$id}};
  89         150  
813             }
814 89         155 return @arr;
815             }
816              
817             =head2 get_seq_by_pos
818              
819             Title : get_seq_by_pos
820             Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
821             Function : Gets a sequence based on its position in the alignment.
822             Numbering starts from 1. Sequence positions larger than
823             num_sequences() will throw an error.
824             Returns : a Bio::LocatableSeq object
825             Args : positive integer for the sequence position
826              
827             =cut
828              
829             sub get_seq_by_pos {
830              
831 258     258 1 8985 my $self = shift;
832 258         246 my ($pos) = @_;
833              
834 258 50 33     1577 $self->throw("Sequence position has to be a positive integer, not [$pos]")
835             unless $pos =~ /^\d+$/ and $pos > 0;
836 258 50       447 $self->throw("No sequence at position [$pos]")
837             unless $pos <= $self->num_sequences ;
838              
839 258         474 my $nse = $self->{'_order'}->{--$pos};
840 258         675 return $self->{'_seq'}->{$nse};
841             }
842              
843             =head2 get_seq_by_id
844              
845             Title : get_seq_by_id
846             Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
847             Function : Gets a sequence based on its name.
848             Sequences that do not exist will warn and return undef
849             Returns : a Bio::LocatableSeq object
850             Args : string for sequence name
851              
852             =cut
853              
854             sub get_seq_by_id {
855 18     18 1 2309 my ($self,$name) = @_;
856 18 50       36 unless( defined $name ) {
857 0         0 $self->warn("Must provide a sequence name");
858 0         0 return;
859             }
860 18         17 for my $seq ( values %{$self->{'_seq'}} ) {
  18         37  
861 45 100       63 if ( $seq->id eq $name) {
862 18         32 return $seq;
863             }
864             }
865 0         0 return;
866             }
867              
868             =head2 seq_with_features
869              
870             Title : seq_with_features
871             Usage : $seq = $aln->seq_with_features(-pos => 1,
872             -consensus => 60
873             -mask =>
874             sub { my $consensus = shift;
875              
876             for my $i (1..5){
877             my $n = 'N' x $i;
878             my $q = '\?' x $i;
879             while($consensus =~ /[^?]$q[^?]/){
880             $consensus =~ s/([^?])$q([^?])/$1$n$2/;
881             }
882             }
883             return $consensus;
884             }
885             );
886             Function: produces a Bio::Seq object by first splicing gaps from -pos
887             (by means of a splice_by_seq_pos() call), then creating
888             features using non-? chars (by means of a consensus_string()
889             call with stringency -consensus).
890             Returns : a Bio::Seq object
891             Args : -pos : required. sequence from which to build the Bio::Seq
892             object
893             -consensus : optional, defaults to consensus_string()'s
894             default cutoff value
895             -mask : optional, a coderef to apply to consensus_string()'s
896             output before building features. this may be useful for
897             closing gaps of 1 bp by masking over them with N, for
898             instance
899              
900             =cut
901              
902             sub seq_with_features{
903 0     0 1 0 my ($self,%arg) = @_;
904              
905             #first do the preparatory splice
906 0 0       0 $self->throw("must provide a -pos argument") unless $arg{-pos};
907 0         0 $self->splice_by_seq_pos($arg{-pos});
908              
909 0         0 my $consensus_string = $self->consensus_string($arg{-consensus});
910             $consensus_string = $arg{-mask}->($consensus_string)
911 0 0       0 if defined($arg{-mask});
912              
913 0         0 my(@bs,@es);
914              
915 0 0       0 push @bs, 1 if $consensus_string =~ /^[^?]/;
916              
917 0         0 while($consensus_string =~ /\?[^?]/g){
918 0         0 push @bs, pos($consensus_string);
919             }
920 0         0 while($consensus_string =~ /[^?]\?/g){
921 0         0 push @es, pos($consensus_string);
922             }
923              
924 0 0       0 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
925              
926 0         0 my $seq = Bio::Seq->new();
927              
928             # my $rootfeature = Bio::SeqFeature::Generic->new(
929             # -source_tag => 'location',
930             # -start => $self->get_seq_by_pos($arg{-pos})->start,
931             # -end => $self->get_seq_by_pos($arg{-pos})->end,
932             # );
933             # $seq->add_SeqFeature($rootfeature);
934              
935 0         0 while(my $b = shift @bs){
936 0         0 my $e = shift @es;
937             $seq->add_SeqFeature(
938             Bio::SeqFeature::Generic->new(
939             -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
940 0   0     0 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
941             -source_tag => $self->source || 'MSA',
942             )
943             );
944             }
945              
946 0         0 return $seq;
947             }
948              
949              
950             =head1 Create new alignments
951              
952             The result of these methods are horizontal or vertical subsets of the
953             current MSA.
954              
955             =head2 select
956              
957             Title : select
958             Usage : $aln2 = $aln->select(1, 3) # three first sequences
959             Function : Creates a new alignment from a continuous subset of
960             sequences. Numbering starts from 1. Sequence positions
961             larger than num_sequences() will throw an error.
962             Returns : a Bio::SimpleAlign object
963             Args : positive integer for the first sequence
964             positive integer for the last sequence to include (optional)
965              
966             =cut
967              
968             sub select {
969 1     1 1 2 my $self = shift;
970 1         2 my ($start, $end) = @_;
971              
972 1 50 33     11 $self->throw("Select start has to be a positive integer, not [$start]")
973             unless $start =~ /^\d+$/ and $start > 0;
974 1 50 33     7 $self->throw("Select end has to be a positive integer, not [$end]")
975             unless $end =~ /^\d+$/ and $end > 0;
976 1 50       3 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
977             unless $start <= $end;
978              
979 1         4 my $aln = $self->new;
980 1         5 foreach my $pos ($start .. $end) {
981 3         6 $aln->add_seq($self->get_seq_by_pos($pos));
982             }
983 1         5 $aln->id($self->id);
984             # fix for meta, sf, ann
985 1         3 return $aln;
986             }
987              
988             =head2 select_noncont
989              
990             Title : select_noncont
991             Usage : # 1st and 3rd sequences, sorted
992             $aln2 = $aln->select_noncont(1, 3)
993              
994             # 1st and 3rd sequences, sorted (same as first)
995             $aln2 = $aln->select_noncont(3, 1)
996              
997             # 1st and 3rd sequences, unsorted
998             $aln2 = $aln->select_noncont('nosort',3, 1)
999              
1000             Function : Creates a new alignment from a subset of sequences. Numbering
1001             starts from 1. Sequence positions larger than num_sequences() will
1002             throw an error. Sorts the order added to new alignment by default,
1003             to prevent sorting pass 'nosort' as the first argument in the list.
1004             Returns : a Bio::SimpleAlign object
1005             Args : array of integers for the sequences. If the string 'nosort' is
1006             passed as the first argument, the sequences will not be sorted
1007             in the new alignment but will appear in the order listed.
1008              
1009             =cut
1010              
1011             sub select_noncont {
1012 8     8 1 10 my $self = shift;
1013 8         10 my $nosort = 0;
1014 8         16 my (@pos) = @_;
1015 8 100       38 if ($pos[0] !~ m{^\d+$}) {
1016 1         2 my $sortcmd = shift @pos;
1017 1 50       4 if ($sortcmd eq 'nosort') {
1018 1         2 $nosort = 1;
1019             } else {
1020 0         0 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1021             }
1022             }
1023            
1024 8         14 my $end = $self->num_sequences;
1025 8         13 foreach ( @pos ) {
1026 32 50 33     175 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
      33        
1027             unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1028             }
1029            
1030 8 100       23 @pos = sort {$a <=> $b} @pos unless $nosort;
  25         23  
1031            
1032 8         17 my $aln = $self->new;
1033 8         14 foreach my $p (@pos) {
1034 32         48 $aln->add_seq($self->get_seq_by_pos($p));
1035             }
1036 8         20 $aln->id($self->id);
1037             # fix for meta, sf, ann
1038 8         19 return $aln;
1039             }
1040              
1041             =head2 select_noncont_by_name
1042              
1043             Title : select_noncont_by_name
1044             Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045             Function : Creates a new alignment from a subset of sequences which are
1046             selected by name (sequence ID).
1047             Returns : a Bio::SimpleAlign object
1048             Args : array of names (i.e., identifiers) for the sequences.
1049              
1050             =cut
1051              
1052             sub select_noncont_by_name {
1053 1     1 1 3 my ($self, @names) = @_;
1054            
1055 1         3 my $aln = $self->new;
1056 1         1 foreach my $name (@names) {
1057 3         6 $aln->add_seq($self->get_seq_by_id($name));
1058             }
1059 1         3 $aln->id($self->id);
1060              
1061 1         2 return $aln;
1062             }
1063              
1064             =head2 slice
1065              
1066             Title : slice
1067             Usage : $aln2 = $aln->slice(20,30)
1068             Function : Creates a slice from the alignment inclusive of start and
1069             end columns, and the first column in the alignment is denoted 1.
1070             Sequences with no residues in the slice are excluded from the
1071             new alignment and a warning is printed. Slice beyond the length of
1072             the sequence does not do padding.
1073             Returns : A Bio::SimpleAlign object
1074             Args : Positive integer for start column, positive integer for end column,
1075             optional boolean which if true will keep gap-only columns in the newly
1076             created slice. Example:
1077              
1078             $aln2 = $aln->slice(20,30,1)
1079              
1080             =cut
1081              
1082             sub slice {
1083 31     31 1 1866 my $self = shift;
1084 31         37 my ($start, $end, $keep_gap_only) = @_;
1085              
1086 31 50 33     198 $self->throw("Slice start has to be a positive integer, not [$start]")
1087             unless $start =~ /^\d+$/ and $start > 0;
1088 31 50 33     135 $self->throw("Slice end has to be a positive integer, not [$end]")
1089             unless $end =~ /^\d+$/ and $end > 0;
1090 31 50       49 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091             unless $start <= $end;
1092 31 100       62 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093             "[$start] is too big.") if $start > $self->length;
1094 30         70 my $cons_meta = $self->consensus_meta;
1095 30         51 my $aln = $self->new;
1096 30         55 $aln->id($self->id);
1097 30         45 foreach my $seq ( $self->each_seq() ) {
1098 107 50       591 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099             Bio::Seq::Meta->new
1100             (-id => $seq->id,
1101             -alphabet => $seq->alphabet,
1102             -strand => $seq->strand,
1103             -verbose => $self->verbose) :
1104             Bio::LocatableSeq->new
1105             (-id => $seq->id,
1106             -alphabet => $seq->alphabet,
1107             -strand => $seq->strand,
1108             -verbose => $self->verbose);
1109            
1110             # seq
1111 107         157 my $seq_end = $end;
1112 107 50       195 $seq_end = $seq->length if( $end > $seq->length );
1113              
1114 107         184 my $slice_seq = $seq->subseq($start, $seq_end);
1115 107         157 $new_seq->seq( $slice_seq );
1116              
1117             # Allowed extra characters in string
1118 107         96 my $allowed_chars = '';
1119 107 100       168 if (exists $self->{_mask_char}) {
1120 24         18 $allowed_chars = $self->{_mask_char};
1121 24         24 $allowed_chars = quotemeta $allowed_chars;
1122             }
1123 107         641 $slice_seq =~ s/[^\w$allowed_chars]//g;
1124              
1125 107 100       146 if ($start > 1) {
1126 52         93 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 52         303 $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128 52 100       92 if (!defined($seq->strand)) {
    100          
1129 38         60 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131 1         3 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133 13         25 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134             }
1135             } else {
1136 55 100 100     100 if ((defined $seq->strand)&&($seq->strand < 0)){
1137 2         6 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138             } else {
1139 53         89 $new_seq->start( $seq->start);
1140             }
1141             }
1142 107 50       438 if ($new_seq->isa('Bio::Seq::MetaI')) {
1143 0         0 for my $meta_name ($seq->meta_names) {
1144 0         0 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145             }
1146             }
1147 107         142 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148              
1149 107 100 66     184 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150 106         176 $aln->add_seq($new_seq);
1151             } else {
1152 1 50       4 if( $keep_gap_only ) {
1153 1         4 $aln->add_seq($new_seq);
1154             } else {
1155 0         0 my $nse = $seq->get_nse();
1156 0         0 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157             " Sequence excluded from the new alignment.");
1158             }
1159             }
1160             }
1161 30 50       52 if ($cons_meta) {
1162 0         0 my $new = Bio::Seq::Meta->new();
1163 0         0 for my $meta_name ($cons_meta->meta_names) {
1164 0         0 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165             }
1166 0         0 $aln->consensus_meta($new);
1167             }
1168 30         74 $aln->annotation($self->annotation);
1169             # fix for meta, sf, ann
1170 30         63 return $aln;
1171             }
1172              
1173             =head2 remove_columns
1174              
1175             Title : remove_columns
1176             Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177             $aln2 = $aln->remove_columns([0,0],[6,8])
1178             Function : Creates an aligment with columns removed corresponding to
1179             the specified type or by specifying the columns by number.
1180             Returns : Bio::SimpleAlign object
1181             Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182             'all_gaps_columns') or array ref where the referenced array
1183             contains a pair of integers that specify a range.
1184             The first column is 0
1185              
1186             =cut
1187              
1188             sub remove_columns {
1189 4     4 1 378 my ($self,@args) = @_;
1190 4 50       10 @args || $self->throw("Must supply column ranges or column types");
1191 4         4 my $aln;
1192              
1193 4 100       32 if ($args[0][0] =~ /^[a-z_]+$/i) {
    50          
1194 1         4 $aln = $self->_remove_columns_by_type($args[0]);
1195             } elsif ($args[0][0] =~ /^\d+$/) {
1196 3         13 $aln = $self->_remove_columns_by_num(\@args);
1197             } else {
1198 0         0 $self->throw("You must pass array references to remove_columns(), not @args");
1199             }
1200             # fix for meta, sf, ann
1201 4         12 $aln;
1202             }
1203              
1204              
1205             =head2 remove_gaps
1206              
1207             Title : remove_gaps
1208             Usage : $aln2 = $aln->remove_gaps
1209             Function : Creates an aligment with gaps removed
1210             Returns : a Bio::SimpleAlign object
1211             Args : a gap character(optional) if none specified taken
1212             from $self->gap_char,
1213             [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214             indicates that only all-gaps columns should be deleted
1215              
1216             Used from method L in most cases. Set gap character
1217             using L.
1218              
1219             =cut
1220              
1221             sub remove_gaps {
1222 6     6 1 12 my ($self,$gapchar,$all_gaps_columns) = @_;
1223 6         9 my $gap_line;
1224 6 100       14 if ($all_gaps_columns) {
1225 2         8 $gap_line = $self->all_gap_line($gapchar);
1226             } else {
1227 4         13 $gap_line = $self->gap_line($gapchar);
1228             }
1229 6         13 my $aln = $self->new;
1230              
1231 6         8 my @remove;
1232 6         7 my $length = 0;
1233 6   66     16 my $del_char = $gapchar || $self->gap_char;
1234             # Do the matching to get the segments to remove
1235 6         47 while ($gap_line =~ m/[$del_char]/g) {
1236 14         14 my $start = pos($gap_line)-1;
1237 14         29 $gap_line =~ m/\G[$del_char]+/gc;
1238 14         15 my $end = pos($gap_line)-1;
1239              
1240             #have to offset the start and end for subsequent removes
1241 14         12 $start-=$length;
1242 14         8 $end -=$length;
1243 14         13 $length += ($end-$start+1);
1244 14         37 push @remove, [$start,$end];
1245             }
1246              
1247             #remove the segments
1248 6 100       26 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249             # fix for meta, sf, ann
1250 6         20 return $aln;
1251             }
1252              
1253              
1254             sub _remove_col {
1255 9     9   13 my ($self,$aln,$remove) = @_;
1256 9         10 my @new;
1257            
1258 9         19 my $gap = $self->gap_char;
1259            
1260             # splice out the segments and create new seq
1261 9         17 foreach my $seq($self->each_seq){
1262 42         96 my $new_seq = Bio::LocatableSeq->new(
1263             -id => $seq->id,
1264             -alphabet=> $seq->alphabet,
1265             -strand => $seq->strand,
1266             -verbose => $self->verbose);
1267 42         90 my $sequence = $seq->seq;
1268 42         36 foreach my $pair(@{$remove}){
  42         94  
1269 658         437 my $start = $pair->[0];
1270 658         417 my $end = $pair->[1];
1271 658 50       697 $sequence = $seq->seq unless $sequence;
1272 658         439 my $orig = $sequence;
1273 658 100       826 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274 658 100       845 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275 658         510 $sequence = $head.$tail;
1276             # start
1277 658 100       728 unless (defined $new_seq->start) {
1278 42 100       55 if ($start == 0) {
1279 31         109 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280 31         51 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1281             }
1282             else {
1283 11         46 my $start_adjust = $orig =~ /^$gap+/;
1284 11 100       20 if ($start_adjust) {
1285 5         12 $start_adjust = $+[0] == $start;
1286             }
1287 11         22 $new_seq->start($seq->start + $start_adjust);
1288             }
1289             }
1290             # end
1291 658 100       698 if (($end + 1) >= CORE::length($orig)) {
1292 32         81 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293 32         48 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294             }
1295             else {
1296 626         705 $new_seq->end($seq->end);
1297             }
1298             }
1299            
1300 42 100       58 if ($new_seq->end < $new_seq->start) {
1301             # we removed all columns except for gaps: set to 0 to indicate no
1302             # sequence
1303 1         3 $new_seq->start(0);
1304 1         2 $new_seq->end(0);
1305             }
1306            
1307 42 50       86 $new_seq->seq($sequence) if $sequence;
1308 42         73 push @new, $new_seq;
1309             }
1310             # add the new seqs to the alignment
1311 9         13 foreach my $new(@new){
1312 42         61 $aln->add_seq($new);
1313             }
1314             # fix for meta, sf, ann
1315 9         17 return $aln;
1316             }
1317              
1318             sub _remove_columns_by_type {
1319 1     1   1 my ($self,$type) = @_;
1320 1         2 my $aln = $self->new;
1321 1         1 my @remove;
1322              
1323 1 50       1 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
  1         4  
  1         2  
1324 1 50       1 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
  1         4  
1325 1         6 my %matchchars = ( 'match' => '\*',
1326             'weak' => '\.',
1327             'strong' => ':',
1328             'mismatch' => ' ',
1329             'gaps' => '',
1330             'all_gaps_columns' => ''
1331             );
1332             # get the characters to delete against
1333 1         1 my $del_char;
1334 1         1 foreach my $type (@{$type}){
  1         2  
1335 1         2 $del_char.= $matchchars{$type};
1336             }
1337              
1338 1         1 my $length = 0;
1339 1         4 my $match_line = $self->match_line;
1340             # do the matching to get the segments to remove
1341 1 50       5 if($del_char){
1342 1         37 while($match_line =~ m/[$del_char]/g ){
1343 37         28 my $start = pos($match_line)-1;
1344 37         56 $match_line=~/\G[$del_char]+/gc;
1345 37         25 my $end = pos($match_line)-1;
1346              
1347             #have to offset the start and end for subsequent removes
1348 37         24 $start-=$length;
1349 37         19 $end -=$length;
1350 37         26 $length += ($end-$start+1);
1351 37         72 push @remove, [$start,$end];
1352             }
1353             }
1354              
1355             # remove the segments
1356 1 50       9 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357 1 50       4 $aln = $aln->remove_gaps() if $gap;
1358 1 50       63 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359             # fix for meta, sf, ann
1360 1         10 $aln;
1361             }
1362              
1363              
1364             sub _remove_columns_by_num {
1365 3     3   5 my ($self,$positions) = @_;
1366 3         6 my $aln = $self->new;
1367              
1368             # sort the positions
1369 3         11 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
  3         4  
1370            
1371 3         5 my @remove;
1372 3         4 my $length = 0;
1373 3         5 foreach my $pos (@{$positions}) {
  3         7  
1374 5         6 my ($start, $end) = @{$pos};
  5         8  
1375            
1376             #have to offset the start and end for subsequent removes
1377 5         8 $start-=$length;
1378 5         5 $end -=$length;
1379 5         7 $length += ($end-$start+1);
1380 5         282 push @remove, [$start,$end];
1381             }
1382              
1383             #remove the segments
1384 3 50       15 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385             # fix for meta, sf, ann
1386 3         7 $aln;
1387             }
1388              
1389              
1390             =head1 Change sequences within the MSA
1391              
1392             These methods affect characters in all sequences without changing the
1393             alignment.
1394              
1395             =head2 splice_by_seq_pos
1396              
1397             Title : splice_by_seq_pos
1398             Usage : $status = splice_by_seq_pos(1);
1399             Function: splices all aligned sequences where the specified sequence
1400             has gaps.
1401             Example :
1402             Returns : 1 on success
1403             Args : position of sequence to splice by
1404              
1405              
1406             =cut
1407              
1408             sub splice_by_seq_pos{
1409 0     0 1 0 my ($self,$pos) = @_;
1410              
1411 0         0 my $guide = $self->get_seq_by_pos($pos);
1412 0         0 my $guide_seq = $guide->seq;
1413              
1414 0         0 $guide_seq =~ s/\./\-/g;
1415              
1416 0         0 my @gaps = ();
1417 0         0 $pos = -1;
1418 0         0 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419 0         0 unshift @gaps, $pos;
1420 0         0 $pos++;
1421             }
1422              
1423 0         0 foreach my $seq ($self->each_seq){
1424 0         0 my @bases = split '', $seq->seq;
1425              
1426 0         0 splice(@bases, $_, 1) foreach @gaps;
1427 0         0 $seq->seq(join('', @bases));
1428             }
1429              
1430 0         0 1;
1431             }
1432              
1433             =head2 map_chars
1434              
1435             Title : map_chars
1436             Usage : $ali->map_chars('\.','-')
1437             Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438             characters.
1439              
1440             Note that the first argument is interpreted as a regexp
1441             so be careful and escape any wild card characters (e.g.
1442             do $ali->map_chars('\.','-') to replace periods with dashes.
1443             Returns : 1 on success
1444             Argument : A regexp and a string
1445              
1446             =cut
1447              
1448             sub map_chars {
1449 4     4 1 7 my $self = shift;
1450 4         6 my $from = shift;
1451 4         6 my $to = shift;
1452 4         6 my ( $seq, $temp );
1453              
1454 4 50 33     27 $self->throw("Need two arguments: a regexp and a string")
1455             unless defined $from and defined $to;
1456              
1457 4         13 foreach $seq ( $self->each_seq() ) {
1458 39         60 $temp = $seq->seq();
1459 39         144 $temp =~ s/$from/$to/g;
1460 39         58 $seq->seq($temp);
1461             }
1462 4         13 return 1;
1463             }
1464              
1465              
1466             =head2 uppercase
1467              
1468             Title : uppercase()
1469             Usage : $ali->uppercase()
1470             Function : Sets all the sequences to uppercase
1471             Returns : 1 on success
1472             Argument :
1473              
1474             =cut
1475              
1476             sub uppercase {
1477 1     1 1 2 my $self = shift;
1478 1         1 my $seq;
1479             my $temp;
1480              
1481 1         4 foreach $seq ( $self->each_seq() ) {
1482 16         23 $temp = $seq->seq();
1483 16         25 $temp =~ tr/[a-z]/[A-Z]/;
1484              
1485 16         18 $seq->seq($temp);
1486             }
1487 1         5 return 1;
1488             }
1489              
1490             =head2 cigar_line
1491              
1492             Title : cigar_line()
1493             Usage : %cigars = $align->cigar_line()
1494             Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495             Report) line for each sequence in the alignment. Examples are
1496             "1,60" or "5,10:12,58", where the numbers refer to conserved
1497             positions within the alignment. The keys of the hash are the
1498             NSEs (name/start/end) assigned to each sequence.
1499             Args : threshold (optional, defaults to 100)
1500             Returns : Hash of strings (cigar lines)
1501              
1502             =cut
1503              
1504             sub cigar_line {
1505 1     1 1 344 my $self = shift;
1506 1   50     6 my $thr=shift||100;
1507 1         1 my %cigars;
1508              
1509 1         4 my @consensus = split "",($self->consensus_string($thr));
1510 1         3 my $len = $self->length;
1511 1         3 my $gapchar = $self->gap_char;
1512              
1513             # create a precursor, something like (1,4,5,6,7,33,45),
1514             # where each number corresponds to a conserved position
1515 1         2 foreach my $seq ( $self->each_seq ) {
1516 4         7 my @seq = split "", uc ($seq->seq);
1517 4         2 my $pos = 1;
1518 4         10 for (my $x = 0 ; $x < $len ; $x++ ) {
1519 132 100       186 if ($seq[$x] eq $consensus[$x]) {
    100          
1520 28         18 push @{$cigars{$seq->get_nse}},$pos;
  28         30  
1521 28         45 $pos++;
1522             } elsif ($seq[$x] ne $gapchar) {
1523 84         94 $pos++;
1524             }
1525             }
1526             }
1527             # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528 1         3 for my $name (keys %cigars) {
1529 4         5 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
  4         5  
1530 4 50       3 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
  4         6  
  4         8  
1531 4         2 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
  4         4  
  4         4  
1532 4         6 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
  4         3  
1533 4 50       3 ${$cigars{$name}}[$#{$cigars{$name}}] );
  4         7  
  4         3  
1534 4         5 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
  32         40  
1535 28 100 100     16 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
  28         17  
  28         44  
1536 8         12 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
  8         18  
1537 4         1 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
  4         5  
  4         5  
1538             }
1539             }
1540             }
1541             # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542 1         2 for my $name (keys %cigars) {
1543 4         5 my @remove;
1544 4         2 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
  40         48  
1545 36 100 100     17 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
  36         28  
  36         51  
1546 12         10 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
  12         26  
1547 8         8 unshift @remove,$x;
1548             }
1549             }
1550 4         5 for my $pos (@remove) {
1551 8         6 splice @{$cigars{$name}}, $pos, 1;
  8         8  
1552             }
1553             }
1554             # join and punctuate
1555 1         12 for my $name (keys %cigars) {
1556 4         4 my ($start,$end,$str) = "";
1557 4         4 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
  20         32  
1558 16         19 $str .= ($start . "," . $end . ":");
1559             }
1560 4         10 $str =~ s/:$//;
1561 4         7 $cigars{$name} = $str;
1562             }
1563 1         8 %cigars;
1564             }
1565              
1566              
1567             =head2 match_line
1568              
1569             Title : match_line()
1570             Usage : $line = $align->match_line()
1571             Function : Generates a match line - much like consensus string
1572             except that a line indicating the '*' for a match.
1573             Args : (optional) Match line characters ('*' by default)
1574             (optional) Strong match char (':' by default)
1575             (optional) Weak match char ('.' by default)
1576             Returns : String
1577              
1578             =cut
1579              
1580             sub match_line {
1581 6     6 1 14 my ($self,$matchlinechar, $strong, $weak) = @_;
1582 6   50     79 my %matchchars = ('match' => $matchlinechar || '*',
      50        
      50        
1583             'weak' => $weak || '.',
1584             'strong' => $strong || ':',
1585             'mismatch' => ' ',
1586             );
1587              
1588 6         6 my @seqchars;
1589             my $alphabet;
1590 6         20 foreach my $seq ( $self->each_seq ) {
1591 69         132 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592 69 100       1117 $alphabet = $seq->alphabet unless defined $alphabet;
1593             }
1594 6         15 my $refseq = shift @seqchars;
1595             # let's just march down the columns
1596 6         9 my $matchline;
1597             POS:
1598 6         24 foreach my $pos ( 0..$self->length ) {
1599 3138         2503 my $refchar = $refseq->[$pos];
1600 3138         2028 my $char = $matchchars{'mismatch'};
1601 3138 100       3796 unless( defined $refchar ) {
1602 6 50       43 last if $pos == $self->length; # short circuit on last residue
1603             # this in place to handle jason's soon-to-be-committed
1604             # intron mapping code
1605 0         0 goto bottom;
1606             }
1607 3132         2866 my %col = ($refchar => 1);
1608 3132   66     9975 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609 3132         2515 foreach my $seq ( @seqchars ) {
1610 26906 50       28048 next if $pos >= scalar @$seq;
1611 26906 100 100     89370 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
      66        
1612             $seq->[$pos] eq ' ' );
1613 26906 50       33526 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1614             }
1615 3132         4945 my @colresidues = sort keys %col;
1616              
1617             # if all the values are the same
1618 3132 100       4494 if( $dash ) { $char = $matchchars{'mismatch'} }
  758 100       605  
    100          
1619 1721         1344 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620             elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621             # matches for protein seqs
1622             TYPE:
1623 472         369 foreach my $type ( qw(strong weak) ) {
1624             # iterate through categories
1625 804         494 my %groups;
1626             # iterate through each of the aa in the col
1627             # look to see which groups it is in
1628 804         577 foreach my $c ( @colresidues ) {
1629 3075         1759 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
  30455         25027  
  3075         2802  
1630 6808         3699 push @{$groups{$f}},$c;
  6808         7975  
1631             }
1632             }
1633             GRP:
1634 804         850 foreach my $cols ( values %groups ) {
1635 3948         4424 @$cols = sort @$cols;
1636             # now we are just testing to see if two arrays
1637             # are identical w/o changing either one
1638             # have to be same len
1639 3948 100       6081 next if( scalar @$cols != scalar @colresidues );
1640             # walk down the length and check each slot
1641 196         302 for($_=0;$_ < (scalar @$cols);$_++ ) {
1642 453 50       850 next GRP if( $cols->[$_] ne $colresidues[$_] );
1643             }
1644 196         175 $char = $matchchars{$type};
1645 196         347 last TYPE;
1646             }
1647             }
1648             }
1649             bottom:
1650 3132         3919 $matchline .= $char;
1651             }
1652 6         1265 return $matchline;
1653             }
1654              
1655              
1656             =head2 gap_line
1657              
1658             Title : gap_line()
1659             Usage : $line = $align->gap_line()
1660             Function : Generates a gap line - much like consensus string
1661             except that a line where '-' represents gap
1662             Args : (optional) gap line characters ('-' by default)
1663             Returns : string
1664              
1665             =cut
1666              
1667             sub gap_line {
1668 14     14 1 17 my ($self,$gapchar) = @_;
1669 14   33     54 $gapchar = $gapchar || $self->gap_char;
1670 14         14 my %gap_hsh; # column gaps vector
1671 14         23 foreach my $seq ( $self->each_seq ) {
1672 33         24 my $i = 0;
1673 223         739 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
  4020         4640  
1674 33         68 map {[$i++, $_]} split(//, uc ($seq->seq));
  4020         3495  
1675             }
1676 14         17 my $gap_line;
1677 14         34 foreach my $pos ( 0..$self->length-1 ) {
1678 1973 100       2015 $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1679             }
1680 14         49 return $gap_line;
1681             }
1682              
1683             =head2 all_gap_line
1684              
1685             Title : all_gap_line()
1686             Usage : $line = $align->all_gap_line()
1687             Function : Generates a gap line - much like consensus string
1688             except that a line where '-' represents all-gap column
1689             Args : (optional) gap line characters ('-' by default)
1690             Returns : string
1691              
1692             =cut
1693              
1694             sub all_gap_line {
1695 2     2 1 4 my ($self,$gapchar) = @_;
1696 2   66     8 $gapchar = $gapchar || $self->gap_char;
1697 2         3 my %gap_hsh; # column gaps counter hash
1698 2         5 my @seqs = $self->each_seq;
1699 2         5 foreach my $seq ( @seqs ) {
1700 5         8 my $i = 0;
1701 7         17 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
  54         101  
1702 5         9 map {[$i++, $_]} split(//, uc ($seq->seq));
  54         56  
1703             }
1704 2         3 my $gap_line;
1705 2         6 foreach my $pos ( 0..$self->length-1 ) {
1706 22 100 100     43 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707             # gaps column
1708 1         2 $gap_line .= $self->gap_char;
1709             } else {
1710 21         18 $gap_line .= '.';
1711             }
1712             }
1713 2         7 return $gap_line;
1714             }
1715              
1716             =head2 gap_col_matrix
1717              
1718             Title : gap_col_matrix()
1719             Usage : my $cols = $align->gap_col_matrix()
1720             Function : Generates an array where each element in the array is a
1721             hash reference with a key of the sequence name and a
1722             value of 1 if the sequence has a gap at that column
1723             Returns : Reference to an array
1724             Args : Optional: gap line character ($aln->gap_char or '-' by default)
1725              
1726             =cut
1727              
1728             sub gap_col_matrix {
1729 0     0 1 0 my ( $self, $gapchar ) = @_;
1730 0   0     0 $gapchar = $gapchar || $self->gap_char;
1731 0         0 my %gap_hsh; # column gaps vector
1732             my @cols;
1733 0         0 foreach my $seq ( $self->each_seq ) {
1734 0         0 my $i = 0;
1735 0         0 my $str = $seq->seq;
1736 0         0 my $len = $seq->length;
1737 0         0 my $ch;
1738 0         0 my $id = $seq->display_id;
1739 0         0 while ( $i < $len ) {
1740 0         0 $ch = substr( $str, $i, 1 );
1741 0         0 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1742             }
1743             }
1744 0         0 return \@cols;
1745             }
1746              
1747             =head2 match
1748              
1749             Title : match()
1750             Usage : $ali->match()
1751             Function : Goes through all columns and changes residues that are
1752             identical to residue in first sequence to match '.'
1753             character. Sets match_char.
1754              
1755             USE WITH CARE: Most MSA formats do not support match
1756             characters in sequences, so this is mostly for output
1757             only. NEXUS format (Bio::AlignIO::nexus) can handle
1758             it.
1759             Returns : 1 on success
1760             Argument : a match character, optional, defaults to '.'
1761              
1762             =cut
1763              
1764             sub match {
1765 1     1 1 1 my ( $self, $match ) = @_;
1766              
1767 1   50     5 $match ||= '.';
1768 1         1 my ($matching_char) = $match;
1769 1 50       5 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1770 1         5 $self->map_chars( $matching_char, '-' );
1771              
1772 1         2 my @seqs = $self->each_seq();
1773 1 50       4 return 1 unless scalar @seqs > 1;
1774              
1775 1         2 my $refseq = shift @seqs;
1776 1         2 my @refseq = split //, $refseq->seq;
1777 1         3 my $gapchar = $self->gap_char;
1778              
1779 1         2 foreach my $seq (@seqs) {
1780 5         8 my @varseq = split //, $seq->seq();
1781 5         10 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782 720 100 66     3120 $varseq[$i] = $match
      33        
      66        
1783             if defined $refseq[$i]
1784             && ( $refseq[$i] =~ /[A-Za-z\*]/
1785             || $refseq[$i] =~ /$gapchar/ )
1786             && $refseq[$i] eq $varseq[$i];
1787             }
1788 5         31 $seq->seq( join '', @varseq );
1789             }
1790 1         3 $self->match_char($match);
1791 1         6 return 1;
1792             }
1793              
1794              
1795              
1796             =head2 unmatch
1797              
1798             Title : unmatch()
1799             Usage : $ali->unmatch()
1800             Function : Undoes the effect of method match. Unsets match_char.
1801             Returns : 1 on success
1802             Argument : a match character, optional, defaults to '.'
1803              
1804             See L and L
1805              
1806             =cut
1807              
1808             sub unmatch {
1809 5     5 1 8 my ( $self, $match ) = @_;
1810              
1811 5   100     17 $match ||= '.';
1812              
1813 5         16 my @seqs = $self->each_seq();
1814 5 50       18 return 1 unless scalar @seqs > 1;
1815              
1816 5         7 my $refseq = shift @seqs;
1817 5         17 my @refseq = split //, $refseq->seq;
1818 5         15 my $gapchar = $self->gap_char;
1819 5         8 foreach my $seq (@seqs) {
1820 114         168 my @varseq = split //, $seq->seq();
1821 114         200 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822 19782 100 100     86466 $varseq[$i] = $refseq[$i]
      33        
      100        
1823             if defined $refseq[$i]
1824             && ( $refseq[$i] =~ /[A-Za-z\*]/
1825             || $refseq[$i] =~ /$gapchar/ )
1826             && $varseq[$i] eq $match;
1827             }
1828 114         834 $seq->seq( join '', @varseq );
1829             }
1830 5         20 $self->match_char('');
1831 5         71 return 1;
1832             }
1833              
1834              
1835             =head1 MSA attributes
1836              
1837             Methods for setting and reading the MSA attributes.
1838              
1839             Note that the methods defining character semantics depend on the user
1840             to set them sensibly. They are needed only by certain input/output
1841             methods. Unset them by setting to an empty string ('').
1842              
1843             =head2 id
1844              
1845             Title : id
1846             Usage : $myalign->id("Ig")
1847             Function : Gets/sets the id field of the alignment
1848             Returns : An id string
1849             Argument : An id string (optional)
1850              
1851             =cut
1852              
1853             sub id {
1854 142     142 1 159 my ( $self, $name ) = @_;
1855              
1856 142 100       244 if ( defined($name) ) {
1857 72         99 $self->{'_id'} = $name;
1858             }
1859              
1860 142         259 return $self->{'_id'};
1861             }
1862              
1863             =head2 accession
1864              
1865             Title : accession
1866             Usage : $myalign->accession("PF00244")
1867             Function : Gets/sets the accession field of the alignment
1868             Returns : An acc string
1869             Argument : An acc string (optional)
1870              
1871             =cut
1872              
1873             sub accession {
1874 16     16 1 19 my ( $self, $acc ) = @_;
1875              
1876 16 100       36 if ( defined($acc) ) {
1877 8         17 $self->{'_accession'} = $acc;
1878             }
1879              
1880 16         34 return $self->{'_accession'};
1881             }
1882              
1883             =head2 description
1884              
1885             Title : description
1886             Usage : $myalign->description("14-3-3 proteins")
1887             Function : Gets/sets the description field of the alignment
1888             Returns : An description string
1889             Argument : An description string (optional)
1890              
1891             =cut
1892              
1893             sub description {
1894 25     25 1 28 my ( $self, $name ) = @_;
1895              
1896 25 100       65 if ( defined($name) ) {
1897 13         31 $self->{'_description'} = $name;
1898             }
1899              
1900 25         49 return $self->{'_description'};
1901             }
1902              
1903             =head2 missing_char
1904              
1905             Title : missing_char
1906             Usage : $myalign->missing_char("?")
1907             Function : Gets/sets the missing_char attribute of the alignment
1908             It is generally recommended to set it to 'n' or 'N'
1909             for nucleotides and to 'X' for protein.
1910             Returns : An missing_char string,
1911             Argument : An missing_char string (optional)
1912              
1913             =cut
1914              
1915             sub missing_char {
1916 32     32 1 39 my ( $self, $char ) = @_;
1917              
1918 32 100       59 if ( defined $char ) {
1919 22 50       42 $self->throw("Single missing character, not [$char]!")
1920             if CORE::length($char) > 1;
1921 22         38 $self->{'_missing_char'} = $char;
1922             }
1923              
1924 32         46 return $self->{'_missing_char'};
1925             }
1926              
1927              
1928             =head2 match_char
1929              
1930             Title : match_char
1931             Usage : $myalign->match_char('.')
1932             Function : Gets/sets the match_char attribute of the alignment
1933             Returns : An match_char string,
1934             Argument : An match_char string (optional)
1935              
1936             =cut
1937              
1938             sub match_char {
1939 18     18 1 29 my ( $self, $char ) = @_;
1940              
1941 18 100       35 if ( defined $char ) {
1942 9 50       22 $self->throw("Single match character, not [$char]!")
1943             if CORE::length($char) > 1;
1944 9         19 $self->{'_match_char'} = $char;
1945             }
1946              
1947 18         40 return $self->{'_match_char'};
1948             }
1949              
1950             =head2 gap_char
1951              
1952             Title : gap_char
1953             Usage : $myalign->gap_char('-')
1954             Function : Gets/sets the gap_char attribute of the alignment
1955             Returns : An gap_char string, defaults to '-'
1956             Argument : An gap_char string (optional)
1957              
1958             =cut
1959              
1960             sub gap_char {
1961 11242     11242 1 7597 my ( $self, $char ) = @_;
1962              
1963 11242 100 100     29178 if ( defined $char || !defined $self->{'_gap_char'} ) {
1964 51 100       108 $char = '-' unless defined $char;
1965 51 50       102 $self->throw("Single gap character, not [$char]!")
1966             if CORE::length($char) > 1;
1967 51         134 $self->{'_gap_char'} = $char;
1968             }
1969 11242         15187 return $self->{'_gap_char'};
1970             }
1971              
1972              
1973             =head2 symbol_chars
1974              
1975             Title : symbol_chars
1976             Usage : my @symbolchars = $aln->symbol_chars;
1977             Function: Returns all the seen symbols (other than gaps)
1978             Returns : array of characters that are the seen symbols
1979             Args : boolean to include the gap/missing/match characters
1980              
1981             =cut
1982              
1983             sub symbol_chars{
1984 5     5 1 7 my ($self,$includeextra) = @_;
1985              
1986 5 100       12 unless ($self->{'_symbols'}) {
1987 3         7 foreach my $seq ($self->each_seq) {
1988 12         22 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
  2778         2017  
1989             }
1990             }
1991 5         6 my %copy = %{$self->{'_symbols'}};
  5         26  
1992 5 50       14 if( ! $includeextra ) {
1993 5         7 foreach my $char ( $self->gap_char, $self->match_char,
1994             $self->missing_char) {
1995 15 100       26 delete $copy{$char} if( defined $char );
1996             }
1997             }
1998 5         30 return keys %copy;
1999             }
2000              
2001             =head1 Alignment descriptors
2002              
2003             These read only methods describe the MSA in various ways.
2004              
2005              
2006             =head2 score
2007              
2008             Title : score
2009             Usage : $str = $ali->score()
2010             Function : get/set a score of the alignment
2011             Returns : a score for the alignment
2012             Argument : an optional score to set
2013              
2014             =cut
2015              
2016             sub score {
2017 26     26 1 994 my $self = shift;
2018 26 100       69 $self->{score} = shift if @_;
2019 26         53 return $self->{score};
2020             }
2021              
2022             =head2 consensus_string
2023              
2024             Title : consensus_string
2025             Usage : $str = $ali->consensus_string($threshold_percent)
2026             Function : Makes a strict consensus
2027             Returns : Consensus string
2028             Argument : Optional threshold ranging from 0 to 100.
2029             The consensus residue has to appear at least threshold %
2030             of the sequences at a given location, otherwise a '?'
2031             character will be placed at that location.
2032             (Default value = 0%)
2033              
2034             =cut
2035              
2036             sub consensus_string {
2037 11     11 1 2724 my $self = shift;
2038 11         14 my $threshold = shift;
2039              
2040 11         20 my $out = "";
2041 11         30 my $len = $self->length - 1;
2042              
2043 11         26 foreach ( 0 .. $len ) {
2044 2100         2262 $out .= $self->_consensus_aa( $_, $threshold );
2045             }
2046 11         89 return $out;
2047             }
2048              
2049              
2050             =head2 consensus_conservation
2051              
2052             Title : consensus_conservation
2053             Usage : @conservation = $ali->consensus_conservation();
2054             Function : Conservation (as a percent) of each position of alignment
2055             Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2056             Argument :
2057            
2058             =cut
2059              
2060             sub consensus_conservation {
2061 1     1 1 4 my $self = shift;
2062 1         2 my @cons;
2063 1         4 my $num_sequences = $self->num_sequences;
2064 1         6 foreach my $point (0..$self->length-1) {
2065 422         437 my %hash = $self->_consensus_counts($point);
2066             # max frequency of a non-gap letter
2067 422         582 my $max = (sort {$b<=>$a} values %hash )[0];
  562         452  
2068 422         615 push @cons, 100 * $max / $num_sequences;
2069             }
2070 1         62 return @cons;
2071             }
2072              
2073             sub _consensus_aa {
2074 2100     2100   1458 my $self = shift;
2075 2100         1324 my $point = shift;
2076 2100   100     4140 my $threshold_percent = shift || -1 ;
2077 2100         1411 my ($seq,%hash,$count,$letter,$key);
2078 2100         2170 my $gapchar = $self->gap_char;
2079 2100         2286 %hash = $self->_consensus_counts($point);
2080 2100         2594 my $number_of_sequences = $self->num_sequences();
2081 2100         2231 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082 2100         1354 $count = -1;
2083 2100         1321 $letter = '?';
2084              
2085 2100         3097 foreach $key ( sort keys %hash ) {
2086             # print "Now at $key $hash{$key}\n";
2087 4898 100 100     11665 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088 2472         1585 $letter = $key;
2089 2472         2090 $count = $hash{$key};
2090             }
2091             }
2092 2100         3730 return $letter;
2093             }
2094              
2095             # Frequency of each letter in one column
2096             sub _consensus_counts {
2097 2522     2522   1638 my $self = shift;
2098 2522         1577 my $point = shift;
2099 2522         1566 my %hash;
2100 2522         2377 my $gapchar = $self->gap_char;
2101 2522         2636 foreach my $seq ( $self->each_seq() ) {
2102 22078         24364 my $letter = substr($seq->seq,$point,1);
2103 22078 50       25351 $self->throw("--$point-----------") if $letter eq '';
2104 22078 100 100     50815 ($letter eq $gapchar || $letter =~ /\./) && next;
2105 17334         15788 $hash{$letter}++;
2106             }
2107 2522         5440 return %hash;
2108             }
2109              
2110              
2111             =head2 consensus_iupac
2112              
2113             Title : consensus_iupac
2114             Usage : $str = $ali->consensus_iupac()
2115             Function : Makes a consensus using IUPAC ambiguity codes from DNA
2116             and RNA. The output is in upper case except when gaps in
2117             a column force output to be in lower case.
2118              
2119             Note that if your alignment sequences contain a lot of
2120             IUPAC ambiquity codes you often have to manually set
2121             alphabet. Bio::PrimarySeq::_guess_type thinks they
2122             indicate a protein sequence.
2123             Returns : consensus string
2124             Argument : none
2125             Throws : on protein sequences
2126              
2127             =cut
2128              
2129             sub consensus_iupac {
2130 1     1 1 5 my $self = shift;
2131 1         2 my $out = "";
2132 1         5 my $len = $self->length - 1;
2133              
2134             # only DNA and RNA sequences are valid
2135 1         4 foreach my $seq ( $self->each_seq() ) {
2136 2 50       5 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137             if $seq->alphabet eq 'protein';
2138             }
2139              
2140             # loop over the alignment columns
2141 1         3 foreach my $count ( 0 .. $len ) {
2142 10         13 $out .= $self->_consensus_iupac($count);
2143             }
2144 1         6 return $out;
2145             }
2146              
2147             sub _consensus_iupac {
2148 10     10   7 my ($self, $column) = @_;
2149 10         5 my ($string, $char, $rna);
2150              
2151             #determine all residues in a column
2152 10         13 foreach my $seq ( $self->each_seq() ) {
2153 20         24 $string .= substr($seq->seq, $column, 1);
2154             }
2155 10         10 $string = uc $string;
2156              
2157             # quick exit if there's an N in the string
2158 10 100       15 if ($string =~ /N/) {
2159 1 50       6 $string =~ /\W/ ? return 'n' : return 'N';
2160             }
2161             # ... or if there are only gap characters
2162 9 100       18 return '-' if $string =~ /^\W+$/;
2163              
2164             # treat RNA as DNA in regexps
2165 7 50       10 if ($string =~ /U/) {
2166 0         0 $string =~ s/U/T/;
2167 0         0 $rna = 1;
2168             }
2169              
2170             # the following s///'s only need to be done to the _first_ ambiguity code
2171             # as we only need to see the _range_ of characters in $string
2172              
2173 7 50       12 if ($string =~ /[VDHB]/) {
2174 0         0 $string =~ s/V/AGC/;
2175 0         0 $string =~ s/D/AGT/;
2176 0         0 $string =~ s/H/ACT/;
2177 0         0 $string =~ s/B/CTG/;
2178             }
2179              
2180 7 100       11 if ($string =~ /[SKYRWM]/) {
2181 1         2 $string =~ s/S/GC/;
2182 1         2 $string =~ s/K/GT/;
2183 1         1 $string =~ s/Y/CT/;
2184 1         2 $string =~ s/R/AG/;
2185 1         2 $string =~ s/W/AT/;
2186 1         2 $string =~ s/M/AC/;
2187             }
2188              
2189             # and now the guts of the thing
2190              
2191 7 100       21 if ($string =~ /A/) {
    50          
    50          
    50          
2192 5         2 $char = 'A'; # A A
2193 5 50       18 if ($string =~ /G/) {
    50          
    100          
2194 0         0 $char = 'R'; # A and G (purines) R
2195 0 0       0 if ($string =~ /C/) {
    0          
2196 0         0 $char = 'V'; # A and G and C V
2197 0 0       0 if ($string =~ /T/) {
2198 0         0 $char = 'N'; # A and G and C and T N
2199             }
2200             } elsif ($string =~ /T/) {
2201 0         0 $char = 'D'; # A and G and T D
2202             }
2203             } elsif ($string =~ /C/) {
2204 0         0 $char = 'M'; # A and C M
2205 0 0       0 if ($string =~ /T/) {
2206 0         0 $char = 'H'; # A and C and T H
2207             }
2208             } elsif ($string =~ /T/) {
2209 2         2 $char = 'W'; # A and T W
2210             }
2211             } elsif ($string =~ /C/) {
2212 0         0 $char = 'C'; # C C
2213 0 0       0 if ($string =~ /T/) {
    0          
2214 0         0 $char = 'Y'; # C and T (pyrimidines) Y
2215 0 0       0 if ($string =~ /G/) {
2216 0         0 $char = 'B'; # C and T and G B
2217             }
2218             } elsif ($string =~ /G/) {
2219 0         0 $char = 'S'; # C and G S
2220             }
2221             } elsif ($string =~ /G/) {
2222 0         0 $char = 'G'; # G G
2223 0 0       0 if ($string =~ /C/) {
    0          
2224 0         0 $char = 'S'; # G and C S
2225             } elsif ($string =~ /T/) {
2226 0         0 $char = 'K'; # G and T K
2227             }
2228             } elsif ($string =~ /T/) {
2229 2         2 $char = 'T'; # T T
2230             }
2231              
2232 7 50 33     13 $char = 'U' if $rna and $char eq 'T';
2233 7 100       12 $char = lc $char if $string =~ /\W/;
2234              
2235 7         10 return $char;
2236             }
2237              
2238              
2239             =head2 consensus_meta
2240              
2241             Title : consensus_meta
2242             Usage : $seqmeta = $ali->consensus_meta()
2243             Function : Returns a Bio::Seq::Meta object containing the consensus
2244             strings derived from meta data analysis.
2245             Returns : Bio::Seq::Meta
2246             Argument : Bio::Seq::Meta
2247             Throws : non-MetaI object
2248              
2249             =cut
2250              
2251             sub consensus_meta {
2252 56     56 1 67 my ($self, $meta) = @_;
2253 56 50 33     239 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
      66        
2254 0         0 $self->throw('Not a Bio::Seq::MetaI object');
2255             }
2256 56 100       111 return $self->{'_aln_meta'} = $meta if $meta;
2257 39         56 return $self->{'_aln_meta'}
2258             }
2259              
2260             =head2 is_flush
2261              
2262             Title : is_flush
2263             Usage : if ( $ali->is_flush() )
2264             Function : Tells you whether the alignment
2265             : is flush, i.e. all of the same length
2266             Returns : 1 or 0
2267             Argument :
2268              
2269             =cut
2270              
2271             sub is_flush {
2272 54     54 1 66 my ( $self, $report ) = @_;
2273 54         49 my $seq;
2274 54         54 my $length = (-1);
2275 54         49 my $temp;
2276              
2277 54         107 foreach $seq ( $self->each_seq() ) {
2278 250 100       337 if ( $length == (-1) ) {
2279 54         129 $length = CORE::length( $seq->seq() );
2280 54         67 next;
2281             }
2282              
2283 196         241 $temp = CORE::length( $seq->seq() );
2284 196 50       307 if ( $temp != $length ) {
2285 0 0       0 $self->warn(
2286             "expecting $length not $temp from " . $seq->display_id )
2287             if ($report);
2288 0         0 $self->debug(
2289             "expecting $length not $temp from " . $seq->display_id );
2290 0         0 $self->debug( $seq->seq() . "\n" );
2291 0         0 return 0;
2292             }
2293             }
2294              
2295 54         147 return 1;
2296             }
2297              
2298              
2299             =head2 length
2300              
2301             Title : length()
2302             Usage : $len = $ali->length()
2303             Function : Returns the maximum length of the alignment.
2304             To be sure the alignment is a block, use is_flush
2305             Returns : Integer
2306             Argument :
2307              
2308             =cut
2309              
2310             sub length_aln {
2311 0     0 0 0 my $self = shift;
2312 0         0 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2313 0         0 $self->length(@_);
2314             }
2315              
2316             sub length {
2317 2217     2217 1 1918 my $self = shift;
2318 2217         1431 my $seq;
2319 2217         1374 my $length = -1;
2320 2217         1415 my $temp;
2321            
2322 2217         2484 foreach $seq ( $self->each_seq() ) {
2323 8526         9547 $temp = $seq->length();
2324 8526 100       10420 if( $temp > $length ) {
2325 2227         1675 $length = $temp;
2326             }
2327             }
2328              
2329 2217         4047 return $length;
2330             }
2331              
2332              
2333             =head2 maxdisplayname_length
2334              
2335             Title : maxdisplayname_length
2336             Usage : $ali->maxdisplayname_length()
2337             Function : Gets the maximum length of the displayname in the
2338             alignment. Used in writing out various MSA formats.
2339             Returns : integer
2340             Argument :
2341              
2342             =cut
2343              
2344             sub maxname_length {
2345 0     0 1 0 my $self = shift;
2346 0         0 $self->deprecated("maxname_length - deprecated method.".
2347             " Use maxdisplayname_length() instead.");
2348 0         0 $self->maxdisplayname_length();
2349             }
2350              
2351             sub maxnse_length {
2352 0     0 0 0 my $self = shift;
2353 0         0 $self->deprecated("maxnse_length - deprecated method.".
2354             " Use maxnse_length() instead.");
2355 0         0 $self->maxdisplayname_length();
2356             }
2357              
2358             sub maxdisplayname_length {
2359 21     21 1 29 my $self = shift;
2360 21         27 my $maxname = (-1);
2361 21         23 my ( $seq, $len );
2362              
2363 21         42 foreach $seq ( $self->each_seq() ) {
2364 122         174 $len = CORE::length $self->displayname( $seq->get_nse() );
2365              
2366 122 100       187 if ( $len > $maxname ) {
2367 52         61 $maxname = $len;
2368             }
2369             }
2370              
2371 21         46 return $maxname;
2372             }
2373              
2374             =head2 max_metaname_length
2375              
2376             Title : max_metaname_length
2377             Usage : $ali->max_metaname_length()
2378             Function : Gets the maximum length of the meta name tags in the
2379             alignment for the sequences and for the alignment.
2380             Used in writing out various MSA formats.
2381             Returns : integer
2382             Argument : None
2383              
2384             =cut
2385              
2386             sub max_metaname_length {
2387 2     2 1 2 my $self = shift;
2388 2         3 my $maxname = (-1);
2389 2         3 my ($seq,$len);
2390            
2391             # check seq meta first
2392 2         5 for $seq ( $self->each_seq() ) {
2393 17 100       35 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2394 11         15 for my $mtag ($seq->meta_names) {
2395 0         0 $len = CORE::length $mtag;
2396 0 0       0 if( $len > $maxname ) {
2397 0         0 $maxname = $len;
2398             }
2399             }
2400             }
2401            
2402             # alignment meta
2403 2         6 for my $meta ($self->consensus_meta) {
2404 2 100       6 next unless $meta;
2405 1         2 for my $name ($meta->meta_names) {
2406 1         1 $len = CORE::length $name;
2407 1 50       3 if( $len > $maxname ) {
2408 1         2 $maxname = $len;
2409             }
2410             }
2411             }
2412              
2413 2         6 return $maxname;
2414             }
2415              
2416             =head2 num_residues
2417              
2418             Title : num_residues
2419             Usage : $no = $ali->num_residues
2420             Function : number of residues in total in the alignment
2421             Returns : integer
2422             Argument :
2423             Note : replaces no_residues()
2424              
2425             =cut
2426              
2427             sub num_residues {
2428 2     2 1 5 my $self = shift;
2429 2         3 my $count = 0;
2430              
2431 2         6 foreach my $seq ( $self->each_seq ) {
2432 19         41 my $str = $seq->seq();
2433              
2434 19         787 $count += ( $str =~ s/[A-Za-z]//g );
2435             }
2436              
2437 2         6 return $count;
2438             }
2439              
2440             =head2 num_sequences
2441              
2442             Title : num_sequences
2443             Usage : $depth = $ali->num_sequences
2444             Function : number of sequence in the sequence alignment
2445             Returns : integer
2446             Argument : none
2447             Note : replaces no_sequences()
2448              
2449             =cut
2450              
2451             sub num_sequences {
2452 2553     2553 1 2960 my $self = shift;
2453 2553         2736 return scalar($self->each_seq);
2454             }
2455              
2456             =head2 average_percentage_identity
2457              
2458             Title : average_percentage_identity
2459             Usage : $id = $align->average_percentage_identity
2460             Function: The function uses a fast method to calculate the average
2461             percentage identity of the alignment
2462             Returns : The average percentage identity of the alignment
2463             Args : None
2464             Notes : This method implemented by Kevin Howe calculates a figure that is
2465             designed to be similar to the average pairwise identity of the
2466             alignment (identical in the absence of gaps), without having to
2467             explicitly calculate pairwise identities proposed by Richard Durbin.
2468             Validated by Ewan Birney ad Alex Bateman.
2469              
2470             =cut
2471              
2472             sub average_percentage_identity{
2473 9     9 1 14 my ($self,@args) = @_;
2474              
2475 9         45 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476             'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2477              
2478 9         9 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2479              
2480 9 50       24 if (! $self->is_flush()) {
2481 0         0 $self->throw("All sequences in the alignment must be the same length");
2482             }
2483              
2484 9         19 @seqs = $self->each_seq();
2485 9         20 $len = $self->length();
2486              
2487             # load the each hash with correct keys for existence checks
2488              
2489 9         26 for( my $index=0; $index < $len; $index++) {
2490 2096         1455 foreach my $letter (@alphabet) {
2491 54496         41826 $countHashes[$index]->{$letter} = 0;
2492             }
2493             }
2494 9         17 foreach my $seq (@seqs) {
2495 32         81 my @seqChars = split //, $seq->seq();
2496 32         72 for( my $column=0; $column < @seqChars; $column++ ) {
2497 7580         4511 my $char = uc($seqChars[$column]);
2498 7580 100       8518 if (exists $countHashes[$column]->{$char}) {
2499 7077         8883 $countHashes[$column]->{$char}++;
2500             }
2501             }
2502             }
2503              
2504 9         15 $total = 0;
2505 9         11 $divisor = 0;
2506 9         31 for(my $column =0; $column < $len; $column++) {
2507 2096         1145 my %hash = %{$countHashes[$column]};
  2096         8751  
2508 2096         2447 $subdivisor = 0;
2509 2096         4246 foreach my $res (keys %hash) {
2510 54496         36561 $total += $hash{$res}*($hash{$res} - 1);
2511 54496         36182 $subdivisor += $hash{$res};
2512             }
2513 2096         5751 $divisor += $subdivisor * ($subdivisor - 1);
2514             }
2515 9 50       1888 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2516             }
2517              
2518             =head2 percentage_identity
2519              
2520             Title : percentage_identity
2521             Usage : $id = $align->percentage_identity
2522             Function: The function calculates the average percentage identity
2523             (aliased to average_percentage_identity)
2524             Returns : The average percentage identity
2525             Args : None
2526              
2527             =cut
2528              
2529             sub percentage_identity {
2530 5     5 1 8 my $self = shift;
2531 5         14 return $self->average_percentage_identity();
2532             }
2533              
2534             =head2 overall_percentage_identity
2535              
2536             Title : overall_percentage_identity
2537             Usage : $id = $align->overall_percentage_identity
2538             $id = $align->overall_percentage_identity('short')
2539             Function: The function calculates the percentage identity of
2540             the conserved columns
2541             Returns : The percentage identity of the conserved columns
2542             Args : length value to use, optional defaults to alignment length
2543             possible values: 'align', 'short', 'long'
2544              
2545             The argument values 'short' and 'long' refer to shortest and longest
2546             sequence in the alignment. Method modification code by Hongyu Zhang.
2547              
2548             =cut
2549              
2550             sub overall_percentage_identity{
2551 18     18 1 36 my ($self, $length_measure) = @_;
2552              
2553 18         43 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
  468         492  
2554              
2555 18         47 my %enum = map {$_ => undef} qw (align short long);
  54         73  
2556              
2557             $self->throw("Unknown argument [$length_measure]")
2558 18 50 66     59 if $length_measure and not exists $enum{$length_measure};
2559 18   100     57 $length_measure ||= 'align';
2560              
2561 18 50       39 if (! $self->is_flush()) {
2562 0         0 $self->throw("All sequences in the alignment must be the same length");
2563             }
2564              
2565             # Count the residues seen at each position
2566 18         19 my $len;
2567 18         20 my $total = 0; # number of positions with identical residues
2568 18         17 my @countHashes;
2569 18         27 my @seqs = $self->each_seq;
2570 18         22 my $nof_seqs = scalar @seqs;
2571 18         43 my $aln_len = $self->length();
2572 18         27 for my $seq (@seqs) {
2573 92         186 my $seqstr = $seq->seq;
2574              
2575             # Count residues for given sequence
2576 92         158 for my $column (0 .. $aln_len-1) {
2577 17812         12067 my $char = uc( substr($seqstr, $column, 1) );
2578 17812 100       19106 if ( exists $alphabet{$char} ) {
2579              
2580             # This is a valid char
2581 17043 100       15394 if ( defined $countHashes[$column]->{$char} ) {
2582 12763         8168 $countHashes[$column]->{$char}++;
2583             } else {
2584 4280         3604 $countHashes[$column]->{$char} = 1;
2585             }
2586              
2587 17043 100       20443 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588             # All sequences have this same residue
2589 819         599 $total++;
2590             }
2591              
2592             }
2593             }
2594              
2595             # Sequence length
2596 92 100 100     340 if ($length_measure eq 'short' || $length_measure eq 'long') {
2597 32         40 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598 32 100       48 if ($length_measure eq 'short') {
    50          
2599 16 100 100     60 if ( (not defined $len) || ($seq_len < $len) ) {
2600 5         7 $len = $seq_len;
2601             }
2602             } elsif ($length_measure eq 'long') {
2603 16 100 100     47 if ( (not defined $len) || ($seq_len > $len) ) {
2604 2         4 $len = $seq_len;
2605             }
2606             }
2607             }
2608              
2609             }
2610              
2611 18 100       36 if ($length_measure eq 'align') {
2612 16         23 $len = $aln_len;
2613             }
2614              
2615 18         580 return ($total / $len ) * 100.0;
2616             }
2617              
2618              
2619              
2620             =head1 Alignment positions
2621              
2622             Methods to map a sequence position into an alignment column and back.
2623             column_from_residue_number() does the former. The latter is really a
2624             property of the sequence object and can done using
2625             L:
2626              
2627             # select somehow a sequence from the alignment, e.g.
2628             my $seq = $aln->get_seq_by_pos(1);
2629             #$loc is undef or Bio::LocationI object
2630             my $loc = $seq->location_from_column(5);
2631              
2632             =head2 column_from_residue_number
2633              
2634             Title : column_from_residue_number
2635             Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636             Function: This function gives the position in the alignment
2637             (i.e. column number) of the given residue number in the
2638             sequence with the given name. For example, for the
2639             alignment
2640              
2641             Seq1/91-97 AC..DEF.GH.
2642             Seq2/24-30 ACGG.RTY...
2643             Seq3/43-51 AC.DDEF.GHI
2644              
2645             column_from_residue_number( "Seq1", 94 ) returns 6.
2646             column_from_residue_number( "Seq2", 25 ) returns 2.
2647             column_from_residue_number( "Seq3", 50 ) returns 10.
2648              
2649             An exception is thrown if the residue number would lie
2650             outside the length of the aligment
2651             (e.g. column_from_residue_number( "Seq2", 22 )
2652              
2653             Note: If the the parent sequence is represented by more than
2654             one alignment sequence and the residue number is present in
2655             them, this method finds only the first one.
2656              
2657             Returns : A column number for the position in the alignment of the
2658             given residue in the given sequence (1 = first column)
2659             Args : A sequence id/name (not a name/start-end)
2660             A residue number in the whole sequence (not just that
2661             segment of it in the alignment)
2662              
2663             =cut
2664              
2665             sub column_from_residue_number {
2666 27     27 1 494 my ( $self, $name, $resnumber ) = @_;
2667              
2668             $self->throw("No sequence with name [$name]")
2669 27 50       51 unless $self->{'_start_end_lists'}->{$name};
2670 27 50       38 $self->throw("Second argument residue number missing") unless $resnumber;
2671              
2672 27         46 foreach my $seq ( $self->each_seq_with_id($name) ) {
2673 27         19 my $col;
2674 27         26 eval { $col = $seq->column_from_residue_number($resnumber); };
  27         49  
2675 27 50       41 next if $@;
2676 27         50 return $col;
2677             }
2678              
2679 0         0 $self->throw( "Could not find a sequence segment in $name "
2680             . "containing residue number $resnumber" );
2681              
2682             }
2683              
2684             =head1 Sequence names
2685              
2686             Methods to manipulate the display name. The default name based on the
2687             sequence id and subsequence positions can be overridden in various
2688             ways.
2689              
2690             =head2 displayname
2691              
2692             Title : displayname
2693             Usage : $myalign->displayname("Ig", "IgA")
2694             Function : Gets/sets the display name of a sequence in the alignment
2695             Returns : A display name string
2696             Argument : name of the sequence
2697             displayname of the sequence (optional)
2698              
2699             =cut
2700              
2701             sub displayname {
2702 550     550 1 494 my ( $self, $name, $disname ) = @_;
2703              
2704             $self->throw("No sequence with name [$name]")
2705 550 50       831 unless defined $self->{'_seq'}->{$name};
2706              
2707 550 100 66     1230 if ( $disname and $name ) {
    100          
2708 112         142 $self->{'_dis_name'}->{$name} = $disname;
2709 112         139 return $disname;
2710             }
2711             elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712 48         148 return $self->{'_dis_name'}->{$name};
2713             }
2714             else {
2715 390         623 return $name;
2716             }
2717             }
2718              
2719             sub get_displayname {
2720 0     0 0 0 my $self = shift;
2721 0         0 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722 0         0 $self->displayname(@_);
2723             }
2724              
2725             sub set_displayname {
2726 0     0 0 0 my $self = shift;
2727 0         0 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728 0         0 $self->displayname(@_);
2729             }
2730              
2731              
2732             =head2 set_displayname_count
2733              
2734             Title : set_displayname_count
2735             Usage : $ali->set_displayname_count
2736             Function : Sets the names to be name_# where # is the number of
2737             times this name has been used.
2738             Returns : 1, on success
2739             Argument :
2740              
2741             =cut
2742              
2743             sub set_displayname_count {
2744 1     1 1 3 my $self= shift;
2745 1         2 my (@arr,$name,$seq,$count,$temp,$nse);
2746              
2747 1         5 foreach $seq ( $self->each_alphabetically() ) {
2748 16         25 $nse = $seq->get_nse();
2749              
2750             #name will be set when this is the second
2751             #time (or greater) is has been seen
2752              
2753 16 50 66     36 if( defined $name and $name eq ($seq->id()) ) {
2754 0         0 $temp = sprintf("%s_%s",$name,$count);
2755 0         0 $self->displayname($nse,$temp);
2756 0         0 $count++;
2757             } else {
2758 16         14 $count = 1;
2759 16         20 $name = $seq->id();
2760 16         33 $temp = sprintf("%s_%s",$name,$count);
2761 16         21 $self->displayname($nse,$temp);
2762 16         19 $count++;
2763             }
2764             }
2765 1         6 return 1;
2766             }
2767              
2768             =head2 set_displayname_flat
2769              
2770             Title : set_displayname_flat
2771             Usage : $ali->set_displayname_flat()
2772             Function : Makes all the sequences be displayed as just their name,
2773             not name/start-end (NSE)
2774             Returns : 1
2775             Argument :
2776              
2777             =cut
2778              
2779             sub set_displayname_flat {
2780 11     11 1 19 my $self = shift;
2781 11         16 my ( $nse, $seq );
2782              
2783 11         25 foreach $seq ( $self->each_seq() ) {
2784 79         121 $nse = $seq->get_nse();
2785 79         111 $self->displayname( $nse, $seq->id() );
2786             }
2787 11         27 return 1;
2788             }
2789              
2790              
2791             =head2 set_displayname_normal
2792              
2793             Title : set_displayname_normal
2794             Usage : $ali->set_displayname_normal()
2795             Function : Makes all the sequences be displayed as name/start-end (NSE)
2796             Returns : 1, on success
2797             Argument :
2798              
2799             =cut
2800              
2801             sub set_displayname_normal {
2802 1     1 1 3 my $self = shift;
2803 1         2 my ( $nse, $seq );
2804              
2805 1         4 foreach $seq ( $self->each_seq() ) {
2806 16         25 $nse = $seq->get_nse();
2807 16         19 $self->displayname( $nse, $nse );
2808             }
2809 1         5 return 1;
2810             }
2811              
2812             =head2 source
2813              
2814             Title : source
2815             Usage : $obj->source($newval)
2816             Function: sets the Alignment source program
2817             Example :
2818             Returns : value of source
2819             Args : newvalue (optional)
2820              
2821             =cut
2822              
2823             sub source {
2824 123     123 1 598 my ( $self, $value ) = @_;
2825 123 100       331 if ( defined $value ) {
2826 111         204 $self->{'_source'} = $value;
2827             }
2828 123         182 return $self->{'_source'};
2829             }
2830              
2831              
2832             =head2 set_displayname_safe
2833              
2834             Title : set_displayname_safe
2835             Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836             Function : Assign machine-generated serial names to sequences in input order.
2837             Designed to protect names during PHYLIP runs. Assign 10-char string
2838             in the form of "S000000001" to "S999999999". Restore the original
2839             names using "restore_displayname".
2840             Returns : 1. a new $aln with system names;
2841             2. a hash ref for restoring names
2842             Argument : Number for id length (default 10)
2843              
2844             =cut
2845              
2846             sub set_displayname_safe {
2847 1     1 1 3 my $self = shift;
2848 1   50     8 my $idlength = shift || 10;
2849 1         2 my ( $seq, %phylip_name );
2850 1         3 my $ct = 0;
2851 1         6 my $new = Bio::SimpleAlign->new();
2852 1         5 foreach $seq ( $self->each_seq() ) {
2853 14         17 $ct++;
2854 14         74 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855 14         45 $phylip_name{$pname} = $seq->id();
2856             my $new_seq = Bio::LocatableSeq->new(
2857             -id => $pname,
2858             -seq => $seq->seq(),
2859             -alphabet => $seq->alphabet,
2860             -start => $seq->{_start},
2861             -end => $seq->{_end}
2862 14         49 );
2863 14         62 $new->add_seq($new_seq);
2864             }
2865              
2866             $self->debug(
2867 1         8 "$ct seq names changed. Restore names by using restore_displayname.");
2868 1         4 return ( $new, \%phylip_name );
2869             }
2870              
2871              
2872             =head2 restore_displayname
2873              
2874             Title : restore_displayname
2875             Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876             Function : Restore original sequence names (after running
2877             $ali->set_displayname_safe)
2878             Returns : a new $aln with names restored.
2879             Argument : a hash reference of names from "set_displayname_safe".
2880              
2881             =cut
2882              
2883             sub restore_displayname {
2884 1     1 1 3 my $self = shift;
2885 1         2 my $ref=shift;
2886 1         13 my %name=%$ref;
2887 1         5 my $new=Bio::SimpleAlign->new();
2888 1         4 foreach my $seq ( $self->each_seq() ) {
2889 14 50       47 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2890             my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891             -seq => $seq->seq(),
2892             -alphabet => $seq->alphabet,
2893             -start => $seq->{_start},
2894             -end => $seq->{_end}
2895 14         41 );
2896 14         60 $new->add_seq($new_seq);
2897             }
2898 1         9 return $new;
2899             }
2900              
2901             =head2 sort_by_start
2902              
2903             Title : sort_by_start
2904             Usage : $ali->sort_by_start
2905             Function : Changes the order of the alignment to the start position of each
2906             subalignment
2907             Returns : 1 on success
2908             Argument :
2909              
2910             =cut
2911              
2912             sub sort_by_start {
2913 1     1 1 3 my $self = shift;
2914 1         3 my ($seq,$nse,@arr,%hash,$count);
2915 1         3 foreach $seq ( $self->each_seq() ) {
2916 3         10 $nse = $seq->get_nse;
2917 3         9 $hash{$nse} = $seq;
2918             }
2919 1         4 $count = 0;
2920 1         3 %{$self->{'_order'}} = (); # reset the hash;
  1         5  
2921 1         8 foreach $nse ( sort _startend keys %hash) {
2922 3         7 $self->{'_order'}->{$count} = $nse;
2923 3         5 $count++;
2924             }
2925 1         5 1;
2926             }
2927              
2928             sub _startend {
2929 3     3   9 my ($aname,$arange) = split (/[\/]/,$a);
2930 3         18 my ($bname,$brange) = split (/[\/]/,$b);
2931 3         6 my ($astart,$aend) = split(/\-/,$arange);
2932 3         7 my ($bstart,$bend) = split(/\-/,$brange);
2933 3         9 return $astart <=> $bstart;
2934             }
2935              
2936             =head2 bracket_string
2937              
2938             Title : bracket_string
2939             Usage : my @params = (-refseq => 'testseq',
2940             -allele1 => 'allele1',
2941             -allele2 => 'allele2',
2942             -delimiters => '{}',
2943             -separator => '/');
2944             $str = $aln->bracket_string(@params)
2945              
2946             Function : When supplied with a list of parameters (see below), returns a
2947             string in BIC format. This is used for allelic comparisons.
2948             Briefly, if either allele contains a base change when compared to
2949             the refseq, the base or gap for each allele is represented in
2950             brackets in the order present in the 'alleles' parameter.
2951              
2952             For the following data:
2953              
2954             >testseq
2955             GGATCCATTGCTACT
2956             >allele1
2957             GGATCCATTCCTACT
2958             >allele2
2959             GGAT--ATTCCTCCT
2960              
2961             the returned string with parameters 'refseq => testseq' and
2962             'alleles => [qw(allele1 allele2)]' would be:
2963              
2964             GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965             Returns : BIC-formatted string
2966             Argument : Required args
2967             refseq : string (ID) of the reference sequence used
2968             as basis for comparison
2969             allele1 : string (ID) of the first allele
2970             allele2 : string (ID) of the second allele
2971             Optional args
2972             delimiters: two symbol string of left and right delimiters.
2973             Only the first two symbols are used
2974             default = '[]'
2975             separator : string used as a separator. Only the first
2976             symbol is used
2977             default = '/'
2978             Throws : On no refseq/alleles, or invalid refseq/alleles.
2979              
2980             =cut
2981              
2982             sub bracket_string {
2983 14     14 1 5497 my ($self, @args) = @_;
2984 14         60 my ($ref, $a1, $a2, $delim, $sep) =
2985             $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986 14 50 33     76 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
      33        
2987 14         13 my ($ld, $rd);
2988 14 100       32 ($ld, $rd) = split('', $delim, 2) if $delim;
2989 14   100     38 $ld ||= '[';
2990 14   100     32 $rd ||= ']';
2991 14   100     27 $sep ||= '/';
2992             my ($refseq, $allele1, $allele2) =
2993 14         18 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
  42         64  
2994 14 50 33     65 if (!$refseq || !$allele1 || !$allele2) {
      33        
2995 0         0 $self->throw("One of your refseq/allele IDs is invalid!");
2996             }
2997 14         21 my $len = $self->length-1;
2998 14         17 my $bic = '';
2999             # loop over the alignment columns
3000 14         24 for my $column ( 0 .. $len ) {
3001 210         130 my $string;
3002             my ($compres, $res1, $res2) =
3003 210         172 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
  630         762  
3004             # are any of the allele symbols different from the refseq?
3005 210 100 100     654 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006             $ld.$res1.$sep.$res2.$rd;
3007 210         222 $bic .= $string;
3008             }
3009 14         44 return $bic;
3010             }
3011              
3012              
3013             =head2 methods implementing Bio::FeatureHolderI
3014              
3015             FeatureHolderI implementation to support labeled character sets like one
3016             would get from NEXUS represented data.
3017              
3018             =head2 get_SeqFeatures
3019              
3020             Usage : @features = $aln->get_SeqFeatures
3021             Function: Get the feature objects held by this feature holder.
3022             Example :
3023             Returns : an array of Bio::SeqFeatureI implementing objects
3024             Args : optional filter coderef, taking a Bio::SeqFeatureI
3025             : as argument, returning TRUE if wanted, FALSE if
3026             : unwanted
3027              
3028             =cut
3029              
3030             sub get_SeqFeatures {
3031 4     4 1 10 my $self = shift;
3032 4         5 my $filter_cb = shift;
3033 4 50 33     17 $self->throw("Arg (filter callback) must be a coderef")
3034             unless !defined($filter_cb)
3035             or ref($filter_cb) eq 'CODE';
3036 4 100       12 if ( !defined $self->{'_as_feat'} ) {
3037 2         4 $self->{'_as_feat'} = [];
3038             }
3039 4 50       9 if ($filter_cb) {
3040 0         0 return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
  0         0  
  0         0  
3041             }
3042 4         4 return @{ $self->{'_as_feat'} };
  4         12  
3043             }
3044              
3045              
3046             =head2 add_SeqFeature
3047              
3048             Usage : $aln->add_SeqFeature($subfeat);
3049             Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050             (see L) is supported, but has no effect.
3051             Example :
3052             Returns : 1 on success
3053             Args : a Bio::SeqFeatureI object
3054              
3055             =cut
3056              
3057             sub add_SeqFeature {
3058 9     9 1 10 my ($self, @feat) = @_;
3059              
3060 9 100       20 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3061              
3062 9 50       12 if (scalar @feat > 1) {
3063 0         0 $self->deprecated(
3064             -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065             ' is deprecated and will be removed in a future version. '.
3066             'Add a single feature at a time instead.',
3067             -warn_version => 1.007,
3068             -throw_version => 1.009,
3069             );
3070             }
3071              
3072 9         11 for my $feat ( @feat ) {
3073              
3074 9 50       17 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075              
3076 9 50       33 if( !$feat->isa("Bio::SeqFeatureI") ) {
3077 0         0 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078             }
3079              
3080 9         11 push @{$self->{'_as_feat'}}, $feat;
  9         15  
3081             }
3082 9         9 return 1;
3083             }
3084              
3085              
3086             =head2 remove_SeqFeatures
3087              
3088             Usage : $obj->remove_SeqFeatures
3089             Function: Removes all SeqFeatures. If you want to remove only a subset,
3090             remove that subset from the returned array, and add back the rest.
3091             Returns : The array of Bio::SeqFeatureI features that was
3092             deleted from this alignment.
3093             Args : none
3094              
3095             =cut
3096              
3097             sub remove_SeqFeatures {
3098 0     0 1 0 my $self = shift;
3099              
3100 0 0       0 return () unless $self->{'_as_feat'};
3101 0         0 my @feats = @{$self->{'_as_feat'}};
  0         0  
3102 0         0 $self->{'_as_feat'} = [];
3103 0         0 return @feats;
3104             }
3105              
3106             =head2 feature_count
3107              
3108             Title : feature_count
3109             Usage : $obj->feature_count()
3110             Function: Return the number of SeqFeatures attached to the alignment
3111             Returns : integer representing the number of SeqFeatures
3112             Args : None
3113              
3114             =cut
3115              
3116             sub feature_count {
3117 2     2 1 3 my ($self) = @_;
3118              
3119 2 50       6 if (defined($self->{'_as_feat'})) {
3120 2         1 return ($#{$self->{'_as_feat'}} + 1);
  2         10  
3121             } else {
3122 0         0 return 0;
3123             }
3124             }
3125              
3126             =head2 get_all_SeqFeatures
3127              
3128             Title : get_all_SeqFeatures
3129             Usage :
3130             Function: Get all SeqFeatures.
3131             Example :
3132             Returns : an array of Bio::SeqFeatureI implementing objects
3133             Args : none
3134             Note : Falls through to Bio::FeatureHolderI implementation.
3135              
3136             =cut
3137              
3138             =head2 methods for Bio::AnnotatableI
3139              
3140             AnnotatableI implementation to support sequence alignments which
3141             contain annotation (NEXUS, Stockholm).
3142              
3143             =head2 annotation
3144              
3145             Title : annotation
3146             Usage : $ann = $aln->annotation or
3147             $aln->annotation($ann)
3148             Function: Gets or sets the annotation
3149             Returns : Bio::AnnotationCollectionI object
3150             Args : None or Bio::AnnotationCollectionI object
3151              
3152             See L and L
3153             for more information
3154              
3155             =cut
3156              
3157             sub annotation {
3158 87     87 1 88 my ($obj,$value) = @_;
3159 87 100       185 if( defined $value ) {
    100          
3160 46 50       178 $obj->throw("object of class ".ref($value)." does not implement ".
3161             "Bio::AnnotationCollectionI. Too bad.")
3162             unless $value->isa("Bio::AnnotationCollectionI");
3163 46         59 $obj->{'_annotation'} = $value;
3164             } elsif( ! defined $obj->{'_annotation'}) {
3165 21         89 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166             }
3167 87         131 return $obj->{'_annotation'};
3168             }
3169              
3170             =head1 Deprecated methods
3171              
3172             =cut
3173              
3174             =head2 no_residues
3175              
3176             Title : no_residues
3177             Usage : $no = $ali->no_residues
3178             Function : number of residues in total in the alignment
3179             Returns : integer
3180             Argument :
3181             Note : deprecated in favor of num_residues()
3182              
3183             =cut
3184              
3185             sub no_residues {
3186 0     0 1 0 my $self = shift;
3187 0         0 $self->deprecated(-warn_version => 1.0069,
3188             -throw_version => 1.0075,
3189             -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190 0         0 $self->num_residues(@_);
3191             }
3192              
3193             =head2 no_sequences
3194              
3195             Title : no_sequences
3196             Usage : $depth = $ali->no_sequences
3197             Function : number of sequence in the sequence alignment
3198             Returns : integer
3199             Argument :
3200             Note : deprecated in favor of num_sequences()
3201              
3202             =cut
3203              
3204             sub no_sequences {
3205 0     0 1 0 my $self = shift;
3206 0         0 $self->deprecated(-warn_version => 1.0069,
3207             -throw_version => 1.0075,
3208             -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209 0         0 $self->num_sequences(@_);
3210             }
3211              
3212             =head2 mask_columns
3213              
3214             Title : mask_columns
3215             Usage : $aln2 = $aln->mask_columns(20,30)
3216             Function : Masks a slice of the alignment inclusive of start and
3217             end columns, and the first column in the alignment is denoted 1.
3218             Mask beyond the length of the sequence does not do padding.
3219             Returns : A Bio::SimpleAlign object
3220             Args : Positive integer for start column, positive integer for end column,
3221             optional string value use for the mask. Example:
3222              
3223             $aln2 = $aln->mask_columns(20,30,'?')
3224             Note : Masking must use a character that is not used for gaps or
3225             frameshifts. These can be adjusted using the relevant global
3226             variables, but be aware these may be (uncontrollably) modified
3227             elsewhere within BioPerl (see bug 2715)
3228              
3229             =cut
3230              
3231             sub mask_columns {
3232             #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233 6     6 1 1065 my $self = shift;
3234              
3235 6         14 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236             $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3237            
3238             # coordinates are alignment-based, not sequence-based
3239 6         9 my ($start, $end, $mask_char) = @_;
3240 6 50       18 unless (defined $mask_char) { $mask_char = 'N' }
  0         0  
3241              
3242 6 50 33     59 $self->throw("Mask start has to be a positive integer and less than ".
      33        
3243             "alignment length, not [$start]")
3244             unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245 6 50 33     46 $self->throw("Mask end has to be a positive integer and less than ".
      33        
3246             "alignment length, not [$end]")
3247             unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248 6 50       11 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3249             "end [$end]") unless $start <= $end;
3250 6 50 33     46 $self->throw("Mask character $mask_char has to be a single character ".
3251             "and not a gap or frameshift symbol")
3252             unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3253            
3254 6         14 my $aln = $self->new;
3255 6         18 $aln->id($self->id);
3256 6         12 foreach my $seq ( $self->each_seq() ) {
3257 21         47 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258             -alphabet => $seq->alphabet,
3259             -strand => $seq->strand,
3260             -verbose => $self->verbose);
3261            
3262             # convert from 1-based alignment coords!
3263 21         49 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264 21         133 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3265 21         37 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266 21         29 $new_seq->seq($new_dna_string);
3267 21         39 $aln->add_seq($new_seq);
3268             }
3269             # Preserve chosen mask character, it may be need later (like in 'slice')
3270 6         12 $aln->{_mask_char} = $mask_char;
3271 6         14 return $aln;
3272             }
3273              
3274             1;