File Coverage

Bio/SimpleAlign.pm
Criterion Covered Total %
statement 1013 1175 86.2
branch 315 456 69.0
condition 135 218 61.9
subroutine 80 96 83.3
pod 70 78 89.7
total 1613 2023 79.7


line stmt bran cond sub pod time code
1             package Bio::SimpleAlign;
2 48     48   1758 use strict;
  48         53  
  48         1200  
3 48     48   162 use warnings;
  48         48  
  48         1102  
4 48     48   3002 use Bio::LocatableSeq; # uses Seq's as list
  48         59  
  48         785  
5 48     48   4791 use Bio::Seq;
  48         76  
  48         816  
6 48     48   11448 use Bio::SeqFeature::Generic;
  48         77  
  48         1373  
7              
8 48     48   220 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
  48         60  
  48         320  
9              
10             # BioPerl module for SimpleAlign
11             #
12             # Please direct questions and support issues to
13             #
14             # Cared for by Heikki Lehvaslaiho
15             #
16             # Copyright Ewan Birney
17             #
18             # You may distribute this module under the same terms as perl itself
19              
20             # POD documentation - main docs before the code
21             #
22             # History:
23             # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
24             # May 2001 major rewrite - Heikki Lehvaslaiho
25              
26             =head1 NAME
27              
28             Bio::SimpleAlign - Multiple alignments held as a set of sequences
29              
30             =head1 SYNOPSIS
31              
32             # Use Bio::AlignIO to read in the alignment
33             $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
34             $aln = $str->next_aln();
35              
36             # Describe
37             print $aln->length;
38             print $aln->num_residues;
39             print $aln->is_flush;
40             print $aln->num_sequences;
41             print $aln->score;
42             print $aln->percentage_identity;
43             print $aln->consensus_string(50);
44              
45             # Find the position in the alignment for a sequence location
46             $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
47              
48             # Extract sequences and check values for the alignment column $pos
49             foreach $seq ($aln->each_seq) {
50             $res = $seq->subseq($pos, $pos);
51             $count{$res}++;
52             }
53             foreach $res (keys %count) {
54             printf "Res: %s Count: %2d\n", $res, $count{$res};
55             }
56              
57             # Manipulate
58             $aln->remove_seq($seq);
59             $mini_aln = $aln->slice(20,30); # get a block of columns
60             $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
61             $new_aln = $aln->remove_columns([20,30]); # remove by position
62             $new_aln = $aln->remove_columns(['mismatch']); # remove by property
63              
64             # Analyze
65             $str = $aln->consensus_string($threshold_percent);
66             $str = $aln->match_line();
67             $str = $aln->cigar_line();
68             $id = $aln->percentage_identity;
69              
70             # See the module documentation for details and more methods.
71              
72             =head1 DESCRIPTION
73              
74             SimpleAlign is an object that handles a multiple sequence alignment
75             (MSA). It is very permissive of types (it does not insist on sequences
76             being all same length, for example). Think of it as a set of sequences
77             with a whole series of built-in manipulations and methods for reading and
78             writing alignments.
79              
80             SimpleAlign uses L, a subclass of L,
81             to store its sequences. These are subsequences with a start and end
82             positions in the parent reference sequence. Each sequence in the
83             SimpleAlign object is a Bio::LocatableSeq.
84              
85             SimpleAlign expects the combination of name, start, and end for a
86             given sequence to be unique in the alignment, and this is the key for the
87             internal hashes (name, start, end are abbreviated C in the code).
88             However, in some cases people do not want the name/start-end to be displayed:
89             either multiple names in an alignment or names specific to the alignment
90             (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
91             C, and generally is what is used to print out the
92             alignment. They default to name/start-end.
93              
94             The SimpleAlign Module is derived from the Align module by Ewan Birney.
95              
96             =head1 FEEDBACK
97              
98             =head2 Mailing Lists
99              
100             User feedback is an integral part of the evolution of this and other
101             Bioperl modules. Send your comments and suggestions preferably to one
102             of the Bioperl mailing lists. Your participation is much appreciated.
103              
104             bioperl-l@bioperl.org - General discussion
105             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
106              
107             =head2 Support
108              
109             Please direct usage questions or support issues to the mailing list:
110              
111             I
112              
113             rather than to the module maintainer directly. Many experienced and
114             reponsive experts will be able look at the problem and quickly
115             address it. Please include a thorough description of the problem
116             with code and data examples if at all possible.
117              
118             =head2 Reporting Bugs
119              
120             Report bugs to the Bioperl bug tracking system to help us keep track
121             the bugs and their resolution. Bug reports can be submitted via the
122             web:
123              
124             https://github.com/bioperl/bioperl-live/issues
125              
126             =head1 AUTHOR
127              
128             Ewan Birney, birney@ebi.ac.uk
129              
130             =head1 CONTRIBUTORS
131              
132             Allen Day, allenday-at-ucla.edu,
133             Richard Adams, Richard.Adams-at-ed.ac.uk,
134             David J. Evans, David.Evans-at-vir.gla.ac.uk,
135             Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
136             Allen Smith, allens-at-cpan.org,
137             Jason Stajich, jason-at-bioperl.org,
138             Anthony Underwood, aunderwood-at-phls.org.uk,
139             Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
140             Brian Osborne, bosborne at alum.mit.edu
141             Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
142             Hongyu Zhang, forward at hongyu.org
143             Jay Hannah, jay at jays.net
144             Alexandr Bezginov, albezg at gmail.com
145              
146             =head1 SEE ALSO
147              
148             L
149              
150             =head1 APPENDIX
151              
152             The rest of the documentation details each of the object
153             methods. Internal methods are usually preceded with a _
154              
155             =cut
156              
157             ## This data should probably be in a more centralized module...
158             ## it is taken from Clustalw documentation.
159             ## These are all the positively scoring groups that occur in the
160             ## Gonnet Pam250 matrix. The strong and weak groups are
161             ## defined as strong score >0.5 and weak score =<0.5 respectively.
162             our %CONSERVATION_GROUPS = (
163             'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
164             'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
165             );
166              
167              
168             =head2 new
169              
170             Title : new
171             Usage : my $aln = Bio::SimpleAlign->new();
172             Function : Creates a new simple align object
173             Returns : Bio::SimpleAlign
174             Args : -source => string representing the source program
175             where this alignment came from
176             -annotation => Bio::AnnotationCollectionI
177             -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
178             -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
179             -consensus => consensus string
180             -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
181              
182             =cut
183              
184              
185             sub new {
186 291     291 1 521 my($class,@args) = @_;
187              
188 291         897 my $self = $class->SUPER::new(@args);
189              
190 291         1347 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
191             SOURCE
192             SCORE
193             ID
194             ACCESSION
195             DESCRIPTION
196             SEQS
197             FEATURES
198             ANNOTATION
199             SEQ_ANNOTATION
200             CONSENSUS
201             CONSENSUS_META
202             )], @args);
203 291 100       1058 $src && $self->source($src);
204 291 100       509 defined $score && $self->score($score);
205             # we need to set up internal hashs first!
206              
207 291         712 $self->{'_seq'} = {};
208 291         399 $self->{'_order'} = {};
209 291         367 $self->{'_start_end_lists'} = {};
210 291         378 $self->{'_dis_name'} = {};
211 291         355 $self->{'_id'} = 'NoName';
212             # maybe we should automatically read in from args. Hmmm...
213 291 100       503 $id && $self->id($id);
214 291 100       504 $acc && $self->accession($acc);
215 291 100       450 $desc && $self->description($desc);
216 291 100       494 $coll && $self->annotation($coll);
217             # sequence annotation is layered into a provided annotation collection (or dies)
218 291 50       446 if ($sa) {
219 0 0       0 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
220             "with a sequence annotation collection")
221             if !$coll;
222 0         0 $coll->add_Annotation('seq_annotation', $sa);
223             }
224 291 100 66     753 if ($feats && ref $feats eq 'ARRAY') {
225 1         3 for my $feat (@$feats) {
226 6         7 $self->add_SeqFeature($feat);
227             }
228             }
229 291 50       761 $con && $self->consensus($con);
230 291 100       502 $cmeta && $self->consensus_meta($cmeta);
231             # assumes these are in correct alignment order
232 291 100 66     768 if ($seqs && ref($seqs) eq 'ARRAY') {
233 9         16 for my $seq (@$seqs) {
234 168         204 $self->add_seq($seq);
235             }
236             }
237              
238 291         573 return $self; # success - we hope!
239             }
240              
241             =head1 Modifier methods
242              
243             These methods modify the MSA by adding, removing or shuffling complete
244             sequences.
245              
246             =head2 add_seq
247              
248             Title : add_seq
249             Usage : $myalign->add_seq($newseq);
250             $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
251             Function : Adds another sequence to the alignment. *Does not* align
252             it - just adds it to the hashes.
253             If -ORDER is specified, the sequence is inserted at the
254             the position spec'd by -ORDER, and existing sequences
255             are pushed down the storage array.
256             Returns : nothing
257             Args : A Bio::LocatableSeq object
258             Positive integer for the sequence position (optional)
259              
260             See L for more information
261              
262             =cut
263              
264             sub addSeq {
265 0     0 0 0 my $self = shift;
266 0         0 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
267 0         0 $self->add_seq(@_);
268             }
269              
270             sub add_seq {
271 2420     2420 1 2327 my $self = shift;
272 2420         2578 my @args = @_;
273 2420         5464 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
274 2420         2491 my ($name,$id,$start,$end);
275              
276 2420 50       3340 unless ($seq) {
277 0         0 $self->throw("LocatableSeq argument required");
278             }
279 2420 50 33     8399 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
280 0         0 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
281             }
282 2420 100       3362 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
  2417         3722  
283 2420         1953 $order--; # jay's patch (user-specified order is 1-origin)
284            
285 2420 100       3387 if ($order < 0) {
286 1         4 $self->throw("User-specified value for ORDER must be >= 1");
287             }
288              
289 2419   66     3381 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
290              
291             # build the symbol list for this sequence,
292             # will prune out the gap and missing/match chars
293             # when actually asked for the symbol list in the
294             # symbol_chars
295             # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296              
297 2419         3935 $name = $seq->get_nse;
298              
299 2419 50       3561 if( $self->{'_seq'}->{$name} ) {
300 0 0       0 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
301             }
302             else {
303 2419         6395 $self->debug( "Assigning $name to $order\n");
304              
305 2419         2514 my $ordh = $self->{'_order'};
306 2419 50       3566 if ($ordh->{$order}) {
307             # make space to insert
308             # $c->() returns (in reverse order) the first subsequence
309             # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
310             # (3,2,1), and $c->(2,4,5) returns (2).
311 0         0 my $c;
312 0 0   0   0 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
  0         0  
313             map {
314 0         0 $ordh->{$_+1} = $ordh->{$_}
315 0         0 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
  0         0  
  0         0  
  0         0  
316              
317             }
318 2419         3349 $ordh->{$order} = $name;
319              
320 2419 100       3784 unless( exists( $self->{'_start_end_lists'}->{$id})) {
321 2400         3829 $self->{'_start_end_lists'}->{$id} = [];
322             }
323 2419         1924 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
  2419         3952  
324             }
325              
326 2419         6698 $self->{'_seq'}->{$name} = $seq;
327              
328             }
329              
330              
331             =head2 remove_seq
332              
333             Title : remove_seq
334             Usage : $aln->remove_seq($seq);
335             Function : Removes a single sequence from an alignment
336             Returns :
337             Argument : a Bio::LocatableSeq object
338              
339             =cut
340              
341             sub removeSeq {
342 0     0 0 0 my $self = shift;
343 0         0 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
344 0         0 $self->remove_seq(@_);
345             }
346              
347             sub remove_seq {
348 13     13 1 9 my $self = shift;
349 13         9 my $seq = shift;
350 13         9 my ($name,$id);
351              
352 13 50 33     63 $self->throw("Need Bio::Locatable seq argument ")
353             unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
354              
355 13         21 $id = $seq->id();
356 13         22 $name = $seq->get_nse;
357              
358 13 50       17 if( !exists $self->{'_seq'}->{$name} ) {
359 0         0 $self->throw("Sequence $name does not exist in the alignment to remove!");
360             }
361              
362 13         14 delete $self->{'_seq'}->{$name};
363              
364             # we need to remove this seq from the start_end_lists hash
365              
366 13 50       18 if (exists $self->{'_start_end_lists'}->{$id}) {
367             # we need to find the sequence in the array.
368              
369 13         11 my ($i, $found);;
370 13         14 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
  13         21  
371 13 50       10 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
  13         31  
372 13         9 $found = 1;
373 13         12 last;
374             }
375             }
376 13 50       15 if ($found) {
377 13         11 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
  13         18  
378             }
379             else {
380 0         0 $self->throw("Could not find the sequence to remoce from the start-end list");
381             }
382             }
383             else {
384 0         0 $self->throw("There is no seq list for the name $id");
385             }
386             # we need to shift order hash
387 13         10 my %rev_order = reverse %{$self->{'_order'}};
  13         60  
388 13         16 my $no = $rev_order{$name};
389 13         19 my $num_sequences = $self->num_sequences;
390 13         20 for (; $no < $num_sequences; $no++) {
391 94         141 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
392             }
393 13         14 delete $self->{'_order'}->{$no};
394 13         28 return 1;
395             }
396              
397              
398             =head2 purge
399              
400             Title : purge
401             Usage : $aln->purge(0.7);
402             Function: Removes sequences above given sequence similarity
403             This function will grind on large alignments. Beware!
404             Example :
405             Returns : An array of the removed sequences
406             Args : float, threshold for similarity
407              
408             =cut
409              
410             sub purge {
411 1     1 1 2 my ($self,$perc) = @_;
412 1         1 my (%duplicate, @dups);
413              
414 1         2 my @seqs = $self->each_seq();
415              
416 1         4 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
417 15         13 my $seq = $seqs[$i];
418              
419             #skip if already in duplicate hash
420 15 100       21 next if exists $duplicate{$seq->display_id} ;
421 4         8 my $one = $seq->seq();
422              
423 4         65 my @one = split '', $one; #split to get 1aa per array element
424              
425 4         7 for (my $j=$i+1;$j < @seqs;$j++) {
426 28         22 my $seq2 = $seqs[$j];
427              
428             #skip if already in duplicate hash
429 28 100       55 next if exists $duplicate{$seq2->display_id} ;
430              
431 27         43 my $two = $seq2->seq();
432 27         386 my @two = split '', $two;
433              
434 27         22 my $count = 0;
435 27         21 my $res = 0;
436 27         37 for (my $k=0;$k<@one;$k++) {
437 6534 100 66     29896 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
      66        
      100        
438             $one[$k] eq $two[$k]) {
439 4329         2717 $count++;
440             }
441 6534 100 66     37370 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
      66        
      66        
      66        
442             $two[$k] ne '.' && $two[$k] ne '-' ) {
443 6320         7629 $res++;
444             }
445             }
446              
447 27         24 my $ratio = 0;
448 27 50       44 $ratio = $count/$res unless $res == 0;
449              
450             # if above threshold put in duplicate hash and push onto
451             # duplicate array for returning to get_unique
452 27 100       200 if ( $ratio > $perc ) {
453 12 50       37 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
454 12         23 $duplicate{$seq2->display_id} = 1;
455 12         137 push @dups, $seq2;
456             }
457             }
458             }
459 1         3 foreach my $seq (@dups) {
460 12         15 $self->remove_seq($seq);
461             }
462 1         11 return @dups;
463             }
464              
465             =head2 sort_alphabetically
466              
467             Title : sort_alphabetically
468             Usage : $ali->sort_alphabetically
469             Function : Changes the order of the alignment to alphabetical on name
470             followed by numerical by number.
471             Returns :
472             Argument :
473              
474             =cut
475              
476             sub sort_alphabetically {
477 1     1 1 3 my $self = shift;
478 1         1 my ($seq,$nse,@arr,%hash,$count);
479              
480 1         2 foreach $seq ( $self->each_seq() ) {
481 3         5 $nse = $seq->get_nse;
482 3         4 $hash{$nse} = $seq;
483             }
484              
485 1         2 $count = 0;
486              
487 1         1 %{$self->{'_order'}} = (); # reset the hash;
  1         3  
488              
489 1         5 foreach $nse ( sort _alpha_startend keys %hash) {
490 3         3 $self->{'_order'}->{$count} = $nse;
491              
492 3         4 $count++;
493             }
494 1         5 1;
495             }
496              
497             =head2 sort_by_list
498              
499             Title : sort_by_list
500             Usage : $aln_ordered=$aln->sort_by_list($list_file)
501             Function : Arbitrarily order sequences in an alignment
502             Returns : A new Bio::SimpleAlign object
503             Argument : a file listing sequence names in intended order (one name per line)
504              
505             =cut
506              
507             sub sort_by_list {
508 1     1 1 2 my ($self, $list) = @_;
509 1         1 my (@seq, @ids, %order);
510              
511 1         2 foreach my $seq ( $self->each_seq() ) {
512 6         5 push @seq, $seq;
513 6         8 push @ids, $seq->display_id;
514             }
515              
516 1         2 my $ct=1;
517 1 50       30 open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
518 1         18 while (<$listfh>) {
519 6         5 chomp;
520 6         7 my $name=$_;
521 6 50       6 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
522 6         13 $order{$name}=$ct++;
523             }
524 1         5 close($listfh);
525            
526             # use the map-sort-map idiom:
527 1         3 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
  6         6  
  7         6  
  6         7  
528 1         4 my $aln = $self->new;
529 1         3 foreach (@sorted) { $aln->add_seq($_) }
  6         8  
530 1         6 return $aln;
531             }
532              
533             =head2 set_new_reference
534              
535             Title : set_new_reference
536             Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
537             the sequence whoes name is "B31" (full, exact, and case-sensitive),
538             as the reference (1st) sequence
539             Function : Change/Set a new reference (i.e., the first) sequence
540             Returns : a new Bio::SimpleAlign object.
541             Throws an exception if designated sequence not found
542             Argument : a positive integer of sequence order, or a sequence name
543             in the original alignment
544              
545             =cut
546              
547             sub set_new_reference {
548 2     2 1 3 my ($self, $seqid) = @_;
549 2         3 my $aln = $self->new;
550 2         1 my (@seq, @ids, @new_seq);
551 2         3 my $is_num=0;
552 2         2 foreach my $seq ( $self->each_seq() ) {
553 12         9 push @seq, $seq;
554 12         14 push @ids, $seq->display_id;
555             }
556              
557 2 100       9 if ($seqid =~ /^\d+$/) { # argument is seq position
558 1         1 $is_num=1;
559 1 50 33     5 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
560             } else { # argument is a seq name
561 1 50       4 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
562             }
563              
564 2         5 for (my $i=0; $i<=$#seq; $i++) {
565 12         9 my $pos=$i+1;
566 12 100 100     34 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
      100        
567 2         4 unshift @new_seq, $seq[$i];
568             } else {
569 10         17 push @new_seq, $seq[$i];
570             }
571             }
572 2         3 foreach (@new_seq) { $aln->add_seq($_); }
  12         15  
573 2         6 return $aln;
574             }
575              
576             sub _in_aln { # check if input name exists in the alignment
577 7     7   6 my ($str, $ref) = @_;
578 7         8 foreach (@$ref) {
579 24 100       38 return 1 if $str eq $_;
580             }
581 0         0 return 0;
582             }
583              
584              
585             =head2 uniq_seq
586              
587             Title : uniq_seq
588             Usage : $aln->uniq_seq(): Remove identical sequences in
589             in the alignment. Ambiguous base ("N", "n") and
590             leading and ending gaps ("-") are NOT counted as
591             differences.
592             Function : Make a new alignment of unique sequence types (STs)
593             Returns : 1a. if called in a scalar context,
594             a new Bio::SimpleAlign object (all sequences renamed as "ST")
595             1b. if called in an array context,
596             a new Bio::SimpleAlign object, and a hashref whose keys
597             are sequence types, and whose values are arrayrefs to
598             lists of sequence ids within the corresponding sequence type
599             2. if $aln->verbose > 0, ST of each sequence is sent to
600             STDERR (in a tabular format)
601             Argument : None
602              
603             =cut
604              
605             sub uniq_seq {
606 1     1 1 1 my ($self, $seqid) = @_;
607 1         3 my $aln = $self->new;
608 1         1 my (%member, %order, @seq, @uniq_str, $st);
609 1         1 my $order=0;
610 1         3 my $len = $self->length();
611 1         1 $st = {};
612 1         3 foreach my $seq ( $self->each_seq() ) {
613 15         23 my $str = $seq->seq();
614              
615             # it's necessary to ignore "n", "N", leading gaps and ending gaps in
616             # comparing two sequence strings
617              
618             # 1st, convert "n", "N" to "?" (for DNA sequence only):
619 15 50       41 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
620             # 2nd, convert leading and ending gaps to "?":
621 15         25 $str = &_convert_leading_ending_gaps($str, '-', '?');
622             # Note that '?' also can mean unknown residue.
623             # I don't like making global class member changes like this, too
624             # prone to errors... -- cjfields 08-11-18
625 15         11 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
626 15         28 my $new = Bio::LocatableSeq->new(
627             -id => $seq->id(),
628             -alphabet=> $seq->alphabet,
629             -seq => $str,
630             -start => $seq->start,
631             -end => $seq->end
632             );
633 15         31 push @seq, $new;
634             }
635              
636 1         2 foreach my $seq (@seq) {
637 15         33 my $str = $seq->seq();
638 15         24 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
639 15 100       24 if ($seen) { # seen before
640 4         3 my @memb = @{$member{$key}};
  4         12  
641 4         4 push @memb, $seq;
642 4         13 $member{$key} = \@memb;
643             } else { # not seen
644 11         16 push @uniq_str, $key;
645 11         9 $order++;
646 11         57 $member{$key} = [ ($seq) ];
647 11         29 $order{$key} = $order;
648             }
649             }
650              
651 1         8 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
  24         20  
652             # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
653 11         16 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
654             # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
655 11 50       37 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
656 11         8 my $gap='-';
657 11         9 my $end= CORE::length($str2);
658 11         332 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
659 11         46 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
660             -seq =>$str2,
661             -start=>1,
662             -end =>$end
663             );
664 11         24 $aln->add_seq($new);
665 11         8 foreach (@{$member{$str}}) {
  11         27  
666 15         13 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
  15         41  
667 15         20 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
668             }
669             }
670 1 50       11 return wantarray ? ($aln, $st) : $aln;
671             }
672              
673             sub _check_uniq { # check if same seq exists in the alignment
674 15     15   9 my ($str1, $ref, $length) = @_;
675 15         359 my @char1=split //, $str1;
676 15         22 my @array=@$ref;
677              
678 15 100       35 return (0, $str1) if @array==0; # not seen (1st sequence)
679              
680 14         15 foreach my $str2 (@array) {
681 59         44 my $diff=0;
682 59         1369 my @char2=split //, $str2;
683 59         78 for (my $i=0; $i<=$length-1; $i++) {
684 24249 100       25454 next if $char1[$i] eq '?';
685 22382 100       22373 next if $char2[$i] eq '?';
686 21740 100       35222 $diff++ if $char1[$i] ne $char2[$i];
687             }
688 59 100       899 return (1, $str2) if $diff == 0; # seen before
689             }
690              
691 10         163 return (0, $str1); # not seen
692             }
693              
694             sub _convert_leading_ending_gaps {
695 26     26   37 my $s=shift;
696 26         19 my $sym1=shift;
697 26         17 my $sym2=shift;
698 26         661 my @array=split //, $s;
699             # convert leading char:
700 26         45 for (my $i=0; $i<=$#array; $i++) {
701 369 100       535 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
702             }
703             # convert ending char:
704 26         47 for (my $i = $#array; $i>= 0; $i--) {
705 552 100       762 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
706             }
707 26         248 my $s_new=join '', @array;
708 26         369 return $s_new;
709             }
710              
711             =head1 Sequence selection methods
712              
713             Methods returning one or more sequences objects.
714              
715             =head2 each_seq
716              
717             Title : each_seq
718             Usage : foreach $seq ( $align->each_seq() )
719             Function : Gets a Seq object from the alignment
720             Returns : Seq object
721             Argument :
722              
723             =cut
724              
725             sub eachSeq {
726 0     0 0 0 my $self = shift;
727 0         0 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
728 0         0 $self->each_seq();
729             }
730              
731             sub each_seq {
732 7620     7620 1 5635 my $self = shift;
733 7620         4610 my (@arr,$order);
734              
735 7620         4783 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
  120939         76245  
  7620         17004  
736 56090 100       72807 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
737 56077         55677 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
738             }
739             }
740 7620         15516 return @arr;
741             }
742              
743              
744             =head2 each_alphabetically
745              
746             Title : each_alphabetically
747             Usage : foreach $seq ( $ali->each_alphabetically() )
748             Function : Returns a sequence object, but the objects are returned
749             in alphabetically sorted order.
750             Does not change the order of the alignment.
751             Returns : Seq object
752             Argument :
753              
754             =cut
755              
756             sub each_alphabetically {
757 2     2 1 4 my $self = shift;
758 2         2 my ($seq,$nse,@arr,%hash,$count);
759              
760 2         4 foreach $seq ( $self->each_seq() ) {
761 32         39 $nse = $seq->get_nse;
762 32         40 $hash{$nse} = $seq;
763             }
764              
765 2         12 foreach $nse ( sort _alpha_startend keys %hash) {
766 32         25 push(@arr,$hash{$nse});
767             }
768 2         11 return @arr;
769             }
770              
771             sub _alpha_startend {
772 95     95   54 my ($aname,$astart,$bname,$bstart);
773 95         86 ($aname,$astart) = split (/-/,$a);
774 95         89 ($bname,$bstart) = split (/-/,$b);
775              
776 95 50       90 if( $aname eq $bname ) {
777 0         0 return $astart <=> $bstart;
778             }
779             else {
780 95         74 return $aname cmp $bname;
781             }
782             }
783              
784             =head2 each_seq_with_id
785              
786             Title : each_seq_with_id
787             Usage : foreach $seq ( $align->each_seq_with_id() )
788             Function : Gets a Seq objects from the alignment, the contents
789             being those sequences with the given name (there may be
790             more than one)
791             Returns : Seq object
792             Argument : a seq name
793              
794             =cut
795              
796             sub eachSeqWithId {
797 0     0 0 0 my $self = shift;
798 0         0 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
799 0         0 $self->each_seq_with_id(@_);
800             }
801              
802             sub each_seq_with_id {
803 89     89 1 73 my $self = shift;
804 89         74 my $id = shift;
805              
806 89 50       128 $self->throw("Method each_seq_with_id needs a sequence name argument")
807             unless defined $id;
808              
809 89         62 my (@arr, $seq);
810              
811 89 50       140 if (exists($self->{'_start_end_lists'}->{$id})) {
812 89         66 @arr = @{$self->{'_start_end_lists'}->{$id}};
  89         133  
813             }
814 89         132 return @arr;
815             }
816              
817             =head2 get_seq_by_pos
818              
819             Title : get_seq_by_pos
820             Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
821             Function : Gets a sequence based on its position in the alignment.
822             Numbering starts from 1. Sequence positions larger than
823             num_sequences() will throw an error.
824             Returns : a Bio::LocatableSeq object
825             Args : positive integer for the sequence position
826              
827             =cut
828              
829             sub get_seq_by_pos {
830              
831 258     258 1 10718 my $self = shift;
832 258         328 my ($pos) = @_;
833              
834 258 50 33     1525 $self->throw("Sequence position has to be a positive integer, not [$pos]")
835             unless $pos =~ /^\d+$/ and $pos > 0;
836 258 50       403 $self->throw("No sequence at position [$pos]")
837             unless $pos <= $self->num_sequences ;
838              
839 258         459 my $nse = $self->{'_order'}->{--$pos};
840 258         667 return $self->{'_seq'}->{$nse};
841             }
842              
843             =head2 get_seq_by_id
844              
845             Title : get_seq_by_id
846             Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
847             Function : Gets a sequence based on its name.
848             Sequences that do not exist will warn and return undef
849             Returns : a Bio::LocatableSeq object
850             Args : string for sequence name
851              
852             =cut
853              
854             sub get_seq_by_id {
855 18     18 1 2336 my ($self,$name) = @_;
856 18 50       37 unless( defined $name ) {
857 0         0 $self->warn("Must provide a sequence name");
858 0         0 return;
859             }
860 18         18 for my $seq ( values %{$self->{'_seq'}} ) {
  18         39  
861 46 100       68 if ( $seq->id eq $name) {
862 18         31 return $seq;
863             }
864             }
865 0         0 return;
866             }
867              
868             =head2 seq_with_features
869              
870             Title : seq_with_features
871             Usage : $seq = $aln->seq_with_features(-pos => 1,
872             -consensus => 60
873             -mask =>
874             sub { my $consensus = shift;
875              
876             for my $i (1..5){
877             my $n = 'N' x $i;
878             my $q = '\?' x $i;
879             while($consensus =~ /[^?]$q[^?]/){
880             $consensus =~ s/([^?])$q([^?])/$1$n$2/;
881             }
882             }
883             return $consensus;
884             }
885             );
886             Function: produces a Bio::Seq object by first splicing gaps from -pos
887             (by means of a splice_by_seq_pos() call), then creating
888             features using non-? chars (by means of a consensus_string()
889             call with stringency -consensus).
890             Returns : a Bio::Seq object
891             Args : -pos : required. sequence from which to build the Bio::Seq
892             object
893             -consensus : optional, defaults to consensus_string()'s
894             default cutoff value
895             -mask : optional, a coderef to apply to consensus_string()'s
896             output before building features. this may be useful for
897             closing gaps of 1 bp by masking over them with N, for
898             instance
899              
900             =cut
901              
902             sub seq_with_features{
903 0     0 1 0 my ($self,%arg) = @_;
904              
905             #first do the preparatory splice
906 0 0       0 $self->throw("must provide a -pos argument") unless $arg{-pos};
907 0         0 $self->splice_by_seq_pos($arg{-pos});
908              
909 0         0 my $consensus_string = $self->consensus_string($arg{-consensus});
910             $consensus_string = $arg{-mask}->($consensus_string)
911 0 0       0 if defined($arg{-mask});
912              
913 0         0 my(@bs,@es);
914              
915 0 0       0 push @bs, 1 if $consensus_string =~ /^[^?]/;
916              
917 0         0 while($consensus_string =~ /\?[^?]/g){
918 0         0 push @bs, pos($consensus_string);
919             }
920 0         0 while($consensus_string =~ /[^?]\?/g){
921 0         0 push @es, pos($consensus_string);
922             }
923              
924 0 0       0 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
925              
926 0         0 my $seq = Bio::Seq->new();
927              
928             # my $rootfeature = Bio::SeqFeature::Generic->new(
929             # -source_tag => 'location',
930             # -start => $self->get_seq_by_pos($arg{-pos})->start,
931             # -end => $self->get_seq_by_pos($arg{-pos})->end,
932             # );
933             # $seq->add_SeqFeature($rootfeature);
934              
935 0         0 while(my $b = shift @bs){
936 0         0 my $e = shift @es;
937             $seq->add_SeqFeature(
938             Bio::SeqFeature::Generic->new(
939             -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
940 0   0     0 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
941             -source_tag => $self->source || 'MSA',
942             )
943             );
944             }
945              
946 0         0 return $seq;
947             }
948              
949              
950             =head1 Create new alignments
951              
952             The result of these methods are horizontal or vertical subsets of the
953             current MSA.
954              
955             =head2 select
956              
957             Title : select
958             Usage : $aln2 = $aln->select(1, 3) # three first sequences
959             Function : Creates a new alignment from a continuous subset of
960             sequences. Numbering starts from 1. Sequence positions
961             larger than num_sequences() will throw an error.
962             Returns : a Bio::SimpleAlign object
963             Args : positive integer for the first sequence
964             positive integer for the last sequence to include (optional)
965              
966             =cut
967              
968             sub select {
969 1     1 1 2 my $self = shift;
970 1         1 my ($start, $end) = @_;
971              
972 1 50 33     10 $self->throw("Select start has to be a positive integer, not [$start]")
973             unless $start =~ /^\d+$/ and $start > 0;
974 1 50 33     7 $self->throw("Select end has to be a positive integer, not [$end]")
975             unless $end =~ /^\d+$/ and $end > 0;
976 1 50       2 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
977             unless $start <= $end;
978              
979 1         3 my $aln = $self->new;
980 1         3 foreach my $pos ($start .. $end) {
981 3         4 $aln->add_seq($self->get_seq_by_pos($pos));
982             }
983 1         4 $aln->id($self->id);
984             # fix for meta, sf, ann
985 1         2 return $aln;
986             }
987              
988             =head2 select_noncont
989              
990             Title : select_noncont
991             Usage : # 1st and 3rd sequences, sorted
992             $aln2 = $aln->select_noncont(1, 3)
993              
994             # 1st and 3rd sequences, sorted (same as first)
995             $aln2 = $aln->select_noncont(3, 1)
996              
997             # 1st and 3rd sequences, unsorted
998             $aln2 = $aln->select_noncont('nosort',3, 1)
999              
1000             Function : Creates a new alignment from a subset of sequences. Numbering
1001             starts from 1. Sequence positions larger than num_sequences() will
1002             throw an error. Sorts the order added to new alignment by default,
1003             to prevent sorting pass 'nosort' as the first argument in the list.
1004             Returns : a Bio::SimpleAlign object
1005             Args : array of integers for the sequences. If the string 'nosort' is
1006             passed as the first argument, the sequences will not be sorted
1007             in the new alignment but will appear in the order listed.
1008              
1009             =cut
1010              
1011             sub select_noncont {
1012 8     8 1 9 my $self = shift;
1013 8         9 my $nosort = 0;
1014 8         11 my (@pos) = @_;
1015 8 100       35 if ($pos[0] !~ m{^\d+$}) {
1016 1         1 my $sortcmd = shift @pos;
1017 1 50       4 if ($sortcmd eq 'nosort') {
1018 1         1 $nosort = 1;
1019             } else {
1020 0         0 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1021             }
1022             }
1023            
1024 8         16 my $end = $self->num_sequences;
1025 8         12 foreach ( @pos ) {
1026 32 50 33     165 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
      33        
1027             unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1028             }
1029            
1030 8 100       18 @pos = sort {$a <=> $b} @pos unless $nosort;
  25         20  
1031            
1032 8         15 my $aln = $self->new;
1033 8         11 foreach my $p (@pos) {
1034 32         39 $aln->add_seq($self->get_seq_by_pos($p));
1035             }
1036 8         15 $aln->id($self->id);
1037             # fix for meta, sf, ann
1038 8         16 return $aln;
1039             }
1040              
1041             =head2 select_noncont_by_name
1042              
1043             Title : select_noncont_by_name
1044             Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045             Function : Creates a new alignment from a subset of sequences which are
1046             selected by name (sequence ID).
1047             Returns : a Bio::SimpleAlign object
1048             Args : array of names (i.e., identifiers) for the sequences.
1049              
1050             =cut
1051              
1052             sub select_noncont_by_name {
1053 1     1 1 3 my ($self, @names) = @_;
1054            
1055 1         2 my $aln = $self->new;
1056 1         2 foreach my $name (@names) {
1057 3         6 $aln->add_seq($self->get_seq_by_id($name));
1058             }
1059 1         3 $aln->id($self->id);
1060              
1061 1         2 return $aln;
1062             }
1063              
1064             =head2 slice
1065              
1066             Title : slice
1067             Usage : $aln2 = $aln->slice(20,30)
1068             Function : Creates a slice from the alignment inclusive of start and
1069             end columns, and the first column in the alignment is denoted 1.
1070             Sequences with no residues in the slice are excluded from the
1071             new alignment and a warning is printed. Slice beyond the length of
1072             the sequence does not do padding.
1073             Returns : A Bio::SimpleAlign object
1074             Args : Positive integer for start column, positive integer for end column,
1075             optional boolean which if true will keep gap-only columns in the newly
1076             created slice. Example:
1077              
1078             $aln2 = $aln->slice(20,30,1)
1079              
1080             =cut
1081              
1082             sub slice {
1083 31     31 1 1964 my $self = shift;
1084 31         35 my ($start, $end, $keep_gap_only) = @_;
1085              
1086 31 50 33     187 $self->throw("Slice start has to be a positive integer, not [$start]")
1087             unless $start =~ /^\d+$/ and $start > 0;
1088 31 50 33     143 $self->throw("Slice end has to be a positive integer, not [$end]")
1089             unless $end =~ /^\d+$/ and $end > 0;
1090 31 50       44 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091             unless $start <= $end;
1092 31 100       55 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093             "[$start] is too big.") if $start > $self->length;
1094 30         50 my $cons_meta = $self->consensus_meta;
1095 30         50 my $aln = $self->new;
1096 30         57 $aln->id($self->id);
1097 30         49 foreach my $seq ( $self->each_seq() ) {
1098 107 50       568 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099             Bio::Seq::Meta->new
1100             (-id => $seq->id,
1101             -alphabet => $seq->alphabet,
1102             -strand => $seq->strand,
1103             -verbose => $self->verbose) :
1104             Bio::LocatableSeq->new
1105             (-id => $seq->id,
1106             -alphabet => $seq->alphabet,
1107             -strand => $seq->strand,
1108             -verbose => $self->verbose);
1109            
1110             # seq
1111 107         139 my $seq_end = $end;
1112 107 50       194 $seq_end = $seq->length if( $end > $seq->length );
1113              
1114 107         187 my $slice_seq = $seq->subseq($start, $seq_end);
1115 107         170 $new_seq->seq( $slice_seq );
1116              
1117             # Allowed extra characters in string
1118 107         88 my $allowed_chars = '';
1119 107 100       175 if (exists $self->{_mask_char}) {
1120 24         20 $allowed_chars = $self->{_mask_char};
1121 24         27 $allowed_chars = quotemeta $allowed_chars;
1122             }
1123 107         622 $slice_seq =~ s/[^\w$allowed_chars]//g;
1124              
1125 107 100       163 if ($start > 1) {
1126 52         88 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 52         297 $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128 52 100       90 if (!defined($seq->strand)) {
    100          
1129 38         58 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131 1         3 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133 13         21 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134             }
1135             } else {
1136 55 100 100     92 if ((defined $seq->strand)&&($seq->strand < 0)){
1137 2         5 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138             } else {
1139 53         82 $new_seq->start( $seq->start);
1140             }
1141             }
1142 107 50       462 if ($new_seq->isa('Bio::Seq::MetaI')) {
1143 0         0 for my $meta_name ($seq->meta_names) {
1144 0         0 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145             }
1146             }
1147 107         131 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148              
1149 107 100 66     167 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150 106         185 $aln->add_seq($new_seq);
1151             } else {
1152 1 50       3 if( $keep_gap_only ) {
1153 1         3 $aln->add_seq($new_seq);
1154             } else {
1155 0         0 my $nse = $seq->get_nse();
1156 0         0 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157             " Sequence excluded from the new alignment.");
1158             }
1159             }
1160             }
1161 30 50       54 if ($cons_meta) {
1162 0         0 my $new = Bio::Seq::Meta->new();
1163 0         0 for my $meta_name ($cons_meta->meta_names) {
1164 0         0 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165             }
1166 0         0 $aln->consensus_meta($new);
1167             }
1168 30         61 $aln->annotation($self->annotation);
1169             # fix for meta, sf, ann
1170 30         63 return $aln;
1171             }
1172              
1173             =head2 remove_columns
1174              
1175             Title : remove_columns
1176             Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177             $aln2 = $aln->remove_columns([0,0],[6,8])
1178             Function : Creates an aligment with columns removed corresponding to
1179             the specified type or by specifying the columns by number.
1180             Returns : Bio::SimpleAlign object
1181             Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182             'all_gaps_columns') or array ref where the referenced array
1183             contains a pair of integers that specify a range.
1184             The first column is 0
1185              
1186             =cut
1187              
1188             sub remove_columns {
1189 4     4 1 434 my ($self,@args) = @_;
1190 4 50       11 @args || $self->throw("Must supply column ranges or column types");
1191 4         4 my $aln;
1192              
1193 4 100       27 if ($args[0][0] =~ /^[a-z_]+$/i) {
    50          
1194 1         3 $aln = $self->_remove_columns_by_type($args[0]);
1195             } elsif ($args[0][0] =~ /^\d+$/) {
1196 3         10 $aln = $self->_remove_columns_by_num(\@args);
1197             } else {
1198 0         0 $self->throw("You must pass array references to remove_columns(), not @args");
1199             }
1200             # fix for meta, sf, ann
1201 4         8 $aln;
1202             }
1203              
1204              
1205             =head2 remove_gaps
1206              
1207             Title : remove_gaps
1208             Usage : $aln2 = $aln->remove_gaps
1209             Function : Creates an aligment with gaps removed
1210             Returns : a Bio::SimpleAlign object
1211             Args : a gap character(optional) if none specified taken
1212             from $self->gap_char,
1213             [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214             indicates that only all-gaps columns should be deleted
1215              
1216             Used from method L in most cases. Set gap character
1217             using L.
1218              
1219             =cut
1220              
1221             sub remove_gaps {
1222 6     6 1 11 my ($self,$gapchar,$all_gaps_columns) = @_;
1223 6         4 my $gap_line;
1224 6 100       12 if ($all_gaps_columns) {
1225 2         5 $gap_line = $self->all_gap_line($gapchar);
1226             } else {
1227 4         9 $gap_line = $self->gap_line($gapchar);
1228             }
1229 6         10 my $aln = $self->new;
1230              
1231 6         7 my @remove;
1232 6         4 my $length = 0;
1233 6   66     15 my $del_char = $gapchar || $self->gap_char;
1234             # Do the matching to get the segments to remove
1235 6         44 while ($gap_line =~ m/[$del_char]/g) {
1236 14         13 my $start = pos($gap_line)-1;
1237 14         26 $gap_line =~ m/\G[$del_char]+/gc;
1238 14         12 my $end = pos($gap_line)-1;
1239              
1240             #have to offset the start and end for subsequent removes
1241 14         9 $start-=$length;
1242 14         8 $end -=$length;
1243 14         12 $length += ($end-$start+1);
1244 14         38 push @remove, [$start,$end];
1245             }
1246              
1247             #remove the segments
1248 6 100       19 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249             # fix for meta, sf, ann
1250 6         18 return $aln;
1251             }
1252              
1253              
1254             sub _remove_col {
1255 9     9   11 my ($self,$aln,$remove) = @_;
1256 9         7 my @new;
1257            
1258 9         14 my $gap = $self->gap_char;
1259            
1260             # splice out the segments and create new seq
1261 9         15 foreach my $seq($self->each_seq){
1262 42         85 my $new_seq = Bio::LocatableSeq->new(
1263             -id => $seq->id,
1264             -alphabet=> $seq->alphabet,
1265             -strand => $seq->strand,
1266             -verbose => $self->verbose);
1267 42         80 my $sequence = $seq->seq;
1268 42         34 foreach my $pair(@{$remove}){
  42         54  
1269 658         448 my $start = $pair->[0];
1270 658         421 my $end = $pair->[1];
1271 658 50       701 $sequence = $seq->seq unless $sequence;
1272 658         451 my $orig = $sequence;
1273 658 100       831 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274 658 100       803 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275 658         505 $sequence = $head.$tail;
1276             # start
1277 658 100       746 unless (defined $new_seq->start) {
1278 42 100       46 if ($start == 0) {
1279 31         90 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280 31         54 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1281             }
1282             else {
1283 11         43 my $start_adjust = $orig =~ /^$gap+/;
1284 11 100       19 if ($start_adjust) {
1285 5         12 $start_adjust = $+[0] == $start;
1286             }
1287 11         18 $new_seq->start($seq->start + $start_adjust);
1288             }
1289             }
1290             # end
1291 658 100       678 if (($end + 1) >= CORE::length($orig)) {
1292 32         74 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293 32         44 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294             }
1295             else {
1296 626         682 $new_seq->end($seq->end);
1297             }
1298             }
1299            
1300 42 100       50 if ($new_seq->end < $new_seq->start) {
1301             # we removed all columns except for gaps: set to 0 to indicate no
1302             # sequence
1303 1         3 $new_seq->start(0);
1304 1         3 $new_seq->end(0);
1305             }
1306            
1307 42 50       92 $new_seq->seq($sequence) if $sequence;
1308 42         60 push @new, $new_seq;
1309             }
1310             # add the new seqs to the alignment
1311 9         13 foreach my $new(@new){
1312 42         50 $aln->add_seq($new);
1313             }
1314             # fix for meta, sf, ann
1315 9         17 return $aln;
1316             }
1317              
1318             sub _remove_columns_by_type {
1319 1     1   2 my ($self,$type) = @_;
1320 1         2 my $aln = $self->new;
1321 1         1 my @remove;
1322              
1323 1 50       2 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
  1         4  
  1         2  
1324 1 50       1 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
  1         3  
1325 1         5 my %matchchars = ( 'match' => '\*',
1326             'weak' => '\.',
1327             'strong' => ':',
1328             'mismatch' => ' ',
1329             'gaps' => '',
1330             'all_gaps_columns' => ''
1331             );
1332             # get the characters to delete against
1333 1         1 my $del_char;
1334 1         1 foreach my $type (@{$type}){
  1         2  
1335 1         2 $del_char.= $matchchars{$type};
1336             }
1337              
1338 1         1 my $length = 0;
1339 1         3 my $match_line = $self->match_line;
1340             # do the matching to get the segments to remove
1341 1 50       4 if($del_char){
1342 1         16 while($match_line =~ m/[$del_char]/g ){
1343 37         26 my $start = pos($match_line)-1;
1344 37         47 $match_line=~/\G[$del_char]+/gc;
1345 37         27 my $end = pos($match_line)-1;
1346              
1347             #have to offset the start and end for subsequent removes
1348 37         18 $start-=$length;
1349 37         24 $end -=$length;
1350 37         17 $length += ($end-$start+1);
1351 37         77 push @remove, [$start,$end];
1352             }
1353             }
1354              
1355             # remove the segments
1356 1 50       5 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357 1 50       3 $aln = $aln->remove_gaps() if $gap;
1358 1 50       11 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359             # fix for meta, sf, ann
1360 1         7 $aln;
1361             }
1362              
1363              
1364             sub _remove_columns_by_num {
1365 3     3   4 my ($self,$positions) = @_;
1366 3         5 my $aln = $self->new;
1367              
1368             # sort the positions
1369 3         9 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
  3         5  
1370            
1371 3         2 my @remove;
1372 3         4 my $length = 0;
1373 3         3 foreach my $pos (@{$positions}) {
  3         6  
1374 5         3 my ($start, $end) = @{$pos};
  5         6  
1375            
1376             #have to offset the start and end for subsequent removes
1377 5         6 $start-=$length;
1378 5         5 $end -=$length;
1379 5         3 $length += ($end-$start+1);
1380 5         219 push @remove, [$start,$end];
1381             }
1382              
1383             #remove the segments
1384 3 50       14 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385             # fix for meta, sf, ann
1386 3         6 $aln;
1387             }
1388              
1389              
1390             =head1 Change sequences within the MSA
1391              
1392             These methods affect characters in all sequences without changing the
1393             alignment.
1394              
1395             =head2 splice_by_seq_pos
1396              
1397             Title : splice_by_seq_pos
1398             Usage : $status = splice_by_seq_pos(1);
1399             Function: splices all aligned sequences where the specified sequence
1400             has gaps.
1401             Example :
1402             Returns : 1 on success
1403             Args : position of sequence to splice by
1404              
1405              
1406             =cut
1407              
1408             sub splice_by_seq_pos{
1409 0     0 1 0 my ($self,$pos) = @_;
1410              
1411 0         0 my $guide = $self->get_seq_by_pos($pos);
1412 0         0 my $guide_seq = $guide->seq;
1413              
1414 0         0 $guide_seq =~ s/\./\-/g;
1415              
1416 0         0 my @gaps = ();
1417 0         0 $pos = -1;
1418 0         0 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419 0         0 unshift @gaps, $pos;
1420 0         0 $pos++;
1421             }
1422              
1423 0         0 foreach my $seq ($self->each_seq){
1424 0         0 my @bases = split '', $seq->seq;
1425              
1426 0         0 splice(@bases, $_, 1) foreach @gaps;
1427 0         0 $seq->seq(join('', @bases));
1428             }
1429              
1430 0         0 1;
1431             }
1432              
1433             =head2 map_chars
1434              
1435             Title : map_chars
1436             Usage : $ali->map_chars('\.','-')
1437             Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438             characters.
1439              
1440             Note that the first argument is interpreted as a regexp
1441             so be careful and escape any wild card characters (e.g.
1442             do $ali->map_chars('\.','-') to replace periods with dashes.
1443             Returns : 1 on success
1444             Argument : A regexp and a string
1445              
1446             =cut
1447              
1448             sub map_chars {
1449 4     4 1 6 my $self = shift;
1450 4         8 my $from = shift;
1451 4         5 my $to = shift;
1452 4         5 my ( $seq, $temp );
1453              
1454 4 50 33     21 $self->throw("Need two arguments: a regexp and a string")
1455             unless defined $from and defined $to;
1456              
1457 4         9 foreach $seq ( $self->each_seq() ) {
1458 39         45 $temp = $seq->seq();
1459 39         144 $temp =~ s/$from/$to/g;
1460 39         48 $seq->seq($temp);
1461             }
1462 4         12 return 1;
1463             }
1464              
1465              
1466             =head2 uppercase
1467              
1468             Title : uppercase()
1469             Usage : $ali->uppercase()
1470             Function : Sets all the sequences to uppercase
1471             Returns : 1 on success
1472             Argument :
1473              
1474             =cut
1475              
1476             sub uppercase {
1477 1     1 1 3 my $self = shift;
1478 1         1 my $seq;
1479             my $temp;
1480              
1481 1         4 foreach $seq ( $self->each_seq() ) {
1482 16         21 $temp = $seq->seq();
1483 16         21 $temp =~ tr/[a-z]/[A-Z]/;
1484              
1485 16         21 $seq->seq($temp);
1486             }
1487 1         4 return 1;
1488             }
1489              
1490             =head2 cigar_line
1491              
1492             Title : cigar_line()
1493             Usage : %cigars = $align->cigar_line()
1494             Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495             Report) line for each sequence in the alignment. Examples are
1496             "1,60" or "5,10:12,58", where the numbers refer to conserved
1497             positions within the alignment. The keys of the hash are the
1498             NSEs (name/start/end) assigned to each sequence.
1499             Args : threshold (optional, defaults to 100)
1500             Returns : Hash of strings (cigar lines)
1501              
1502             =cut
1503              
1504             sub cigar_line {
1505 1     1 1 438 my $self = shift;
1506 1   50     6 my $thr=shift||100;
1507 1         2 my %cigars;
1508              
1509 1         3 my @consensus = split "",($self->consensus_string($thr));
1510 1         3 my $len = $self->length;
1511 1         3 my $gapchar = $self->gap_char;
1512              
1513             # create a precursor, something like (1,4,5,6,7,33,45),
1514             # where each number corresponds to a conserved position
1515 1         2 foreach my $seq ( $self->each_seq ) {
1516 4         7 my @seq = split "", uc ($seq->seq);
1517 4         5 my $pos = 1;
1518 4         9 for (my $x = 0 ; $x < $len ; $x++ ) {
1519 132 100       189 if ($seq[$x] eq $consensus[$x]) {
    100          
1520 28         15 push @{$cigars{$seq->get_nse}},$pos;
  28         31  
1521 28         48 $pos++;
1522             } elsif ($seq[$x] ne $gapchar) {
1523 84         97 $pos++;
1524             }
1525             }
1526             }
1527             # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528 1         3 for my $name (keys %cigars) {
1529 4         4 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
  4         4  
1530 4 50       2 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
  4         5  
  4         6  
1531 4         4 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
  4         4  
  4         2  
1532 4         4 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
  4         4  
1533 4 50       3 ${$cigars{$name}}[$#{$cigars{$name}}] );
  4         6  
  4         4  
1534 4         5 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
  32         39  
1535 28 100 100     14 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
  28         24  
  28         42  
1536 8         4 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
  8         20  
1537 4         3 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
  4         3  
  4         6  
1538             }
1539             }
1540             }
1541             # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542 1         3 for my $name (keys %cigars) {
1543 4         1 my @remove;
1544 4         4 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
  40         46  
1545 36 100 100     25 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
  36         21  
  36         55  
1546 12         8 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
  12         26  
1547 8         8 unshift @remove,$x;
1548             }
1549             }
1550 4         4 for my $pos (@remove) {
1551 8         4 splice @{$cigars{$name}}, $pos, 1;
  8         11  
1552             }
1553             }
1554             # join and punctuate
1555 1         2 for my $name (keys %cigars) {
1556 4         5 my ($start,$end,$str) = "";
1557 4         3 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
  20         30  
1558 16         17 $str .= ($start . "," . $end . ":");
1559             }
1560 4         10 $str =~ s/:$//;
1561 4         6 $cigars{$name} = $str;
1562             }
1563 1         7 %cigars;
1564             }
1565              
1566              
1567             =head2 match_line
1568              
1569             Title : match_line()
1570             Usage : $line = $align->match_line()
1571             Function : Generates a match line - much like consensus string
1572             except that a line indicating the '*' for a match.
1573             Args : (optional) Match line characters ('*' by default)
1574             (optional) Strong match char (':' by default)
1575             (optional) Weak match char ('.' by default)
1576             Returns : String
1577              
1578             =cut
1579              
1580             sub match_line {
1581 6     6 1 13 my ($self,$matchlinechar, $strong, $weak) = @_;
1582 6   50     73 my %matchchars = ('match' => $matchlinechar || '*',
      50        
      50        
1583             'weak' => $weak || '.',
1584             'strong' => $strong || ':',
1585             'mismatch' => ' ',
1586             );
1587              
1588 6         8 my @seqchars;
1589             my $alphabet;
1590 6         15 foreach my $seq ( $self->each_seq ) {
1591 69         112 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592 69 100       1106 $alphabet = $seq->alphabet unless defined $alphabet;
1593             }
1594 6         15 my $refseq = shift @seqchars;
1595             # let's just march down the columns
1596 6         7 my $matchline;
1597             POS:
1598 6         20 foreach my $pos ( 0..$self->length ) {
1599 3138         2447 my $refchar = $refseq->[$pos];
1600 3138         2072 my $char = $matchchars{'mismatch'};
1601 3138 100       3490 unless( defined $refchar ) {
1602 6 50       28 last if $pos == $self->length; # short circuit on last residue
1603             # this in place to handle jason's soon-to-be-committed
1604             # intron mapping code
1605 0         0 goto bottom;
1606             }
1607 3132         2875 my %col = ($refchar => 1);
1608 3132   66     9885 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609 3132         2499 foreach my $seq ( @seqchars ) {
1610 26906 50       28125 next if $pos >= scalar @$seq;
1611 26906 100 100     87824 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
      66        
1612             $seq->[$pos] eq ' ' );
1613 26906 50       34760 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1614             }
1615 3132         4640 my @colresidues = sort keys %col;
1616              
1617             # if all the values are the same
1618 3132 100       4437 if( $dash ) { $char = $matchchars{'mismatch'} }
  758 100       566  
    100          
1619 1721         1282 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620             elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621             # matches for protein seqs
1622             TYPE:
1623 472         363 foreach my $type ( qw(strong weak) ) {
1624             # iterate through categories
1625 804         447 my %groups;
1626             # iterate through each of the aa in the col
1627             # look to see which groups it is in
1628 804         593 foreach my $c ( @colresidues ) {
1629 3075         1683 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
  30455         24136  
  3075         2703  
1630 6808         3680 push @{$groups{$f}},$c;
  6808         7786  
1631             }
1632             }
1633             GRP:
1634 804         821 foreach my $cols ( values %groups ) {
1635 3944         4326 @$cols = sort @$cols;
1636             # now we are just testing to see if two arrays
1637             # are identical w/o changing either one
1638             # have to be same len
1639 3944 100       5796 next if( scalar @$cols != scalar @colresidues );
1640             # walk down the length and check each slot
1641 196         279 for($_=0;$_ < (scalar @$cols);$_++ ) {
1642 453 50       813 next GRP if( $cols->[$_] ne $colresidues[$_] );
1643             }
1644 196         182 $char = $matchchars{$type};
1645 196         304 last TYPE;
1646             }
1647             }
1648             }
1649             bottom:
1650 3132         3874 $matchline .= $char;
1651             }
1652 6         975 return $matchline;
1653             }
1654              
1655              
1656             =head2 gap_line
1657              
1658             Title : gap_line()
1659             Usage : $line = $align->gap_line()
1660             Function : Generates a gap line - much like consensus string
1661             except that a line where '-' represents gap
1662             Args : (optional) gap line characters ('-' by default)
1663             Returns : string
1664              
1665             =cut
1666              
1667             sub gap_line {
1668 14     14 1 10 my ($self,$gapchar) = @_;
1669 14   33     39 $gapchar = $gapchar || $self->gap_char;
1670 14         13 my %gap_hsh; # column gaps vector
1671 14         23 foreach my $seq ( $self->each_seq ) {
1672 33         28 my $i = 0;
1673 223         758 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
  4020         4840  
1674 33         62 map {[$i++, $_]} split(//, uc ($seq->seq));
  4020         3398  
1675             }
1676 14         16 my $gap_line;
1677 14         24 foreach my $pos ( 0..$self->length-1 ) {
1678 1973 100       2009 $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1679             }
1680 14         48 return $gap_line;
1681             }
1682              
1683             =head2 all_gap_line
1684              
1685             Title : all_gap_line()
1686             Usage : $line = $align->all_gap_line()
1687             Function : Generates a gap line - much like consensus string
1688             except that a line where '-' represents all-gap column
1689             Args : (optional) gap line characters ('-' by default)
1690             Returns : string
1691              
1692             =cut
1693              
1694             sub all_gap_line {
1695 2     2 1 3 my ($self,$gapchar) = @_;
1696 2   66     6 $gapchar = $gapchar || $self->gap_char;
1697 2         2 my %gap_hsh; # column gaps counter hash
1698 2         5 my @seqs = $self->each_seq;
1699 2         3 foreach my $seq ( @seqs ) {
1700 5         4 my $i = 0;
1701 7         16 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
  54         94  
1702 5         11 map {[$i++, $_]} split(//, uc ($seq->seq));
  54         50  
1703             }
1704 2         3 my $gap_line;
1705 2         5 foreach my $pos ( 0..$self->length-1 ) {
1706 22 100 100     39 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707             # gaps column
1708 1         2 $gap_line .= $self->gap_char;
1709             } else {
1710 21         18 $gap_line .= '.';
1711             }
1712             }
1713 2         5 return $gap_line;
1714             }
1715              
1716             =head2 gap_col_matrix
1717              
1718             Title : gap_col_matrix()
1719             Usage : my $cols = $align->gap_col_matrix()
1720             Function : Generates an array where each element in the array is a
1721             hash reference with a key of the sequence name and a
1722             value of 1 if the sequence has a gap at that column
1723             Returns : Reference to an array
1724             Args : Optional: gap line character ($aln->gap_char or '-' by default)
1725              
1726             =cut
1727              
1728             sub gap_col_matrix {
1729 0     0 1 0 my ( $self, $gapchar ) = @_;
1730 0   0     0 $gapchar = $gapchar || $self->gap_char;
1731 0         0 my %gap_hsh; # column gaps vector
1732             my @cols;
1733 0         0 foreach my $seq ( $self->each_seq ) {
1734 0         0 my $i = 0;
1735 0         0 my $str = $seq->seq;
1736 0         0 my $len = $seq->length;
1737 0         0 my $ch;
1738 0         0 my $id = $seq->display_id;
1739 0         0 while ( $i < $len ) {
1740 0         0 $ch = substr( $str, $i, 1 );
1741 0         0 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1742             }
1743             }
1744 0         0 return \@cols;
1745             }
1746              
1747             =head2 match
1748              
1749             Title : match()
1750             Usage : $ali->match()
1751             Function : Goes through all columns and changes residues that are
1752             identical to residue in first sequence to match '.'
1753             character. Sets match_char.
1754              
1755             USE WITH CARE: Most MSA formats do not support match
1756             characters in sequences, so this is mostly for output
1757             only. NEXUS format (Bio::AlignIO::nexus) can handle
1758             it.
1759             Returns : 1 on success
1760             Argument : a match character, optional, defaults to '.'
1761              
1762             =cut
1763              
1764             sub match {
1765 1     1 1 2 my ( $self, $match ) = @_;
1766              
1767 1   50     4 $match ||= '.';
1768 1         1 my ($matching_char) = $match;
1769 1 50       5 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1770 1         5 $self->map_chars( $matching_char, '-' );
1771              
1772 1         3 my @seqs = $self->each_seq();
1773 1 50       4 return 1 unless scalar @seqs > 1;
1774              
1775 1         2 my $refseq = shift @seqs;
1776 1         2 my @refseq = split //, $refseq->seq;
1777 1         3 my $gapchar = $self->gap_char;
1778              
1779 1         2 foreach my $seq (@seqs) {
1780 5         9 my @varseq = split //, $seq->seq();
1781 5         11 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782 720 100 66     3184 $varseq[$i] = $match
      33        
      66        
1783             if defined $refseq[$i]
1784             && ( $refseq[$i] =~ /[A-Za-z\*]/
1785             || $refseq[$i] =~ /$gapchar/ )
1786             && $refseq[$i] eq $varseq[$i];
1787             }
1788 5         34 $seq->seq( join '', @varseq );
1789             }
1790 1         5 $self->match_char($match);
1791 1         7 return 1;
1792             }
1793              
1794              
1795              
1796             =head2 unmatch
1797              
1798             Title : unmatch()
1799             Usage : $ali->unmatch()
1800             Function : Undoes the effect of method match. Unsets match_char.
1801             Returns : 1 on success
1802             Argument : a match character, optional, defaults to '.'
1803              
1804             See L and L
1805              
1806             =cut
1807              
1808             sub unmatch {
1809 5     5 1 9 my ( $self, $match ) = @_;
1810              
1811 5   100     23 $match ||= '.';
1812              
1813 5         18 my @seqs = $self->each_seq();
1814 5 50       16 return 1 unless scalar @seqs > 1;
1815              
1816 5         8 my $refseq = shift @seqs;
1817 5         14 my @refseq = split //, $refseq->seq;
1818 5         18 my $gapchar = $self->gap_char;
1819 5         12 foreach my $seq (@seqs) {
1820 114         170 my @varseq = split //, $seq->seq();
1821 114         204 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822 19782 100 100     85187 $varseq[$i] = $refseq[$i]
      33        
      100        
1823             if defined $refseq[$i]
1824             && ( $refseq[$i] =~ /[A-Za-z\*]/
1825             || $refseq[$i] =~ /$gapchar/ )
1826             && $varseq[$i] eq $match;
1827             }
1828 114         813 $seq->seq( join '', @varseq );
1829             }
1830 5         24 $self->match_char('');
1831 5         70 return 1;
1832             }
1833              
1834              
1835             =head1 MSA attributes
1836              
1837             Methods for setting and reading the MSA attributes.
1838              
1839             Note that the methods defining character semantics depend on the user
1840             to set them sensibly. They are needed only by certain input/output
1841             methods. Unset them by setting to an empty string ('').
1842              
1843             =head2 id
1844              
1845             Title : id
1846             Usage : $myalign->id("Ig")
1847             Function : Gets/sets the id field of the alignment
1848             Returns : An id string
1849             Argument : An id string (optional)
1850              
1851             =cut
1852              
1853             sub id {
1854 142     142 1 165 my ( $self, $name ) = @_;
1855              
1856 142 100       261 if ( defined($name) ) {
1857 72         92 $self->{'_id'} = $name;
1858             }
1859              
1860 142         262 return $self->{'_id'};
1861             }
1862              
1863             =head2 accession
1864              
1865             Title : accession
1866             Usage : $myalign->accession("PF00244")
1867             Function : Gets/sets the accession field of the alignment
1868             Returns : An acc string
1869             Argument : An acc string (optional)
1870              
1871             =cut
1872              
1873             sub accession {
1874 16     16 1 21 my ( $self, $acc ) = @_;
1875              
1876 16 100       34 if ( defined($acc) ) {
1877 8         20 $self->{'_accession'} = $acc;
1878             }
1879              
1880 16         31 return $self->{'_accession'};
1881             }
1882              
1883             =head2 description
1884              
1885             Title : description
1886             Usage : $myalign->description("14-3-3 proteins")
1887             Function : Gets/sets the description field of the alignment
1888             Returns : An description string
1889             Argument : An description string (optional)
1890              
1891             =cut
1892              
1893             sub description {
1894 25     25 1 36 my ( $self, $name ) = @_;
1895              
1896 25 100       52 if ( defined($name) ) {
1897 13         26 $self->{'_description'} = $name;
1898             }
1899              
1900 25         55 return $self->{'_description'};
1901             }
1902              
1903             =head2 missing_char
1904              
1905             Title : missing_char
1906             Usage : $myalign->missing_char("?")
1907             Function : Gets/sets the missing_char attribute of the alignment
1908             It is generally recommended to set it to 'n' or 'N'
1909             for nucleotides and to 'X' for protein.
1910             Returns : An missing_char string,
1911             Argument : An missing_char string (optional)
1912              
1913             =cut
1914              
1915             sub missing_char {
1916 32     32 1 39 my ( $self, $char ) = @_;
1917              
1918 32 100       65 if ( defined $char ) {
1919 22 50       45 $self->throw("Single missing character, not [$char]!")
1920             if CORE::length($char) > 1;
1921 22         36 $self->{'_missing_char'} = $char;
1922             }
1923              
1924 32         56 return $self->{'_missing_char'};
1925             }
1926              
1927              
1928             =head2 match_char
1929              
1930             Title : match_char
1931             Usage : $myalign->match_char('.')
1932             Function : Gets/sets the match_char attribute of the alignment
1933             Returns : An match_char string,
1934             Argument : An match_char string (optional)
1935              
1936             =cut
1937              
1938             sub match_char {
1939 18     18 1 28 my ( $self, $char ) = @_;
1940              
1941 18 100       42 if ( defined $char ) {
1942 9 50       19 $self->throw("Single match character, not [$char]!")
1943             if CORE::length($char) > 1;
1944 9         16 $self->{'_match_char'} = $char;
1945             }
1946              
1947 18         38 return $self->{'_match_char'};
1948             }
1949              
1950             =head2 gap_char
1951              
1952             Title : gap_char
1953             Usage : $myalign->gap_char('-')
1954             Function : Gets/sets the gap_char attribute of the alignment
1955             Returns : An gap_char string, defaults to '-'
1956             Argument : An gap_char string (optional)
1957              
1958             =cut
1959              
1960             sub gap_char {
1961 11242     11242 1 7791 my ( $self, $char ) = @_;
1962              
1963 11242 100 100     28552 if ( defined $char || !defined $self->{'_gap_char'} ) {
1964 51 100       116 $char = '-' unless defined $char;
1965 51 50       101 $self->throw("Single gap character, not [$char]!")
1966             if CORE::length($char) > 1;
1967 51         118 $self->{'_gap_char'} = $char;
1968             }
1969 11242         15041 return $self->{'_gap_char'};
1970             }
1971              
1972              
1973             =head2 symbol_chars
1974              
1975             Title : symbol_chars
1976             Usage : my @symbolchars = $aln->symbol_chars;
1977             Function: Returns all the seen symbols (other than gaps)
1978             Returns : array of characters that are the seen symbols
1979             Args : boolean to include the gap/missing/match characters
1980              
1981             =cut
1982              
1983             sub symbol_chars{
1984 5     5 1 9 my ($self,$includeextra) = @_;
1985              
1986 5 100       13 unless ($self->{'_symbols'}) {
1987 3         10 foreach my $seq ($self->each_seq) {
1988 12         32 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
  2778         2436  
1989             }
1990             }
1991 5         6 my %copy = %{$self->{'_symbols'}};
  5         37  
1992 5 50       20 if( ! $includeextra ) {
1993 5         14 foreach my $char ( $self->gap_char, $self->match_char,
1994             $self->missing_char) {
1995 15 100       27 delete $copy{$char} if( defined $char );
1996             }
1997             }
1998 5         35 return keys %copy;
1999             }
2000              
2001             =head1 Alignment descriptors
2002              
2003             These read only methods describe the MSA in various ways.
2004              
2005              
2006             =head2 score
2007              
2008             Title : score
2009             Usage : $str = $ali->score()
2010             Function : get/set a score of the alignment
2011             Returns : a score for the alignment
2012             Argument : an optional score to set
2013              
2014             =cut
2015              
2016             sub score {
2017 26     26 1 903 my $self = shift;
2018 26 100       68 $self->{score} = shift if @_;
2019 26         46 return $self->{score};
2020             }
2021              
2022             =head2 consensus_string
2023              
2024             Title : consensus_string
2025             Usage : $str = $ali->consensus_string($threshold_percent)
2026             Function : Makes a strict consensus
2027             Returns : Consensus string
2028             Argument : Optional threshold ranging from 0 to 100.
2029             The consensus residue has to appear at least threshold %
2030             of the sequences at a given location, otherwise a '?'
2031             character will be placed at that location.
2032             (Default value = 0%)
2033              
2034             =cut
2035              
2036             sub consensus_string {
2037 11     11 1 2946 my $self = shift;
2038 11         11 my $threshold = shift;
2039              
2040 11         14 my $out = "";
2041 11         33 my $len = $self->length - 1;
2042              
2043 11         23 foreach ( 0 .. $len ) {
2044 2100         2339 $out .= $self->_consensus_aa( $_, $threshold );
2045             }
2046 11         70 return $out;
2047             }
2048              
2049              
2050             =head2 consensus_conservation
2051              
2052             Title : consensus_conservation
2053             Usage : @conservation = $ali->consensus_conservation();
2054             Function : Conservation (as a percent) of each position of alignment
2055             Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2056             Argument :
2057            
2058             =cut
2059              
2060             sub consensus_conservation {
2061 1     1 1 2 my $self = shift;
2062 1         1 my @cons;
2063 1         2 my $num_sequences = $self->num_sequences;
2064 1         3 foreach my $point (0..$self->length-1) {
2065 422         447 my %hash = $self->_consensus_counts($point);
2066             # max frequency of a non-gap letter
2067 422         547 my $max = (sort {$b<=>$a} values %hash )[0];
  560         466  
2068 422         566 push @cons, 100 * $max / $num_sequences;
2069             }
2070 1         41 return @cons;
2071             }
2072              
2073             sub _consensus_aa {
2074 2100     2100   1430 my $self = shift;
2075 2100         1566 my $point = shift;
2076 2100   100     3713 my $threshold_percent = shift || -1 ;
2077 2100         1308 my ($seq,%hash,$count,$letter,$key);
2078 2100         2023 my $gapchar = $self->gap_char;
2079 2100         2256 %hash = $self->_consensus_counts($point);
2080 2100         2442 my $number_of_sequences = $self->num_sequences();
2081 2100         2066 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082 2100         1292 $count = -1;
2083 2100         1292 $letter = '?';
2084              
2085 2100         2855 foreach $key ( sort keys %hash ) {
2086             # print "Now at $key $hash{$key}\n";
2087 4898 100 100     11407 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088 2472         1609 $letter = $key;
2089 2472         2162 $count = $hash{$key};
2090             }
2091             }
2092 2100         3687 return $letter;
2093             }
2094              
2095             # Frequency of each letter in one column
2096             sub _consensus_counts {
2097 2522     2522   1674 my $self = shift;
2098 2522         1435 my $point = shift;
2099 2522         1453 my %hash;
2100 2522         2303 my $gapchar = $self->gap_char;
2101 2522         2610 foreach my $seq ( $self->each_seq() ) {
2102 22078         24166 my $letter = substr($seq->seq,$point,1);
2103 22078 50       25203 $self->throw("--$point-----------") if $letter eq '';
2104 22078 100 100     48069 ($letter eq $gapchar || $letter =~ /\./) && next;
2105 17334         14802 $hash{$letter}++;
2106             }
2107 2522         5402 return %hash;
2108             }
2109              
2110              
2111             =head2 consensus_iupac
2112              
2113             Title : consensus_iupac
2114             Usage : $str = $ali->consensus_iupac()
2115             Function : Makes a consensus using IUPAC ambiguity codes from DNA
2116             and RNA. The output is in upper case except when gaps in
2117             a column force output to be in lower case.
2118              
2119             Note that if your alignment sequences contain a lot of
2120             IUPAC ambiquity codes you often have to manually set
2121             alphabet. Bio::PrimarySeq::_guess_type thinks they
2122             indicate a protein sequence.
2123             Returns : consensus string
2124             Argument : none
2125             Throws : on protein sequences
2126              
2127             =cut
2128              
2129             sub consensus_iupac {
2130 1     1 1 4 my $self = shift;
2131 1         2 my $out = "";
2132 1         3 my $len = $self->length - 1;
2133              
2134             # only DNA and RNA sequences are valid
2135 1         2 foreach my $seq ( $self->each_seq() ) {
2136 2 50       3 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137             if $seq->alphabet eq 'protein';
2138             }
2139              
2140             # loop over the alignment columns
2141 1         3 foreach my $count ( 0 .. $len ) {
2142 10         11 $out .= $self->_consensus_iupac($count);
2143             }
2144 1         5 return $out;
2145             }
2146              
2147             sub _consensus_iupac {
2148 10     10   8 my ($self, $column) = @_;
2149 10         7 my ($string, $char, $rna);
2150              
2151             #determine all residues in a column
2152 10         11 foreach my $seq ( $self->each_seq() ) {
2153 20         25 $string .= substr($seq->seq, $column, 1);
2154             }
2155 10         11 $string = uc $string;
2156              
2157             # quick exit if there's an N in the string
2158 10 100       13 if ($string =~ /N/) {
2159 1 50       4 $string =~ /\W/ ? return 'n' : return 'N';
2160             }
2161             # ... or if there are only gap characters
2162 9 100       20 return '-' if $string =~ /^\W+$/;
2163              
2164             # treat RNA as DNA in regexps
2165 7 50       11 if ($string =~ /U/) {
2166 0         0 $string =~ s/U/T/;
2167 0         0 $rna = 1;
2168             }
2169              
2170             # the following s///'s only need to be done to the _first_ ambiguity code
2171             # as we only need to see the _range_ of characters in $string
2172              
2173 7 50       9 if ($string =~ /[VDHB]/) {
2174 0         0 $string =~ s/V/AGC/;
2175 0         0 $string =~ s/D/AGT/;
2176 0         0 $string =~ s/H/ACT/;
2177 0         0 $string =~ s/B/CTG/;
2178             }
2179              
2180 7 100       11 if ($string =~ /[SKYRWM]/) {
2181 1         2 $string =~ s/S/GC/;
2182 1         1 $string =~ s/K/GT/;
2183 1         2 $string =~ s/Y/CT/;
2184 1         1 $string =~ s/R/AG/;
2185 1         3 $string =~ s/W/AT/;
2186 1         2 $string =~ s/M/AC/;
2187             }
2188              
2189             # and now the guts of the thing
2190              
2191 7 100       23 if ($string =~ /A/) {
    50          
    50          
    50          
2192 5         6 $char = 'A'; # A A
2193 5 50       14 if ($string =~ /G/) {
    50          
    100          
2194 0         0 $char = 'R'; # A and G (purines) R
2195 0 0       0 if ($string =~ /C/) {
    0          
2196 0         0 $char = 'V'; # A and G and C V
2197 0 0       0 if ($string =~ /T/) {
2198 0         0 $char = 'N'; # A and G and C and T N
2199             }
2200             } elsif ($string =~ /T/) {
2201 0         0 $char = 'D'; # A and G and T D
2202             }
2203             } elsif ($string =~ /C/) {
2204 0         0 $char = 'M'; # A and C M
2205 0 0       0 if ($string =~ /T/) {
2206 0         0 $char = 'H'; # A and C and T H
2207             }
2208             } elsif ($string =~ /T/) {
2209 2         3 $char = 'W'; # A and T W
2210             }
2211             } elsif ($string =~ /C/) {
2212 0         0 $char = 'C'; # C C
2213 0 0       0 if ($string =~ /T/) {
    0          
2214 0         0 $char = 'Y'; # C and T (pyrimidines) Y
2215 0 0       0 if ($string =~ /G/) {
2216 0         0 $char = 'B'; # C and T and G B
2217             }
2218             } elsif ($string =~ /G/) {
2219 0         0 $char = 'S'; # C and G S
2220             }
2221             } elsif ($string =~ /G/) {
2222 0         0 $char = 'G'; # G G
2223 0 0       0 if ($string =~ /C/) {
    0          
2224 0         0 $char = 'S'; # G and C S
2225             } elsif ($string =~ /T/) {
2226 0         0 $char = 'K'; # G and T K
2227             }
2228             } elsif ($string =~ /T/) {
2229 2         1 $char = 'T'; # T T
2230             }
2231              
2232 7 50 33     11 $char = 'U' if $rna and $char eq 'T';
2233 7 100       11 $char = lc $char if $string =~ /\W/;
2234              
2235 7         13 return $char;
2236             }
2237              
2238              
2239             =head2 consensus_meta
2240              
2241             Title : consensus_meta
2242             Usage : $seqmeta = $ali->consensus_meta()
2243             Function : Returns a Bio::Seq::Meta object containing the consensus
2244             strings derived from meta data analysis.
2245             Returns : Bio::Seq::Meta
2246             Argument : Bio::Seq::Meta
2247             Throws : non-MetaI object
2248              
2249             =cut
2250              
2251             sub consensus_meta {
2252 56     56 1 62 my ($self, $meta) = @_;
2253 56 50 33     266 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
      66        
2254 0         0 $self->throw('Not a Bio::Seq::MetaI object');
2255             }
2256 56 100       111 return $self->{'_aln_meta'} = $meta if $meta;
2257 39         55 return $self->{'_aln_meta'}
2258             }
2259              
2260             =head2 is_flush
2261              
2262             Title : is_flush
2263             Usage : if ( $ali->is_flush() )
2264             Function : Tells you whether the alignment
2265             : is flush, i.e. all of the same length
2266             Returns : 1 or 0
2267             Argument :
2268              
2269             =cut
2270              
2271             sub is_flush {
2272 53     53 1 64 my ( $self, $report ) = @_;
2273 53         46 my $seq;
2274 53         49 my $length = (-1);
2275 53         50 my $temp;
2276              
2277 53         97 foreach $seq ( $self->each_seq() ) {
2278 248 100       305 if ( $length == (-1) ) {
2279 53         117 $length = CORE::length( $seq->seq() );
2280 53         70 next;
2281             }
2282              
2283 195         232 $temp = CORE::length( $seq->seq() );
2284 195 50       281 if ( $temp != $length ) {
2285 0 0       0 $self->warn(
2286             "expecting $length not $temp from " . $seq->display_id )
2287             if ($report);
2288 0         0 $self->debug(
2289             "expecting $length not $temp from " . $seq->display_id );
2290 0         0 $self->debug( $seq->seq() . "\n" );
2291 0         0 return 0;
2292             }
2293             }
2294              
2295 53         126 return 1;
2296             }
2297              
2298              
2299             =head2 length
2300              
2301             Title : length()
2302             Usage : $len = $ali->length()
2303             Function : Returns the maximum length of the alignment.
2304             To be sure the alignment is a block, use is_flush
2305             Returns : Integer
2306             Argument :
2307              
2308             =cut
2309              
2310             sub length_aln {
2311 0     0 0 0 my $self = shift;
2312 0         0 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2313 0         0 $self->length(@_);
2314             }
2315              
2316             sub length {
2317 2216     2216 1 1948 my $self = shift;
2318 2216         1462 my $seq;
2319 2216         1414 my $length = -1;
2320 2216         1436 my $temp;
2321            
2322 2216         2757 foreach $seq ( $self->each_seq() ) {
2323 8524         9505 $temp = $seq->length();
2324 8524 100       10489 if( $temp > $length ) {
2325 2226         1840 $length = $temp;
2326             }
2327             }
2328              
2329 2216         3832 return $length;
2330             }
2331              
2332              
2333             =head2 maxdisplayname_length
2334              
2335             Title : maxdisplayname_length
2336             Usage : $ali->maxdisplayname_length()
2337             Function : Gets the maximum length of the displayname in the
2338             alignment. Used in writing out various MSA formats.
2339             Returns : integer
2340             Argument :
2341              
2342             =cut
2343              
2344             sub maxname_length {
2345 0     0 1 0 my $self = shift;
2346 0         0 $self->deprecated("maxname_length - deprecated method.".
2347             " Use maxdisplayname_length() instead.");
2348 0         0 $self->maxdisplayname_length();
2349             }
2350              
2351             sub maxnse_length {
2352 0     0 0 0 my $self = shift;
2353 0         0 $self->deprecated("maxnse_length - deprecated method.".
2354             " Use maxnse_length() instead.");
2355 0         0 $self->maxdisplayname_length();
2356             }
2357              
2358             sub maxdisplayname_length {
2359 21     21 1 26 my $self = shift;
2360 21         26 my $maxname = (-1);
2361 21         21 my ( $seq, $len );
2362              
2363 21         36 foreach $seq ( $self->each_seq() ) {
2364 122         175 $len = CORE::length $self->displayname( $seq->get_nse() );
2365              
2366 122 100       223 if ( $len > $maxname ) {
2367 52         59 $maxname = $len;
2368             }
2369             }
2370              
2371 21         44 return $maxname;
2372             }
2373              
2374             =head2 max_metaname_length
2375              
2376             Title : max_metaname_length
2377             Usage : $ali->max_metaname_length()
2378             Function : Gets the maximum length of the meta name tags in the
2379             alignment for the sequences and for the alignment.
2380             Used in writing out various MSA formats.
2381             Returns : integer
2382             Argument : None
2383              
2384             =cut
2385              
2386             sub max_metaname_length {
2387 2     2 1 3 my $self = shift;
2388 2         4 my $maxname = (-1);
2389 2         3 my ($seq,$len);
2390            
2391             # check seq meta first
2392 2         7 for $seq ( $self->each_seq() ) {
2393 17 100       42 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2394 11         14 for my $mtag ($seq->meta_names) {
2395 0         0 $len = CORE::length $mtag;
2396 0 0       0 if( $len > $maxname ) {
2397 0         0 $maxname = $len;
2398             }
2399             }
2400             }
2401            
2402             # alignment meta
2403 2         9 for my $meta ($self->consensus_meta) {
2404 2 100       6 next unless $meta;
2405 1         2 for my $name ($meta->meta_names) {
2406 1         1 $len = CORE::length $name;
2407 1 50       2 if( $len > $maxname ) {
2408 1         2 $maxname = $len;
2409             }
2410             }
2411             }
2412              
2413 2         7 return $maxname;
2414             }
2415              
2416             =head2 num_residues
2417              
2418             Title : num_residues
2419             Usage : $no = $ali->num_residues
2420             Function : number of residues in total in the alignment
2421             Returns : integer
2422             Argument :
2423             Note : replaces no_residues()
2424              
2425             =cut
2426              
2427             sub num_residues {
2428 2     2 1 4 my $self = shift;
2429 2         3 my $count = 0;
2430              
2431 2         4 foreach my $seq ( $self->each_seq ) {
2432 19         24 my $str = $seq->seq();
2433              
2434 19         780 $count += ( $str =~ s/[A-Za-z]//g );
2435             }
2436              
2437 2         8 return $count;
2438             }
2439              
2440             =head2 num_sequences
2441              
2442             Title : num_sequences
2443             Usage : $depth = $ali->num_sequences
2444             Function : number of sequence in the sequence alignment
2445             Returns : integer
2446             Argument : none
2447             Note : replaces no_sequences()
2448              
2449             =cut
2450              
2451             sub num_sequences {
2452 2553     2553 1 3707 my $self = shift;
2453 2553         2721 return scalar($self->each_seq);
2454             }
2455              
2456             =head2 average_percentage_identity
2457              
2458             Title : average_percentage_identity
2459             Usage : $id = $align->average_percentage_identity
2460             Function: The function uses a fast method to calculate the average
2461             percentage identity of the alignment
2462             Returns : The average percentage identity of the alignment
2463             Args : None
2464             Notes : This method implemented by Kevin Howe calculates a figure that is
2465             designed to be similar to the average pairwise identity of the
2466             alignment (identical in the absence of gaps), without having to
2467             explicitly calculate pairwise identities proposed by Richard Durbin.
2468             Validated by Ewan Birney ad Alex Bateman.
2469              
2470             =cut
2471              
2472             sub average_percentage_identity{
2473 9     9 1 13 my ($self,@args) = @_;
2474              
2475 9         46 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476             'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2477              
2478 9         13 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2479              
2480 9 50       20 if (! $self->is_flush()) {
2481 0         0 $self->throw("All sequences in the alignment must be the same length");
2482             }
2483              
2484 9         17 @seqs = $self->each_seq();
2485 9         19 $len = $self->length();
2486              
2487             # load the each hash with correct keys for existence checks
2488              
2489 9         27 for( my $index=0; $index < $len; $index++) {
2490 2096         1460 foreach my $letter (@alphabet) {
2491 54496         42445 $countHashes[$index]->{$letter} = 0;
2492             }
2493             }
2494 9         17 foreach my $seq (@seqs) {
2495 32         82 my @seqChars = split //, $seq->seq();
2496 32         73 for( my $column=0; $column < @seqChars; $column++ ) {
2497 7580         4330 my $char = uc($seqChars[$column]);
2498 7580 100       8313 if (exists $countHashes[$column]->{$char}) {
2499 7077         8765 $countHashes[$column]->{$char}++;
2500             }
2501             }
2502             }
2503              
2504 9         16 $total = 0;
2505 9         12 $divisor = 0;
2506 9         28 for(my $column =0; $column < $len; $column++) {
2507 2096         1141 my %hash = %{$countHashes[$column]};
  2096         9015  
2508 2096         2459 $subdivisor = 0;
2509 2096         4375 foreach my $res (keys %hash) {
2510 54496         35745 $total += $hash{$res}*($hash{$res} - 1);
2511 54496         36789 $subdivisor += $hash{$res};
2512             }
2513 2096         5660 $divisor += $subdivisor * ($subdivisor - 1);
2514             }
2515 9 50       2061 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2516             }
2517              
2518             =head2 percentage_identity
2519              
2520             Title : percentage_identity
2521             Usage : $id = $align->percentage_identity
2522             Function: The function calculates the average percentage identity
2523             (aliased to average_percentage_identity)
2524             Returns : The average percentage identity
2525             Args : None
2526              
2527             =cut
2528              
2529             sub percentage_identity {
2530 5     5 1 9 my $self = shift;
2531 5         15 return $self->average_percentage_identity();
2532             }
2533              
2534             =head2 overall_percentage_identity
2535              
2536             Title : overall_percentage_identity
2537             Usage : $id = $align->overall_percentage_identity
2538             $id = $align->overall_percentage_identity('short')
2539             Function: The function calculates the percentage identity of
2540             the conserved columns
2541             Returns : The percentage identity of the conserved columns
2542             Args : length value to use, optional defaults to alignment length
2543             possible values: 'align', 'short', 'long'
2544              
2545             The argument values 'short' and 'long' refer to shortest and longest
2546             sequence in the alignment. Method modification code by Hongyu Zhang.
2547              
2548             =cut
2549              
2550             sub overall_percentage_identity{
2551 17     17 1 26 my ($self, $length_measure) = @_;
2552              
2553 17         28 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
  442         435  
2554              
2555 17         34 my %enum = map {$_ => undef} qw (align short long);
  51         64  
2556              
2557             $self->throw("Unknown argument [$length_measure]")
2558 17 50 66     53 if $length_measure and not exists $enum{$length_measure};
2559 17   100     52 $length_measure ||= 'align';
2560              
2561 17 50       29 if (! $self->is_flush()) {
2562 0         0 $self->throw("All sequences in the alignment must be the same length");
2563             }
2564              
2565             # Count the residues seen at each position
2566 17         16 my $len;
2567 17         13 my $total = 0; # number of positions with identical residues
2568 17         16 my @countHashes;
2569 17         21 my @seqs = $self->each_seq;
2570 17         16 my $nof_seqs = scalar @seqs;
2571 17         29 my $aln_len = $self->length();
2572 17         22 for my $seq (@seqs) {
2573 90         152 my $seqstr = $seq->seq;
2574              
2575             # Count residues for given sequence
2576 90         131 for my $column (0 .. $aln_len-1) {
2577 17788         11603 my $char = uc( substr($seqstr, $column, 1) );
2578 17788 100       18986 if ( exists $alphabet{$char} ) {
2579              
2580             # This is a valid char
2581 17019 100       15158 if ( defined $countHashes[$column]->{$char} ) {
2582 12754         7520 $countHashes[$column]->{$char}++;
2583             } else {
2584 4265         3163 $countHashes[$column]->{$char} = 1;
2585             }
2586              
2587 17019 100       19673 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588             # All sequences have this same residue
2589 810         558 $total++;
2590             }
2591              
2592             }
2593             }
2594              
2595             # Sequence length
2596 90 100 100     286 if ($length_measure eq 'short' || $length_measure eq 'long') {
2597 32         34 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598 32 100       46 if ($length_measure eq 'short') {
    50          
2599 16 100 100     53 if ( (not defined $len) || ($seq_len < $len) ) {
2600 5         7 $len = $seq_len;
2601             }
2602             } elsif ($length_measure eq 'long') {
2603 16 100 100     46 if ( (not defined $len) || ($seq_len > $len) ) {
2604 2         3 $len = $seq_len;
2605             }
2606             }
2607             }
2608              
2609             }
2610              
2611 17 100       35 if ($length_measure eq 'align') {
2612 15         15 $len = $aln_len;
2613             }
2614              
2615 17         467 return ($total / $len ) * 100.0;
2616             }
2617              
2618              
2619              
2620             =head1 Alignment positions
2621              
2622             Methods to map a sequence position into an alignment column and back.
2623             column_from_residue_number() does the former. The latter is really a
2624             property of the sequence object and can done using
2625             L:
2626              
2627             # select somehow a sequence from the alignment, e.g.
2628             my $seq = $aln->get_seq_by_pos(1);
2629             #$loc is undef or Bio::LocationI object
2630             my $loc = $seq->location_from_column(5);
2631              
2632             =head2 column_from_residue_number
2633              
2634             Title : column_from_residue_number
2635             Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636             Function: This function gives the position in the alignment
2637             (i.e. column number) of the given residue number in the
2638             sequence with the given name. For example, for the
2639             alignment
2640              
2641             Seq1/91-97 AC..DEF.GH.
2642             Seq2/24-30 ACGG.RTY...
2643             Seq3/43-51 AC.DDEF.GHI
2644              
2645             column_from_residue_number( "Seq1", 94 ) returns 6.
2646             column_from_residue_number( "Seq2", 25 ) returns 2.
2647             column_from_residue_number( "Seq3", 50 ) returns 10.
2648              
2649             An exception is thrown if the residue number would lie
2650             outside the length of the aligment
2651             (e.g. column_from_residue_number( "Seq2", 22 )
2652              
2653             Note: If the the parent sequence is represented by more than
2654             one alignment sequence and the residue number is present in
2655             them, this method finds only the first one.
2656              
2657             Returns : A column number for the position in the alignment of the
2658             given residue in the given sequence (1 = first column)
2659             Args : A sequence id/name (not a name/start-end)
2660             A residue number in the whole sequence (not just that
2661             segment of it in the alignment)
2662              
2663             =cut
2664              
2665             sub column_from_residue_number {
2666 27     27 1 446 my ( $self, $name, $resnumber ) = @_;
2667              
2668             $self->throw("No sequence with name [$name]")
2669 27 50       53 unless $self->{'_start_end_lists'}->{$name};
2670 27 50       37 $self->throw("Second argument residue number missing") unless $resnumber;
2671              
2672 27         45 foreach my $seq ( $self->each_seq_with_id($name) ) {
2673 27         24 my $col;
2674 27         30 eval { $col = $seq->column_from_residue_number($resnumber); };
  27         49  
2675 27 50       44 next if $@;
2676 27         47 return $col;
2677             }
2678              
2679 0         0 $self->throw( "Could not find a sequence segment in $name "
2680             . "containing residue number $resnumber" );
2681              
2682             }
2683              
2684             =head1 Sequence names
2685              
2686             Methods to manipulate the display name. The default name based on the
2687             sequence id and subsequence positions can be overridden in various
2688             ways.
2689              
2690             =head2 displayname
2691              
2692             Title : displayname
2693             Usage : $myalign->displayname("Ig", "IgA")
2694             Function : Gets/sets the display name of a sequence in the alignment
2695             Returns : A display name string
2696             Argument : name of the sequence
2697             displayname of the sequence (optional)
2698              
2699             =cut
2700              
2701             sub displayname {
2702 550     550 1 488 my ( $self, $name, $disname ) = @_;
2703              
2704             $self->throw("No sequence with name [$name]")
2705 550 50       886 unless defined $self->{'_seq'}->{$name};
2706              
2707 550 100 66     1261 if ( $disname and $name ) {
    100          
2708 112         136 $self->{'_dis_name'}->{$name} = $disname;
2709 112         130 return $disname;
2710             }
2711             elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712 48         133 return $self->{'_dis_name'}->{$name};
2713             }
2714             else {
2715 390         653 return $name;
2716             }
2717             }
2718              
2719             sub get_displayname {
2720 0     0 0 0 my $self = shift;
2721 0         0 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722 0         0 $self->displayname(@_);
2723             }
2724              
2725             sub set_displayname {
2726 0     0 0 0 my $self = shift;
2727 0         0 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728 0         0 $self->displayname(@_);
2729             }
2730              
2731              
2732             =head2 set_displayname_count
2733              
2734             Title : set_displayname_count
2735             Usage : $ali->set_displayname_count
2736             Function : Sets the names to be name_# where # is the number of
2737             times this name has been used.
2738             Returns : 1, on success
2739             Argument :
2740              
2741             =cut
2742              
2743             sub set_displayname_count {
2744 1     1 1 2 my $self= shift;
2745 1         2 my (@arr,$name,$seq,$count,$temp,$nse);
2746              
2747 1         3 foreach $seq ( $self->each_alphabetically() ) {
2748 16         21 $nse = $seq->get_nse();
2749              
2750             #name will be set when this is the second
2751             #time (or greater) is has been seen
2752              
2753 16 50 66     35 if( defined $name and $name eq ($seq->id()) ) {
2754 0         0 $temp = sprintf("%s_%s",$name,$count);
2755 0         0 $self->displayname($nse,$temp);
2756 0         0 $count++;
2757             } else {
2758 16         12 $count = 1;
2759 16         18 $name = $seq->id();
2760 16         27 $temp = sprintf("%s_%s",$name,$count);
2761 16         19 $self->displayname($nse,$temp);
2762 16         16 $count++;
2763             }
2764             }
2765 1         5 return 1;
2766             }
2767              
2768             =head2 set_displayname_flat
2769              
2770             Title : set_displayname_flat
2771             Usage : $ali->set_displayname_flat()
2772             Function : Makes all the sequences be displayed as just their name,
2773             not name/start-end (NSE)
2774             Returns : 1
2775             Argument :
2776              
2777             =cut
2778              
2779             sub set_displayname_flat {
2780 11     11 1 16 my $self = shift;
2781 11         17 my ( $nse, $seq );
2782              
2783 11         21 foreach $seq ( $self->each_seq() ) {
2784 79         111 $nse = $seq->get_nse();
2785 79         107 $self->displayname( $nse, $seq->id() );
2786             }
2787 11         28 return 1;
2788             }
2789              
2790              
2791             =head2 set_displayname_normal
2792              
2793             Title : set_displayname_normal
2794             Usage : $ali->set_displayname_normal()
2795             Function : Makes all the sequences be displayed as name/start-end (NSE)
2796             Returns : 1, on success
2797             Argument :
2798              
2799             =cut
2800              
2801             sub set_displayname_normal {
2802 1     1 1 2 my $self = shift;
2803 1         2 my ( $nse, $seq );
2804              
2805 1         3 foreach $seq ( $self->each_seq() ) {
2806 16         18 $nse = $seq->get_nse();
2807 16         18 $self->displayname( $nse, $nse );
2808             }
2809 1         5 return 1;
2810             }
2811              
2812             =head2 source
2813              
2814             Title : source
2815             Usage : $obj->source($newval)
2816             Function: sets the Alignment source program
2817             Example :
2818             Returns : value of source
2819             Args : newvalue (optional)
2820              
2821             =cut
2822              
2823             sub source {
2824 123     123 1 607 my ( $self, $value ) = @_;
2825 123 100       349 if ( defined $value ) {
2826 111         195 $self->{'_source'} = $value;
2827             }
2828 123         215 return $self->{'_source'};
2829             }
2830              
2831              
2832             =head2 set_displayname_safe
2833              
2834             Title : set_displayname_safe
2835             Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836             Function : Assign machine-generated serial names to sequences in input order.
2837             Designed to protect names during PHYLIP runs. Assign 10-char string
2838             in the form of "S000000001" to "S999999999". Restore the original
2839             names using "restore_displayname".
2840             Returns : 1. a new $aln with system names;
2841             2. a hash ref for restoring names
2842             Argument : Number for id length (default 10)
2843              
2844             =cut
2845              
2846             sub set_displayname_safe {
2847 1     1 1 2 my $self = shift;
2848 1   50     5 my $idlength = shift || 10;
2849 1         1 my ( $seq, %phylip_name );
2850 1         1 my $ct = 0;
2851 1         3 my $new = Bio::SimpleAlign->new();
2852 1         2 foreach $seq ( $self->each_seq() ) {
2853 14         13 $ct++;
2854 14         47 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855 14         21 $phylip_name{$pname} = $seq->id();
2856             my $new_seq = Bio::LocatableSeq->new(
2857             -id => $pname,
2858             -seq => $seq->seq(),
2859             -alphabet => $seq->alphabet,
2860             -start => $seq->{_start},
2861             -end => $seq->{_end}
2862 14         30 );
2863 14         43 $new->add_seq($new_seq);
2864             }
2865              
2866             $self->debug(
2867 1         5 "$ct seq names changed. Restore names by using restore_displayname.");
2868 1         3 return ( $new, \%phylip_name );
2869             }
2870              
2871              
2872             =head2 restore_displayname
2873              
2874             Title : restore_displayname
2875             Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876             Function : Restore original sequence names (after running
2877             $ali->set_displayname_safe)
2878             Returns : a new $aln with names restored.
2879             Argument : a hash reference of names from "set_displayname_safe".
2880              
2881             =cut
2882              
2883             sub restore_displayname {
2884 1     1 1 2 my $self = shift;
2885 1         1 my $ref=shift;
2886 1         7 my %name=%$ref;
2887 1         3 my $new=Bio::SimpleAlign->new();
2888 1         3 foreach my $seq ( $self->each_seq() ) {
2889 14 50       24 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2890             my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891             -seq => $seq->seq(),
2892             -alphabet => $seq->alphabet,
2893             -start => $seq->{_start},
2894             -end => $seq->{_end}
2895 14         25 );
2896 14         38 $new->add_seq($new_seq);
2897             }
2898 1         5 return $new;
2899             }
2900              
2901             =head2 sort_by_start
2902              
2903             Title : sort_by_start
2904             Usage : $ali->sort_by_start
2905             Function : Changes the order of the alignment to the start position of each
2906             subalignment
2907             Returns : 1 on success
2908             Argument :
2909              
2910             =cut
2911              
2912             sub sort_by_start {
2913 1     1 1 2 my $self = shift;
2914 1         1 my ($seq,$nse,@arr,%hash,$count);
2915 1         2 foreach $seq ( $self->each_seq() ) {
2916 3         6 $nse = $seq->get_nse;
2917 3         5 $hash{$nse} = $seq;
2918             }
2919 1         2 $count = 0;
2920 1         1 %{$self->{'_order'}} = (); # reset the hash;
  1         3  
2921 1         4 foreach $nse ( sort _startend keys %hash) {
2922 3         4 $self->{'_order'}->{$count} = $nse;
2923 3         2 $count++;
2924             }
2925 1         3 1;
2926             }
2927              
2928             sub _startend {
2929 2     2   5 my ($aname,$arange) = split (/[\/]/,$a);
2930 2         3 my ($bname,$brange) = split (/[\/]/,$b);
2931 2         4 my ($astart,$aend) = split(/\-/,$arange);
2932 2         3 my ($bstart,$bend) = split(/\-/,$brange);
2933 2         4 return $astart <=> $bstart;
2934             }
2935              
2936             =head2 bracket_string
2937              
2938             Title : bracket_string
2939             Usage : my @params = (-refseq => 'testseq',
2940             -allele1 => 'allele1',
2941             -allele2 => 'allele2',
2942             -delimiters => '{}',
2943             -separator => '/');
2944             $str = $aln->bracket_string(@params)
2945              
2946             Function : When supplied with a list of parameters (see below), returns a
2947             string in BIC format. This is used for allelic comparisons.
2948             Briefly, if either allele contains a base change when compared to
2949             the refseq, the base or gap for each allele is represented in
2950             brackets in the order present in the 'alleles' parameter.
2951              
2952             For the following data:
2953              
2954             >testseq
2955             GGATCCATTGCTACT
2956             >allele1
2957             GGATCCATTCCTACT
2958             >allele2
2959             GGAT--ATTCCTCCT
2960              
2961             the returned string with parameters 'refseq => testseq' and
2962             'alleles => [qw(allele1 allele2)]' would be:
2963              
2964             GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965             Returns : BIC-formatted string
2966             Argument : Required args
2967             refseq : string (ID) of the reference sequence used
2968             as basis for comparison
2969             allele1 : string (ID) of the first allele
2970             allele2 : string (ID) of the second allele
2971             Optional args
2972             delimiters: two symbol string of left and right delimiters.
2973             Only the first two symbols are used
2974             default = '[]'
2975             separator : string used as a separator. Only the first
2976             symbol is used
2977             default = '/'
2978             Throws : On no refseq/alleles, or invalid refseq/alleles.
2979              
2980             =cut
2981              
2982             sub bracket_string {
2983 14     14 1 5598 my ($self, @args) = @_;
2984 14         44 my ($ref, $a1, $a2, $delim, $sep) =
2985             $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986 14 50 33     64 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
      33        
2987 14         11 my ($ld, $rd);
2988 14 100       29 ($ld, $rd) = split('', $delim, 2) if $delim;
2989 14   100     34 $ld ||= '[';
2990 14   100     23 $rd ||= ']';
2991 14   100     24 $sep ||= '/';
2992             my ($refseq, $allele1, $allele2) =
2993 14         13 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
  42         46  
2994 14 50 33     49 if (!$refseq || !$allele1 || !$allele2) {
      33        
2995 0         0 $self->throw("One of your refseq/allele IDs is invalid!");
2996             }
2997 14         21 my $len = $self->length-1;
2998 14         14 my $bic = '';
2999             # loop over the alignment columns
3000 14         21 for my $column ( 0 .. $len ) {
3001 210         127 my $string;
3002             my ($compres, $res1, $res2) =
3003 210         140 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
  630         622  
3004             # are any of the allele symbols different from the refseq?
3005 210 100 100     534 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006             $ld.$res1.$sep.$res2.$rd;
3007 210         192 $bic .= $string;
3008             }
3009 14         34 return $bic;
3010             }
3011              
3012              
3013             =head2 methods implementing Bio::FeatureHolderI
3014              
3015             FeatureHolderI implementation to support labeled character sets like one
3016             would get from NEXUS represented data.
3017              
3018             =head2 get_SeqFeatures
3019              
3020             Usage : @features = $aln->get_SeqFeatures
3021             Function: Get the feature objects held by this feature holder.
3022             Example :
3023             Returns : an array of Bio::SeqFeatureI implementing objects
3024             Args : optional filter coderef, taking a Bio::SeqFeatureI
3025             : as argument, returning TRUE if wanted, FALSE if
3026             : unwanted
3027              
3028             =cut
3029              
3030             sub get_SeqFeatures {
3031 4     4 1 12 my $self = shift;
3032 4         6 my $filter_cb = shift;
3033 4 50 33     17 $self->throw("Arg (filter callback) must be a coderef")
3034             unless !defined($filter_cb)
3035             or ref($filter_cb) eq 'CODE';
3036 4 100       10 if ( !defined $self->{'_as_feat'} ) {
3037 2         5 $self->{'_as_feat'} = [];
3038             }
3039 4 50       11 if ($filter_cb) {
3040 0         0 return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
  0         0  
  0         0  
3041             }
3042 4         4 return @{ $self->{'_as_feat'} };
  4         11  
3043             }
3044              
3045              
3046             =head2 add_SeqFeature
3047              
3048             Usage : $aln->add_SeqFeature($subfeat);
3049             Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050             (see L) is supported, but has no effect.
3051             Example :
3052             Returns : 1 on success
3053             Args : a Bio::SeqFeatureI object
3054              
3055             =cut
3056              
3057             sub add_SeqFeature {
3058 9     9 1 11 my ($self, @feat) = @_;
3059              
3060 9 100       16 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3061              
3062 9 50       15 if (scalar @feat > 1) {
3063 0         0 $self->deprecated(
3064             -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065             ' is deprecated and will be removed in a future version. '.
3066             'Add a single feature at a time instead.',
3067             -warn_version => 1.007,
3068             -throw_version => 1.009,
3069             );
3070             }
3071              
3072 9         11 for my $feat ( @feat ) {
3073              
3074 9 50       14 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075              
3076 9 50       30 if( !$feat->isa("Bio::SeqFeatureI") ) {
3077 0         0 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078             }
3079              
3080 9         7 push @{$self->{'_as_feat'}}, $feat;
  9         15  
3081             }
3082 9         12 return 1;
3083             }
3084              
3085              
3086             =head2 remove_SeqFeatures
3087              
3088             Usage : $obj->remove_SeqFeatures
3089             Function: Removes all SeqFeatures. If you want to remove only a subset,
3090             remove that subset from the returned array, and add back the rest.
3091             Returns : The array of Bio::SeqFeatureI features that was
3092             deleted from this alignment.
3093             Args : none
3094              
3095             =cut
3096              
3097             sub remove_SeqFeatures {
3098 0     0 1 0 my $self = shift;
3099              
3100 0 0       0 return () unless $self->{'_as_feat'};
3101 0         0 my @feats = @{$self->{'_as_feat'}};
  0         0  
3102 0         0 $self->{'_as_feat'} = [];
3103 0         0 return @feats;
3104             }
3105              
3106             =head2 feature_count
3107              
3108             Title : feature_count
3109             Usage : $obj->feature_count()
3110             Function: Return the number of SeqFeatures attached to the alignment
3111             Returns : integer representing the number of SeqFeatures
3112             Args : None
3113              
3114             =cut
3115              
3116             sub feature_count {
3117 2     2 1 3 my ($self) = @_;
3118              
3119 2 50       5 if (defined($self->{'_as_feat'})) {
3120 2         1 return ($#{$self->{'_as_feat'}} + 1);
  2         9  
3121             } else {
3122 0         0 return 0;
3123             }
3124             }
3125              
3126             =head2 get_all_SeqFeatures
3127              
3128             Title : get_all_SeqFeatures
3129             Usage :
3130             Function: Get all SeqFeatures.
3131             Example :
3132             Returns : an array of Bio::SeqFeatureI implementing objects
3133             Args : none
3134             Note : Falls through to Bio::FeatureHolderI implementation.
3135              
3136             =cut
3137              
3138             =head2 methods for Bio::AnnotatableI
3139              
3140             AnnotatableI implementation to support sequence alignments which
3141             contain annotation (NEXUS, Stockholm).
3142              
3143             =head2 annotation
3144              
3145             Title : annotation
3146             Usage : $ann = $aln->annotation or
3147             $aln->annotation($ann)
3148             Function: Gets or sets the annotation
3149             Returns : Bio::AnnotationCollectionI object
3150             Args : None or Bio::AnnotationCollectionI object
3151              
3152             See L and L
3153             for more information
3154              
3155             =cut
3156              
3157             sub annotation {
3158 87     87 1 97 my ($obj,$value) = @_;
3159 87 100       189 if( defined $value ) {
    100          
3160 46 50       161 $obj->throw("object of class ".ref($value)." does not implement ".
3161             "Bio::AnnotationCollectionI. Too bad.")
3162             unless $value->isa("Bio::AnnotationCollectionI");
3163 46         59 $obj->{'_annotation'} = $value;
3164             } elsif( ! defined $obj->{'_annotation'}) {
3165 21         88 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166             }
3167 87         149 return $obj->{'_annotation'};
3168             }
3169              
3170             =head1 Deprecated methods
3171              
3172             =cut
3173              
3174             =head2 no_residues
3175              
3176             Title : no_residues
3177             Usage : $no = $ali->no_residues
3178             Function : number of residues in total in the alignment
3179             Returns : integer
3180             Argument :
3181             Note : deprecated in favor of num_residues()
3182              
3183             =cut
3184              
3185             sub no_residues {
3186 0     0 1 0 my $self = shift;
3187 0         0 $self->deprecated(-warn_version => 1.0069,
3188             -throw_version => 1.0075,
3189             -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190 0         0 $self->num_residues(@_);
3191             }
3192              
3193             =head2 no_sequences
3194              
3195             Title : no_sequences
3196             Usage : $depth = $ali->no_sequences
3197             Function : number of sequence in the sequence alignment
3198             Returns : integer
3199             Argument :
3200             Note : deprecated in favor of num_sequences()
3201              
3202             =cut
3203              
3204             sub no_sequences {
3205 0     0 1 0 my $self = shift;
3206 0         0 $self->deprecated(-warn_version => 1.0069,
3207             -throw_version => 1.0075,
3208             -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209 0         0 $self->num_sequences(@_);
3210             }
3211              
3212             =head2 mask_columns
3213              
3214             Title : mask_columns
3215             Usage : $aln2 = $aln->mask_columns(20,30)
3216             Function : Masks a slice of the alignment inclusive of start and
3217             end columns, and the first column in the alignment is denoted 1.
3218             Mask beyond the length of the sequence does not do padding.
3219             Returns : A Bio::SimpleAlign object
3220             Args : Positive integer for start column, positive integer for end column,
3221             optional string value use for the mask. Example:
3222              
3223             $aln2 = $aln->mask_columns(20,30,'?')
3224             Note : Masking must use a character that is not used for gaps or
3225             frameshifts. These can be adjusted using the relevant global
3226             variables, but be aware these may be (uncontrollably) modified
3227             elsewhere within BioPerl (see bug 2715)
3228              
3229             =cut
3230              
3231             sub mask_columns {
3232             #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233 6     6 1 1084 my $self = shift;
3234              
3235 6         11 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236             $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3237            
3238             # coordinates are alignment-based, not sequence-based
3239 6         6 my ($start, $end, $mask_char) = @_;
3240 6 50       11 unless (defined $mask_char) { $mask_char = 'N' }
  0         0  
3241              
3242 6 50 33     46 $self->throw("Mask start has to be a positive integer and less than ".
      33        
3243             "alignment length, not [$start]")
3244             unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245 6 50 33     35 $self->throw("Mask end has to be a positive integer and less than ".
      33        
3246             "alignment length, not [$end]")
3247             unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248 6 50       9 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3249             "end [$end]") unless $start <= $end;
3250 6 50 33     36 $self->throw("Mask character $mask_char has to be a single character ".
3251             "and not a gap or frameshift symbol")
3252             unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3253            
3254 6         9 my $aln = $self->new;
3255 6         12 $aln->id($self->id);
3256 6         11 foreach my $seq ( $self->each_seq() ) {
3257 21         49 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258             -alphabet => $seq->alphabet,
3259             -strand => $seq->strand,
3260             -verbose => $self->verbose);
3261            
3262             # convert from 1-based alignment coords!
3263 21         48 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264 21         124 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3265 21         32 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266 21         37 $new_seq->seq($new_dna_string);
3267 21         42 $aln->add_seq($new_seq);
3268             }
3269             # Preserve chosen mask character, it may be need later (like in 'slice')
3270 6         11 $aln->{_mask_char} = $mask_char;
3271 6         12 return $aln;
3272             }
3273              
3274             1;