File Coverage

Bio/SimpleAlign.pm

Criterion	Covered	Total	%
statement	1013	1175	86.2
branch	315	456	69.0
condition	135	218	61.9
subroutine	80	96	83.3
pod	70	78	89.7
total	1613	2023	79.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::SimpleAlign;
2	48			48		1758	use strict;
	48					53
	48					1200
3	48			48		162	use warnings;
	48					48
	48					1102
4	48			48		3002	use Bio::LocatableSeq; # uses Seq's as list
	48					59
	48					785
5	48			48		4791	use Bio::Seq;
	48					76
	48					816
6	48			48		11448	use Bio::SeqFeature::Generic;
	48					77
	48					1373
7
8	48			48		220	use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
	48					60
	48					320
9
10							# BioPerl module for SimpleAlign
11							#
12							# Please direct questions and support issues to
13							#
14							# Cared for by Heikki Lehvaslaiho
15							#
16							# Copyright Ewan Birney
17							#
18							# You may distribute this module under the same terms as perl itself
19
20							# POD documentation - main docs before the code
21							#
22							# History:
23							# 11/3/00 Added threshold feature to consensus and consensus_aa - PS
24							# May 2001 major rewrite - Heikki Lehvaslaiho
25
26							=head1 NAME
27
28							Bio::SimpleAlign - Multiple alignments held as a set of sequences
29
30							=head1 SYNOPSIS
31
32							# Use Bio::AlignIO to read in the alignment
33							$str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
34							$aln = $str->next_aln();
35
36							# Describe
37							print $aln->length;
38							print $aln->num_residues;
39							print $aln->is_flush;
40							print $aln->num_sequences;
41							print $aln->score;
42							print $aln->percentage_identity;
43							print $aln->consensus_string(50);
44
45							# Find the position in the alignment for a sequence location
46							$pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
47
48							# Extract sequences and check values for the alignment column $pos
49							foreach $seq ($aln->each_seq) {
50							$res = $seq->subseq($pos, $pos);
51							$count{$res}++;
52							}
53							foreach $res (keys %count) {
54							printf "Res: %s Count: %2d\n", $res, $count{$res};
55							}
56
57							# Manipulate
58							$aln->remove_seq($seq);
59							$mini_aln = $aln->slice(20,30); # get a block of columns
60							$mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
61							$new_aln = $aln->remove_columns([20,30]); # remove by position
62							$new_aln = $aln->remove_columns(['mismatch']); # remove by property
63
64							# Analyze
65							$str = $aln->consensus_string($threshold_percent);
66							$str = $aln->match_line();
67							$str = $aln->cigar_line();
68							$id = $aln->percentage_identity;
69
70							# See the module documentation for details and more methods.
71
72							=head1 DESCRIPTION
73
74							SimpleAlign is an object that handles a multiple sequence alignment
75							(MSA). It is very permissive of types (it does not insist on sequences
76							being all same length, for example). Think of it as a set of sequences
77							with a whole series of built-in manipulations and methods for reading and
78							writing alignments.
79
80							SimpleAlign uses L, a subclass of L,
81							to store its sequences. These are subsequences with a start and end
82							positions in the parent reference sequence. Each sequence in the
83							SimpleAlign object is a Bio::LocatableSeq.
84
85							SimpleAlign expects the combination of name, start, and end for a
86							given sequence to be unique in the alignment, and this is the key for the
87							internal hashes (name, start, end are abbreviated C in the code).
88							However, in some cases people do not want the name/start-end to be displayed:
89							either multiple names in an alignment or names specific to the alignment
90							(ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
91							C, and generally is what is used to print out the
92							alignment. They default to name/start-end.
93
94							The SimpleAlign Module is derived from the Align module by Ewan Birney.
95
96							=head1 FEEDBACK
97
98							=head2 Mailing Lists
99
100							User feedback is an integral part of the evolution of this and other
101							Bioperl modules. Send your comments and suggestions preferably to one
102							of the Bioperl mailing lists. Your participation is much appreciated.
103
104							bioperl-l@bioperl.org - General discussion
105							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
106
107							=head2 Support
108
109							Please direct usage questions or support issues to the mailing list:
110
111							I
112
113							rather than to the module maintainer directly. Many experienced and
114							reponsive experts will be able look at the problem and quickly
115							address it. Please include a thorough description of the problem
116							with code and data examples if at all possible.
117
118							=head2 Reporting Bugs
119
120							Report bugs to the Bioperl bug tracking system to help us keep track
121							the bugs and their resolution. Bug reports can be submitted via the
122							web:
123
124							https://github.com/bioperl/bioperl-live/issues
125
126							=head1 AUTHOR
127
128							Ewan Birney, birney@ebi.ac.uk
129
130							=head1 CONTRIBUTORS
131
132							Allen Day, allenday-at-ucla.edu,
133							Richard Adams, Richard.Adams-at-ed.ac.uk,
134							David J. Evans, David.Evans-at-vir.gla.ac.uk,
135							Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
136							Allen Smith, allens-at-cpan.org,
137							Jason Stajich, jason-at-bioperl.org,
138							Anthony Underwood, aunderwood-at-phls.org.uk,
139							Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
140							Brian Osborne, bosborne at alum.mit.edu
141							Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
142							Hongyu Zhang, forward at hongyu.org
143							Jay Hannah, jay at jays.net
144							Alexandr Bezginov, albezg at gmail.com
145
146							=head1 SEE ALSO
147
148							L
149
150							=head1 APPENDIX
151
152							The rest of the documentation details each of the object
153							methods. Internal methods are usually preceded with a _
154
155							=cut
156
157							## This data should probably be in a more centralized module...
158							## it is taken from Clustalw documentation.
159							## These are all the positively scoring groups that occur in the
160							## Gonnet Pam250 matrix. The strong and weak groups are
161							## defined as strong score >0.5 and weak score =<0.5 respectively.
162							our %CONSERVATION_GROUPS = (
163							'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
164							'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
165							);
166
167
168							=head2 new
169
170							Title : new
171							Usage : my $aln = Bio::SimpleAlign->new();
172							Function : Creates a new simple align object
173							Returns : Bio::SimpleAlign
174							Args : -source => string representing the source program
175							where this alignment came from
176							-annotation => Bio::AnnotationCollectionI
177							-seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
178							-seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
179							-consensus => consensus string
180							-consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
181
182							=cut
183
184
185							sub new {
186	291			291	1	521	my($class,@args) = @_;
187
188	291					897	my $self = $class->SUPER::new(@args);
189
190	291					1347	my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
191							SOURCE
192							SCORE
193							ID
194							ACCESSION
195							DESCRIPTION
196							SEQS
197							FEATURES
198							ANNOTATION
199							SEQ_ANNOTATION
200							CONSENSUS
201							CONSENSUS_META
202							)], @args);
203	291	100				1058	$src && $self->source($src);
204	291	100				509	defined $score && $self->score($score);
205							# we need to set up internal hashs first!
206
207	291					712	$self->{'_seq'} = {};
208	291					399	$self->{'_order'} = {};
209	291					367	$self->{'_start_end_lists'} = {};
210	291					378	$self->{'_dis_name'} = {};
211	291					355	$self->{'_id'} = 'NoName';
212							# maybe we should automatically read in from args. Hmmm...
213	291	100				503	$id && $self->id($id);
214	291	100				504	$acc && $self->accession($acc);
215	291	100				450	$desc && $self->description($desc);
216	291	100				494	$coll && $self->annotation($coll);
217							# sequence annotation is layered into a provided annotation collection (or dies)
218	291	50				446	if ($sa) {
219	0	0				0	$self->throw("Must supply an alignment-based annotation collection (-annotation) ".
220							"with a sequence annotation collection")
221							if !$coll;
222	0					0	$coll->add_Annotation('seq_annotation', $sa);
223							}
224	291	100	66			753	if ($feats && ref $feats eq 'ARRAY') {
225	1					3	for my $feat (@$feats) {
226	6					7	$self->add_SeqFeature($feat);
227							}
228							}
229	291	50				761	$con && $self->consensus($con);
230	291	100				502	$cmeta && $self->consensus_meta($cmeta);
231							# assumes these are in correct alignment order
232	291	100	66			768	if ($seqs && ref($seqs) eq 'ARRAY') {
233	9					16	for my $seq (@$seqs) {
234	168					204	$self->add_seq($seq);
235							}
236							}
237
238	291					573	return $self; # success - we hope!
239							}
240
241							=head1 Modifier methods
242
243							These methods modify the MSA by adding, removing or shuffling complete
244							sequences.
245
246							=head2 add_seq
247
248							Title : add_seq
249							Usage : $myalign->add_seq($newseq);
250							$myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
251							Function : Adds another sequence to the alignment. Does not align
252							it - just adds it to the hashes.
253							If -ORDER is specified, the sequence is inserted at the
254							the position spec'd by -ORDER, and existing sequences
255							are pushed down the storage array.
256							Returns : nothing
257							Args : A Bio::LocatableSeq object
258							Positive integer for the sequence position (optional)
259
260							See L for more information
261
262							=cut
263
264							sub addSeq {
265	0			0	0	0	my $self = shift;
266	0					0	$self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
267	0					0	$self->add_seq(@_);
268							}
269
270							sub add_seq {
271	2420			2420	1	2327	my $self = shift;
272	2420					2578	my @args = @_;
273	2420					5464	my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
274	2420					2491	my ($name,$id,$start,$end);
275
276	2420	50				3340	unless ($seq) {
277	0					0	$self->throw("LocatableSeq argument required");
278							}
279	2420	50	33			8399	if( ! ref $seq \|\| ! $seq->isa('Bio::LocatableSeq') ) {
280	0					0	$self->throw("Unable to process non locatable sequences [". ref($seq). "]");
281							}
282	2420	100				3362	!defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
	2417					3722
283	2420					1953	$order--; # jay's patch (user-specified order is 1-origin)
284
285	2420	100				3387	if ($order < 0) {
286	1					4	$self->throw("User-specified value for ORDER must be >= 1");
287							}
288
289	2419		66			3381	$id = $seq->id() \|\|$seq->display_id \|\| $seq->primary_id;
290
291							# build the symbol list for this sequence,
292							# will prune out the gap and missing/match chars
293							# when actually asked for the symbol list in the
294							# symbol_chars
295							# map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296
297	2419					3935	$name = $seq->get_nse;
298
299	2419	50				3561	if( $self->{'_seq'}->{$name} ) {
300	0	0				0	$self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
301							}
302							else {
303	2419					6395	$self->debug( "Assigning $name to $order\n");
304
305	2419					2514	my $ordh = $self->{'_order'};
306	2419	50				3566	if ($ordh->{$order}) {
307							# make space to insert
308							# $c->() returns (in reverse order) the first subsequence
309							# of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
310							# (3,2,1), and $c->(2,4,5) returns (2).
311	0					0	my $c;
312	0	0		0		0	$c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
	0					0
313							map {
314	0					0	$ordh->{$_+1} = $ordh->{$_}
315	0					0	} $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
	0					0
	0					0
	0					0
316
317							}
318	2419					3349	$ordh->{$order} = $name;
319
320	2419	100				3784	unless( exists( $self->{'_start_end_lists'}->{$id})) {
321	2400					3829	$self->{'_start_end_lists'}->{$id} = [];
322							}
323	2419					1924	push @{$self->{'_start_end_lists'}->{$id}}, $seq;
	2419					3952
324							}
325
326	2419					6698	$self->{'_seq'}->{$name} = $seq;
327
328							}
329
330
331							=head2 remove_seq
332
333							Title : remove_seq
334							Usage : $aln->remove_seq($seq);
335							Function : Removes a single sequence from an alignment
336							Returns :
337							Argument : a Bio::LocatableSeq object
338
339							=cut
340
341							sub removeSeq {
342	0			0	0	0	my $self = shift;
343	0					0	$self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
344	0					0	$self->remove_seq(@_);
345							}
346
347							sub remove_seq {
348	13			13	1	9	my $self = shift;
349	13					9	my $seq = shift;
350	13					9	my ($name,$id);
351
352	13	50	33			63	$self->throw("Need Bio::Locatable seq argument ")
353							unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
354
355	13					21	$id = $seq->id();
356	13					22	$name = $seq->get_nse;
357
358	13	50				17	if( !exists $self->{'_seq'}->{$name} ) {
359	0					0	$self->throw("Sequence $name does not exist in the alignment to remove!");
360							}
361
362	13					14	delete $self->{'_seq'}->{$name};
363
364							# we need to remove this seq from the start_end_lists hash
365
366	13	50				18	if (exists $self->{'_start_end_lists'}->{$id}) {
367							# we need to find the sequence in the array.
368
369	13					11	my ($i, $found);;
370	13					14	for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
	13					21
371	13	50				10	if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
	13					31
372	13					9	$found = 1;
373	13					12	last;
374							}
375							}
376	13	50				15	if ($found) {
377	13					11	splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
	13					18
378							}
379							else {
380	0					0	$self->throw("Could not find the sequence to remoce from the start-end list");
381							}
382							}
383							else {
384	0					0	$self->throw("There is no seq list for the name $id");
385							}
386							# we need to shift order hash
387	13					10	my %rev_order = reverse %{$self->{'_order'}};
	13					60
388	13					16	my $no = $rev_order{$name};
389	13					19	my $num_sequences = $self->num_sequences;
390	13					20	for (; $no < $num_sequences; $no++) {
391	94					141	$self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
392							}
393	13					14	delete $self->{'_order'}->{$no};
394	13					28	return 1;
395							}
396
397
398							=head2 purge
399
400							Title : purge
401							Usage : $aln->purge(0.7);
402							Function: Removes sequences above given sequence similarity
403							This function will grind on large alignments. Beware!
404							Example :
405							Returns : An array of the removed sequences
406							Args : float, threshold for similarity
407
408							=cut
409
410							sub purge {
411	1			1	1	2	my ($self,$perc) = @_;
412	1					1	my (%duplicate, @dups);
413
414	1					2	my @seqs = $self->each_seq();
415
416	1					4	for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
417	15					13	my $seq = $seqs[$i];
418
419							#skip if already in duplicate hash
420	15	100				21	next if exists $duplicate{$seq->display_id} ;
421	4					8	my $one = $seq->seq();
422
423	4					65	my @one = split '', $one; #split to get 1aa per array element
424
425	4					7	for (my $j=$i+1;$j < @seqs;$j++) {
426	28					22	my $seq2 = $seqs[$j];
427
428							#skip if already in duplicate hash
429	28	100				55	next if exists $duplicate{$seq2->display_id} ;
430
431	27					43	my $two = $seq2->seq();
432	27					386	my @two = split '', $two;
433
434	27					22	my $count = 0;
435	27					21	my $res = 0;
436	27					37	for (my $k=0;$k<@one;$k++) {
437	6534	100	66			29896	if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
			66
			100
438							$one[$k] eq $two[$k]) {
439	4329					2717	$count++;
440							}
441	6534	100	66			37370	if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
			66
			66
			66
442							$two[$k] ne '.' && $two[$k] ne '-' ) {
443	6320					7629	$res++;
444							}
445							}
446
447	27					24	my $ratio = 0;
448	27	50				44	$ratio = $count/$res unless $res == 0;
449
450							# if above threshold put in duplicate hash and push onto
451							# duplicate array for returning to get_unique
452	27	100				200	if ( $ratio > $perc ) {
453	12	50				37	$self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
454	12					23	$duplicate{$seq2->display_id} = 1;
455	12					137	push @dups, $seq2;
456							}
457							}
458							}
459	1					3	foreach my $seq (@dups) {
460	12					15	$self->remove_seq($seq);
461							}
462	1					11	return @dups;
463							}
464
465							=head2 sort_alphabetically
466
467							Title : sort_alphabetically
468							Usage : $ali->sort_alphabetically
469							Function : Changes the order of the alignment to alphabetical on name
470							followed by numerical by number.
471							Returns :
472							Argument :
473
474							=cut
475
476							sub sort_alphabetically {
477	1			1	1	3	my $self = shift;
478	1					1	my ($seq,$nse,@arr,%hash,$count);
479
480	1					2	foreach $seq ( $self->each_seq() ) {
481	3					5	$nse = $seq->get_nse;
482	3					4	$hash{$nse} = $seq;
483							}
484
485	1					2	$count = 0;
486
487	1					1	%{$self->{'_order'}} = (); # reset the hash;
	1					3
488
489	1					5	foreach $nse ( sort _alpha_startend keys %hash) {
490	3					3	$self->{'_order'}->{$count} = $nse;
491
492	3					4	$count++;
493							}
494	1					5	1;
495							}
496
497							=head2 sort_by_list
498
499							Title : sort_by_list
500							Usage : $aln_ordered=$aln->sort_by_list($list_file)
501							Function : Arbitrarily order sequences in an alignment
502							Returns : A new Bio::SimpleAlign object
503							Argument : a file listing sequence names in intended order (one name per line)
504
505							=cut
506
507							sub sort_by_list {
508	1			1	1	2	my ($self, $list) = @_;
509	1					1	my (@seq, @ids, %order);
510
511	1					2	foreach my $seq ( $self->each_seq() ) {
512	6					5	push @seq, $seq;
513	6					8	push @ids, $seq->display_id;
514							}
515
516	1					2	my $ct=1;
517	1	50				30	open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
518	1					18	while (<$listfh>) {
519	6					5	chomp;
520	6					7	my $name=$_;
521	6	50				6	$self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
522	6					13	$order{$name}=$ct++;
523							}
524	1					5	close($listfh);
525
526							# use the map-sort-map idiom:
527	1					3	my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
	6					6
	7					6
	6					7
528	1					4	my $aln = $self->new;
529	1					3	foreach (@sorted) { $aln->add_seq($_) }
	6					8
530	1					6	return $aln;
531							}
532
533							=head2 set_new_reference
534
535							Title : set_new_reference
536							Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
537							the sequence whoes name is "B31" (full, exact, and case-sensitive),
538							as the reference (1st) sequence
539							Function : Change/Set a new reference (i.e., the first) sequence
540							Returns : a new Bio::SimpleAlign object.
541							Throws an exception if designated sequence not found
542							Argument : a positive integer of sequence order, or a sequence name
543							in the original alignment
544
545							=cut
546
547							sub set_new_reference {
548	2			2	1	3	my ($self, $seqid) = @_;
549	2					3	my $aln = $self->new;
550	2					1	my (@seq, @ids, @new_seq);
551	2					3	my $is_num=0;
552	2					2	foreach my $seq ( $self->each_seq() ) {
553	12					9	push @seq, $seq;
554	12					14	push @ids, $seq->display_id;
555							}
556
557	2	100				9	if ($seqid =~ /^\d+$/) { # argument is seq position
558	1					1	$is_num=1;
559	1	50	33			5	$self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 \|\| $seqid > $self->num_sequences);
560							} else { # argument is a seq name
561	1	50				4	$self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
562							}
563
564	2					5	for (my $i=0; $i<=$#seq; $i++) {
565	12					9	my $pos=$i+1;
566	12	100	100			34	if ( ($is_num && $pos == $seqid) \|\| ($seqid eq $seq[$i]->display_id) ) {
			100
567	2					4	unshift @new_seq, $seq[$i];
568							} else {
569	10					17	push @new_seq, $seq[$i];
570							}
571							}
572	2					3	foreach (@new_seq) { $aln->add_seq($_); }
	12					15
573	2					6	return $aln;
574							}
575
576							sub _in_aln { # check if input name exists in the alignment
577	7			7		6	my ($str, $ref) = @_;
578	7					8	foreach (@$ref) {
579	24	100				38	return 1 if $str eq $_;
580							}
581	0					0	return 0;
582							}
583
584
585							=head2 uniq_seq
586
587							Title : uniq_seq
588							Usage : $aln->uniq_seq(): Remove identical sequences in
589							in the alignment. Ambiguous base ("N", "n") and
590							leading and ending gaps ("-") are NOT counted as
591							differences.
592							Function : Make a new alignment of unique sequence types (STs)
593							Returns : 1a. if called in a scalar context,
594							a new Bio::SimpleAlign object (all sequences renamed as "ST")
595							1b. if called in an array context,
596							a new Bio::SimpleAlign object, and a hashref whose keys
597							are sequence types, and whose values are arrayrefs to
598							lists of sequence ids within the corresponding sequence type
599							2. if $aln->verbose > 0, ST of each sequence is sent to
600							STDERR (in a tabular format)
601							Argument : None
602
603							=cut
604
605							sub uniq_seq {
606	1			1	1	1	my ($self, $seqid) = @_;
607	1					3	my $aln = $self->new;
608	1					1	my (%member, %order, @seq, @uniq_str, $st);
609	1					1	my $order=0;
610	1					3	my $len = $self->length();
611	1					1	$st = {};
612	1					3	foreach my $seq ( $self->each_seq() ) {
613	15					23	my $str = $seq->seq();
614
615							# it's necessary to ignore "n", "N", leading gaps and ending gaps in
616							# comparing two sequence strings
617
618							# 1st, convert "n", "N" to "?" (for DNA sequence only):
619	15	50				41	$str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
620							# 2nd, convert leading and ending gaps to "?":
621	15					25	$str = &_convert_leading_ending_gaps($str, '-', '?');
622							# Note that '?' also can mean unknown residue.
623							# I don't like making global class member changes like this, too
624							# prone to errors... -- cjfields 08-11-18
625	15					11	local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
626	15					28	my $new = Bio::LocatableSeq->new(
627							-id => $seq->id(),
628							-alphabet=> $seq->alphabet,
629							-seq => $str,
630							-start => $seq->start,
631							-end => $seq->end
632							);
633	15					31	push @seq, $new;
634							}
635
636	1					2	foreach my $seq (@seq) {
637	15					33	my $str = $seq->seq();
638	15					24	my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
639	15	100				24	if ($seen) { # seen before
640	4					3	my @memb = @{$member{$key}};
	4					12
641	4					4	push @memb, $seq;
642	4					13	$member{$key} = \@memb;
643							} else { # not seen
644	11					16	push @uniq_str, $key;
645	11					9	$order++;
646	11					57	$member{$key} = [ ($seq) ];
647	11					29	$order{$key} = $order;
648							}
649							}
650
651	1					8	foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
	24					20
652							# convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
653	11					16	my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
654							# convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
655	11	50				37	$str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
656	11					8	my $gap='-';
657	11					9	my $end= CORE::length($str2);
658	11					332	$end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
659	11					46	my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
660							-seq =>$str2,
661							-start=>1,
662							-end =>$end
663							);
664	11					24	$aln->add_seq($new);
665	11					8	foreach (@{$member{$str}}) {
	11					27
666	15					13	push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
	15					41
667	15					20	$self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
668							}
669							}
670	1	50				11	return wantarray ? ($aln, $st) : $aln;
671							}
672
673							sub _check_uniq { # check if same seq exists in the alignment
674	15			15		9	my ($str1, $ref, $length) = @_;
675	15					359	my @char1=split //, $str1;
676	15					22	my @array=@$ref;
677
678	15	100				35	return (0, $str1) if @array==0; # not seen (1st sequence)
679
680	14					15	foreach my $str2 (@array) {
681	59					44	my $diff=0;
682	59					1369	my @char2=split //, $str2;
683	59					78	for (my $i=0; $i<=$length-1; $i++) {
684	24249	100				25454	next if $char1[$i] eq '?';
685	22382	100				22373	next if $char2[$i] eq '?';
686	21740	100				35222	$diff++ if $char1[$i] ne $char2[$i];
687							}
688	59	100				899	return (1, $str2) if $diff == 0; # seen before
689							}
690
691	10					163	return (0, $str1); # not seen
692							}
693
694							sub _convert_leading_ending_gaps {
695	26			26		37	my $s=shift;
696	26					19	my $sym1=shift;
697	26					17	my $sym2=shift;
698	26					661	my @array=split //, $s;
699							# convert leading char:
700	26					45	for (my $i=0; $i<=$#array; $i++) {
701	369	100				535	($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
702							}
703							# convert ending char:
704	26					47	for (my $i = $#array; $i>= 0; $i--) {
705	552	100				762	($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
706							}
707	26					248	my $s_new=join '', @array;
708	26					369	return $s_new;
709							}
710
711							=head1 Sequence selection methods
712
713							Methods returning one or more sequences objects.
714
715							=head2 each_seq
716
717							Title : each_seq
718							Usage : foreach $seq ( $align->each_seq() )
719							Function : Gets a Seq object from the alignment
720							Returns : Seq object
721							Argument :
722
723							=cut
724
725							sub eachSeq {
726	0			0	0	0	my $self = shift;
727	0					0	$self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
728	0					0	$self->each_seq();
729							}
730
731							sub each_seq {
732	7620			7620	1	5635	my $self = shift;
733	7620					4610	my (@arr,$order);
734
735	7620					4783	foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
	120939					76245
	7620					17004
736	56090	100				72807	if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
737	56077					55677	push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
738							}
739							}
740	7620					15516	return @arr;
741							}
742
743
744							=head2 each_alphabetically
745
746							Title : each_alphabetically
747							Usage : foreach $seq ( $ali->each_alphabetically() )
748							Function : Returns a sequence object, but the objects are returned
749							in alphabetically sorted order.
750							Does not change the order of the alignment.
751							Returns : Seq object
752							Argument :
753
754							=cut
755
756							sub each_alphabetically {
757	2			2	1	4	my $self = shift;
758	2					2	my ($seq,$nse,@arr,%hash,$count);
759
760	2					4	foreach $seq ( $self->each_seq() ) {
761	32					39	$nse = $seq->get_nse;
762	32					40	$hash{$nse} = $seq;
763							}
764
765	2					12	foreach $nse ( sort _alpha_startend keys %hash) {
766	32					25	push(@arr,$hash{$nse});
767							}
768	2					11	return @arr;
769							}
770
771							sub _alpha_startend {
772	95			95		54	my ($aname,$astart,$bname,$bstart);
773	95					86	($aname,$astart) = split (/-/,$a);
774	95					89	($bname,$bstart) = split (/-/,$b);
775
776	95	50				90	if( $aname eq $bname ) {
777	0					0	return $astart <=> $bstart;
778							}
779							else {
780	95					74	return $aname cmp $bname;
781							}
782							}
783
784							=head2 each_seq_with_id
785
786							Title : each_seq_with_id
787							Usage : foreach $seq ( $align->each_seq_with_id() )
788							Function : Gets a Seq objects from the alignment, the contents
789							being those sequences with the given name (there may be
790							more than one)
791							Returns : Seq object
792							Argument : a seq name
793
794							=cut
795
796							sub eachSeqWithId {
797	0			0	0	0	my $self = shift;
798	0					0	$self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
799	0					0	$self->each_seq_with_id(@_);
800							}
801
802							sub each_seq_with_id {
803	89			89	1	73	my $self = shift;
804	89					74	my $id = shift;
805
806	89	50				128	$self->throw("Method each_seq_with_id needs a sequence name argument")
807							unless defined $id;
808
809	89					62	my (@arr, $seq);
810
811	89	50				140	if (exists($self->{'_start_end_lists'}->{$id})) {
812	89					66	@arr = @{$self->{'_start_end_lists'}->{$id}};
	89					133
813							}
814	89					132	return @arr;
815							}
816
817							=head2 get_seq_by_pos
818
819							Title : get_seq_by_pos
820							Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
821							Function : Gets a sequence based on its position in the alignment.
822							Numbering starts from 1. Sequence positions larger than
823							num_sequences() will throw an error.
824							Returns : a Bio::LocatableSeq object
825							Args : positive integer for the sequence position
826
827							=cut
828
829							sub get_seq_by_pos {
830
831	258			258	1	10718	my $self = shift;
832	258					328	my ($pos) = @_;
833
834	258	50	33			1525	$self->throw("Sequence position has to be a positive integer, not [$pos]")
835							unless $pos =~ /^\d+$/ and $pos > 0;
836	258	50				403	$self->throw("No sequence at position [$pos]")
837							unless $pos <= $self->num_sequences ;
838
839	258					459	my $nse = $self->{'_order'}->{--$pos};
840	258					667	return $self->{'_seq'}->{$nse};
841							}
842
843							=head2 get_seq_by_id
844
845							Title : get_seq_by_id
846							Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
847							Function : Gets a sequence based on its name.
848							Sequences that do not exist will warn and return undef
849							Returns : a Bio::LocatableSeq object
850							Args : string for sequence name
851
852							=cut
853
854							sub get_seq_by_id {
855	18			18	1	2336	my ($self,$name) = @_;
856	18	50				37	unless( defined $name ) {
857	0					0	$self->warn("Must provide a sequence name");
858	0					0	return;
859							}
860	18					18	for my $seq ( values %{$self->{'_seq'}} ) {
	18					39
861	46	100				68	if ( $seq->id eq $name) {
862	18					31	return $seq;
863							}
864							}
865	0					0	return;
866							}
867
868							=head2 seq_with_features
869
870							Title : seq_with_features
871							Usage : $seq = $aln->seq_with_features(-pos => 1,
872							-consensus => 60
873							-mask =>
874							sub { my $consensus = shift;
875
876							for my $i (1..5){
877							my $n = 'N' x $i;
878							my $q = '\?' x $i;
879							while($consensus =~ /[^?]$q[^?]/){
880							$consensus =~ s/([^?])$q([^?])/$1$n$2/;
881							}
882							}
883							return $consensus;
884							}
885							);
886							Function: produces a Bio::Seq object by first splicing gaps from -pos
887							(by means of a splice_by_seq_pos() call), then creating
888							features using non-? chars (by means of a consensus_string()
889							call with stringency -consensus).
890							Returns : a Bio::Seq object
891							Args : -pos : required. sequence from which to build the Bio::Seq
892							object
893							-consensus : optional, defaults to consensus_string()'s
894							default cutoff value
895							-mask : optional, a coderef to apply to consensus_string()'s
896							output before building features. this may be useful for
897							closing gaps of 1 bp by masking over them with N, for
898							instance
899
900							=cut
901
902							sub seq_with_features{
903	0			0	1	0	my ($self,%arg) = @_;
904
905							#first do the preparatory splice
906	0	0				0	$self->throw("must provide a -pos argument") unless $arg{-pos};
907	0					0	$self->splice_by_seq_pos($arg{-pos});
908
909	0					0	my $consensus_string = $self->consensus_string($arg{-consensus});
910							$consensus_string = $arg{-mask}->($consensus_string)
911	0	0				0	if defined($arg{-mask});
912
913	0					0	my(@bs,@es);
914
915	0	0				0	push @bs, 1 if $consensus_string =~ /^[^?]/;
916
917	0					0	while($consensus_string =~ /\?[^?]/g){
918	0					0	push @bs, pos($consensus_string);
919							}
920	0					0	while($consensus_string =~ /[^?]\?/g){
921	0					0	push @es, pos($consensus_string);
922							}
923
924	0	0				0	push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
925
926	0					0	my $seq = Bio::Seq->new();
927
928							# my $rootfeature = Bio::SeqFeature::Generic->new(
929							# -source_tag => 'location',
930							# -start => $self->get_seq_by_pos($arg{-pos})->start,
931							# -end => $self->get_seq_by_pos($arg{-pos})->end,
932							# );
933							# $seq->add_SeqFeature($rootfeature);
934
935	0					0	while(my $b = shift @bs){
936	0					0	my $e = shift @es;
937							$seq->add_SeqFeature(
938							Bio::SeqFeature::Generic->new(
939							-start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
940	0		0			0	-end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
941							-source_tag => $self->source \|\| 'MSA',
942							)
943							);
944							}
945
946	0					0	return $seq;
947							}
948
949
950							=head1 Create new alignments
951
952							The result of these methods are horizontal or vertical subsets of the
953							current MSA.
954
955							=head2 select
956
957							Title : select
958							Usage : $aln2 = $aln->select(1, 3) # three first sequences
959							Function : Creates a new alignment from a continuous subset of
960							sequences. Numbering starts from 1. Sequence positions
961							larger than num_sequences() will throw an error.
962							Returns : a Bio::SimpleAlign object
963							Args : positive integer for the first sequence
964							positive integer for the last sequence to include (optional)
965
966							=cut
967
968							sub select {
969	1			1	1	2	my $self = shift;
970	1					1	my ($start, $end) = @_;
971
972	1	50	33			10	$self->throw("Select start has to be a positive integer, not [$start]")
973							unless $start =~ /^\d+$/ and $start > 0;
974	1	50	33			7	$self->throw("Select end has to be a positive integer, not [$end]")
975							unless $end =~ /^\d+$/ and $end > 0;
976	1	50				2	$self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
977							unless $start <= $end;
978
979	1					3	my $aln = $self->new;
980	1					3	foreach my $pos ($start .. $end) {
981	3					4	$aln->add_seq($self->get_seq_by_pos($pos));
982							}
983	1					4	$aln->id($self->id);
984							# fix for meta, sf, ann
985	1					2	return $aln;
986							}
987
988							=head2 select_noncont
989
990							Title : select_noncont
991							Usage : # 1st and 3rd sequences, sorted
992							$aln2 = $aln->select_noncont(1, 3)
993
994							# 1st and 3rd sequences, sorted (same as first)
995							$aln2 = $aln->select_noncont(3, 1)
996
997							# 1st and 3rd sequences, unsorted
998							$aln2 = $aln->select_noncont('nosort',3, 1)
999
1000							Function : Creates a new alignment from a subset of sequences. Numbering
1001							starts from 1. Sequence positions larger than num_sequences() will
1002							throw an error. Sorts the order added to new alignment by default,
1003							to prevent sorting pass 'nosort' as the first argument in the list.
1004							Returns : a Bio::SimpleAlign object
1005							Args : array of integers for the sequences. If the string 'nosort' is
1006							passed as the first argument, the sequences will not be sorted
1007							in the new alignment but will appear in the order listed.
1008
1009							=cut
1010
1011							sub select_noncont {
1012	8			8	1	9	my $self = shift;
1013	8					9	my $nosort = 0;
1014	8					11	my (@pos) = @_;
1015	8	100				35	if ($pos[0] !~ m{^\d+$}) {
1016	1					1	my $sortcmd = shift @pos;
1017	1	50				4	if ($sortcmd eq 'nosort') {
1018	1					1	$nosort = 1;
1019							} else {
1020	0					0	$self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1021							}
1022							}
1023
1024	8					16	my $end = $self->num_sequences;
1025	8					12	foreach ( @pos ) {
1026	32	50	33			165	$self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
			33
1027							unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1028							}
1029
1030	8	100				18	@pos = sort {$a <=> $b} @pos unless $nosort;
	25					20
1031
1032	8					15	my $aln = $self->new;
1033	8					11	foreach my $p (@pos) {
1034	32					39	$aln->add_seq($self->get_seq_by_pos($p));
1035							}
1036	8					15	$aln->id($self->id);
1037							# fix for meta, sf, ann
1038	8					16	return $aln;
1039							}
1040
1041							=head2 select_noncont_by_name
1042
1043							Title : select_noncont_by_name
1044							Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045							Function : Creates a new alignment from a subset of sequences which are
1046							selected by name (sequence ID).
1047							Returns : a Bio::SimpleAlign object
1048							Args : array of names (i.e., identifiers) for the sequences.
1049
1050							=cut
1051
1052							sub select_noncont_by_name {
1053	1			1	1	3	my ($self, @names) = @_;
1054
1055	1					2	my $aln = $self->new;
1056	1					2	foreach my $name (@names) {
1057	3					6	$aln->add_seq($self->get_seq_by_id($name));
1058							}
1059	1					3	$aln->id($self->id);
1060
1061	1					2	return $aln;
1062							}
1063
1064							=head2 slice
1065
1066							Title : slice
1067							Usage : $aln2 = $aln->slice(20,30)
1068							Function : Creates a slice from the alignment inclusive of start and
1069							end columns, and the first column in the alignment is denoted 1.
1070							Sequences with no residues in the slice are excluded from the
1071							new alignment and a warning is printed. Slice beyond the length of
1072							the sequence does not do padding.
1073							Returns : A Bio::SimpleAlign object
1074							Args : Positive integer for start column, positive integer for end column,
1075							optional boolean which if true will keep gap-only columns in the newly
1076							created slice. Example:
1077
1078							$aln2 = $aln->slice(20,30,1)
1079
1080							=cut
1081
1082							sub slice {
1083	31			31	1	1964	my $self = shift;
1084	31					35	my ($start, $end, $keep_gap_only) = @_;
1085
1086	31	50	33			187	$self->throw("Slice start has to be a positive integer, not [$start]")
1087							unless $start =~ /^\d+$/ and $start > 0;
1088	31	50	33			143	$self->throw("Slice end has to be a positive integer, not [$end]")
1089							unless $end =~ /^\d+$/ and $end > 0;
1090	31	50				44	$self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091							unless $start <= $end;
1092	31	100				55	$self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093							"[$start] is too big.") if $start > $self->length;
1094	30					50	my $cons_meta = $self->consensus_meta;
1095	30					50	my $aln = $self->new;
1096	30					57	$aln->id($self->id);
1097	30					49	foreach my $seq ( $self->each_seq() ) {
1098	107	50				568	my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099							Bio::Seq::Meta->new
1100							(-id => $seq->id,
1101							-alphabet => $seq->alphabet,
1102							-strand => $seq->strand,
1103							-verbose => $self->verbose) :
1104							Bio::LocatableSeq->new
1105							(-id => $seq->id,
1106							-alphabet => $seq->alphabet,
1107							-strand => $seq->strand,
1108							-verbose => $self->verbose);
1109
1110							# seq
1111	107					139	my $seq_end = $end;
1112	107	50				194	$seq_end = $seq->length if( $end > $seq->length );
1113
1114	107					187	my $slice_seq = $seq->subseq($start, $seq_end);
1115	107					170	$new_seq->seq( $slice_seq );
1116
1117							# Allowed extra characters in string
1118	107					88	my $allowed_chars = '';
1119	107	100				175	if (exists $self->{_mask_char}) {
1120	24					20	$allowed_chars = $self->{_mask_char};
1121	24					27	$allowed_chars = quotemeta $allowed_chars;
1122							}
1123	107					622	$slice_seq =~ s/[^\w$allowed_chars]//g;
1124
1125	107	100				163	if ($start > 1) {
1126	52					88	my $pre_start_seq = $seq->subseq(1, $start - 1);
1127	52					297	$pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128	52	100				90	if (!defined($seq->strand)) {
		100
1129	38					58	$new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130							} elsif ($seq->strand < 0){
1131	1					3	$new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132							} else {
1133	13					21	$new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134							}
1135							} else {
1136	55	100	100			92	if ((defined $seq->strand)&&($seq->strand < 0)){
1137	2					5	$new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138							} else {
1139	53					82	$new_seq->start( $seq->start);
1140							}
1141							}
1142	107	50				462	if ($new_seq->isa('Bio::Seq::MetaI')) {
1143	0					0	for my $meta_name ($seq->meta_names) {
1144	0					0	$new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145							}
1146							}
1147	107					131	$new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148
1149	107	100	66			167	if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150	106					185	$aln->add_seq($new_seq);
1151							} else {
1152	1	50				3	if( $keep_gap_only ) {
1153	1					3	$aln->add_seq($new_seq);
1154							} else {
1155	0					0	my $nse = $seq->get_nse();
1156	0					0	$self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157							" Sequence excluded from the new alignment.");
1158							}
1159							}
1160							}
1161	30	50				54	if ($cons_meta) {
1162	0					0	my $new = Bio::Seq::Meta->new();
1163	0					0	for my $meta_name ($cons_meta->meta_names) {
1164	0					0	$new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165							}
1166	0					0	$aln->consensus_meta($new);
1167							}
1168	30					61	$aln->annotation($self->annotation);
1169							# fix for meta, sf, ann
1170	30					63	return $aln;
1171							}
1172
1173							=head2 remove_columns
1174
1175							Title : remove_columns
1176							Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177							$aln2 = $aln->remove_columns([0,0],[6,8])
1178							Function : Creates an aligment with columns removed corresponding to
1179							the specified type or by specifying the columns by number.
1180							Returns : Bio::SimpleAlign object
1181							Args : Array ref of types ('match'\|'weak'\|'strong'\|'mismatch'\|'gaps'\|
1182							'all_gaps_columns') or array ref where the referenced array
1183							contains a pair of integers that specify a range.
1184							The first column is 0
1185
1186							=cut
1187
1188							sub remove_columns {
1189	4			4	1	434	my ($self,@args) = @_;
1190	4	50				11	@args \|\| $self->throw("Must supply column ranges or column types");
1191	4					4	my $aln;
1192
1193	4	100				27	if ($args[0][0] =~ /^[a-z_]+$/i) {
		50
1194	1					3	$aln = $self->_remove_columns_by_type($args[0]);
1195							} elsif ($args[0][0] =~ /^\d+$/) {
1196	3					10	$aln = $self->_remove_columns_by_num(\@args);
1197							} else {
1198	0					0	$self->throw("You must pass array references to remove_columns(), not @args");
1199							}
1200							# fix for meta, sf, ann
1201	4					8	$aln;
1202							}
1203
1204
1205							=head2 remove_gaps
1206
1207							Title : remove_gaps
1208							Usage : $aln2 = $aln->remove_gaps
1209							Function : Creates an aligment with gaps removed
1210							Returns : a Bio::SimpleAlign object
1211							Args : a gap character(optional) if none specified taken
1212							from $self->gap_char,
1213							[optional] $all_gaps_columns flag (1 or 0, default is 0)
1214							indicates that only all-gaps columns should be deleted
1215
1216							Used from method L in most cases. Set gap character
1217							using L.
1218
1219							=cut
1220
1221							sub remove_gaps {
1222	6			6	1	11	my ($self,$gapchar,$all_gaps_columns) = @_;
1223	6					4	my $gap_line;
1224	6	100				12	if ($all_gaps_columns) {
1225	2					5	$gap_line = $self->all_gap_line($gapchar);
1226							} else {
1227	4					9	$gap_line = $self->gap_line($gapchar);
1228							}
1229	6					10	my $aln = $self->new;
1230
1231	6					7	my @remove;
1232	6					4	my $length = 0;
1233	6		66			15	my $del_char = $gapchar \|\| $self->gap_char;
1234							# Do the matching to get the segments to remove
1235	6					44	while ($gap_line =~ m/[$del_char]/g) {
1236	14					13	my $start = pos($gap_line)-1;
1237	14					26	$gap_line =~ m/\G[$del_char]+/gc;
1238	14					12	my $end = pos($gap_line)-1;
1239
1240							#have to offset the start and end for subsequent removes
1241	14					9	$start-=$length;
1242	14					8	$end -=$length;
1243	14					12	$length += ($end-$start+1);
1244	14					38	push @remove, [$start,$end];
1245							}
1246
1247							#remove the segments
1248	6	100				19	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249							# fix for meta, sf, ann
1250	6					18	return $aln;
1251							}
1252
1253
1254							sub _remove_col {
1255	9			9		11	my ($self,$aln,$remove) = @_;
1256	9					7	my @new;
1257
1258	9					14	my $gap = $self->gap_char;
1259
1260							# splice out the segments and create new seq
1261	9					15	foreach my $seq($self->each_seq){
1262	42					85	my $new_seq = Bio::LocatableSeq->new(
1263							-id => $seq->id,
1264							-alphabet=> $seq->alphabet,
1265							-strand => $seq->strand,
1266							-verbose => $self->verbose);
1267	42					80	my $sequence = $seq->seq;
1268	42					34	foreach my $pair(@{$remove}){
	42					54
1269	658					448	my $start = $pair->[0];
1270	658					421	my $end = $pair->[1];
1271	658	50				701	$sequence = $seq->seq unless $sequence;
1272	658					451	my $orig = $sequence;
1273	658	100				831	my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274	658	100				803	my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275	658					505	$sequence = $head.$tail;
1276							# start
1277	658	100				746	unless (defined $new_seq->start) {
1278	42	100				46	if ($start == 0) {
1279	31					90	my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280	31					54	$new_seq->start($seq->start + $end + 1 - $start_adjust);
1281							}
1282							else {
1283	11					43	my $start_adjust = $orig =~ /^$gap+/;
1284	11	100				19	if ($start_adjust) {
1285	5					12	$start_adjust = $+[0] == $start;
1286							}
1287	11					18	$new_seq->start($seq->start + $start_adjust);
1288							}
1289							}
1290							# end
1291	658	100				678	if (($end + 1) >= CORE::length($orig)) {
1292	32					74	my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293	32					44	$new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294							}
1295							else {
1296	626					682	$new_seq->end($seq->end);
1297							}
1298							}
1299
1300	42	100				50	if ($new_seq->end < $new_seq->start) {
1301							# we removed all columns except for gaps: set to 0 to indicate no
1302							# sequence
1303	1					3	$new_seq->start(0);
1304	1					3	$new_seq->end(0);
1305							}
1306
1307	42	50				92	$new_seq->seq($sequence) if $sequence;
1308	42					60	push @new, $new_seq;
1309							}
1310							# add the new seqs to the alignment
1311	9					13	foreach my $new(@new){
1312	42					50	$aln->add_seq($new);
1313							}
1314							# fix for meta, sf, ann
1315	9					17	return $aln;
1316							}
1317
1318							sub _remove_columns_by_type {
1319	1			1		2	my ($self,$type) = @_;
1320	1					2	my $aln = $self->new;
1321	1					1	my @remove;
1322
1323	1	50				2	my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
	1					4
	1					2
1324	1	50				1	my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
	1					3
1325	1					5	my %matchchars = ( 'match' => '\*',
1326							'weak' => '\.',
1327							'strong' => ':',
1328							'mismatch' => ' ',
1329							'gaps' => '',
1330							'all_gaps_columns' => ''
1331							);
1332							# get the characters to delete against
1333	1					1	my $del_char;
1334	1					1	foreach my $type (@{$type}){
	1					2
1335	1					2	$del_char.= $matchchars{$type};
1336							}
1337
1338	1					1	my $length = 0;
1339	1					3	my $match_line = $self->match_line;
1340							# do the matching to get the segments to remove
1341	1	50				4	if($del_char){
1342	1					16	while($match_line =~ m/[$del_char]/g ){
1343	37					26	my $start = pos($match_line)-1;
1344	37					47	$match_line=~/\G[$del_char]+/gc;
1345	37					27	my $end = pos($match_line)-1;
1346
1347							#have to offset the start and end for subsequent removes
1348	37					18	$start-=$length;
1349	37					24	$end -=$length;
1350	37					17	$length += ($end-$start+1);
1351	37					77	push @remove, [$start,$end];
1352							}
1353							}
1354
1355							# remove the segments
1356	1	50				5	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357	1	50				3	$aln = $aln->remove_gaps() if $gap;
1358	1	50				11	$aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359							# fix for meta, sf, ann
1360	1					7	$aln;
1361							}
1362
1363
1364							sub _remove_columns_by_num {
1365	3			3		4	my ($self,$positions) = @_;
1366	3					5	my $aln = $self->new;
1367
1368							# sort the positions
1369	3					9	@$positions = sort { $a->[0] <=> $b->[0] } @$positions;
	3					5
1370
1371	3					2	my @remove;
1372	3					4	my $length = 0;
1373	3					3	foreach my $pos (@{$positions}) {
	3					6
1374	5					3	my ($start, $end) = @{$pos};
	5					6
1375
1376							#have to offset the start and end for subsequent removes
1377	5					6	$start-=$length;
1378	5					5	$end -=$length;
1379	5					3	$length += ($end-$start+1);
1380	5					219	push @remove, [$start,$end];
1381							}
1382
1383							#remove the segments
1384	3	50				14	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385							# fix for meta, sf, ann
1386	3					6	$aln;
1387							}
1388
1389
1390							=head1 Change sequences within the MSA
1391
1392							These methods affect characters in all sequences without changing the
1393							alignment.
1394
1395							=head2 splice_by_seq_pos
1396
1397							Title : splice_by_seq_pos
1398							Usage : $status = splice_by_seq_pos(1);
1399							Function: splices all aligned sequences where the specified sequence
1400							has gaps.
1401							Example :
1402							Returns : 1 on success
1403							Args : position of sequence to splice by
1404
1405
1406							=cut
1407
1408							sub splice_by_seq_pos{
1409	0			0	1	0	my ($self,$pos) = @_;
1410
1411	0					0	my $guide = $self->get_seq_by_pos($pos);
1412	0					0	my $guide_seq = $guide->seq;
1413
1414	0					0	$guide_seq =~ s/\./\-/g;
1415
1416	0					0	my @gaps = ();
1417	0					0	$pos = -1;
1418	0					0	while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419	0					0	unshift @gaps, $pos;
1420	0					0	$pos++;
1421							}
1422
1423	0					0	foreach my $seq ($self->each_seq){
1424	0					0	my @bases = split '', $seq->seq;
1425
1426	0					0	splice(@bases, $_, 1) foreach @gaps;
1427	0					0	$seq->seq(join('', @bases));
1428							}
1429
1430	0					0	1;
1431							}
1432
1433							=head2 map_chars
1434
1435							Title : map_chars
1436							Usage : $ali->map_chars('\.','-')
1437							Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438							characters.
1439
1440							Note that the first argument is interpreted as a regexp
1441							so be careful and escape any wild card characters (e.g.
1442							do $ali->map_chars('\.','-') to replace periods with dashes.
1443							Returns : 1 on success
1444							Argument : A regexp and a string
1445
1446							=cut
1447
1448							sub map_chars {
1449	4			4	1	6	my $self = shift;
1450	4					8	my $from = shift;
1451	4					5	my $to = shift;
1452	4					5	my ( $seq, $temp );
1453
1454	4	50	33			21	$self->throw("Need two arguments: a regexp and a string")
1455							unless defined $from and defined $to;
1456
1457	4					9	foreach $seq ( $self->each_seq() ) {
1458	39					45	$temp = $seq->seq();
1459	39					144	$temp =~ s/$from/$to/g;
1460	39					48	$seq->seq($temp);
1461							}
1462	4					12	return 1;
1463							}
1464
1465
1466							=head2 uppercase
1467
1468							Title : uppercase()
1469							Usage : $ali->uppercase()
1470							Function : Sets all the sequences to uppercase
1471							Returns : 1 on success
1472							Argument :
1473
1474							=cut
1475
1476							sub uppercase {
1477	1			1	1	3	my $self = shift;
1478	1					1	my $seq;
1479							my $temp;
1480
1481	1					4	foreach $seq ( $self->each_seq() ) {
1482	16					21	$temp = $seq->seq();
1483	16					21	$temp =~ tr/[a-z]/[A-Z]/;
1484
1485	16					21	$seq->seq($temp);
1486							}
1487	1					4	return 1;
1488							}
1489
1490							=head2 cigar_line
1491
1492							Title : cigar_line()
1493							Usage : %cigars = $align->cigar_line()
1494							Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495							Report) line for each sequence in the alignment. Examples are
1496							"1,60" or "5,10:12,58", where the numbers refer to conserved
1497							positions within the alignment. The keys of the hash are the
1498							NSEs (name/start/end) assigned to each sequence.
1499							Args : threshold (optional, defaults to 100)
1500							Returns : Hash of strings (cigar lines)
1501
1502							=cut
1503
1504							sub cigar_line {
1505	1			1	1	438	my $self = shift;
1506	1		50			6	my $thr=shift\|\|100;
1507	1					2	my %cigars;
1508
1509	1					3	my @consensus = split "",($self->consensus_string($thr));
1510	1					3	my $len = $self->length;
1511	1					3	my $gapchar = $self->gap_char;
1512
1513							# create a precursor, something like (1,4,5,6,7,33,45),
1514							# where each number corresponds to a conserved position
1515	1					2	foreach my $seq ( $self->each_seq ) {
1516	4					7	my @seq = split "", uc ($seq->seq);
1517	4					5	my $pos = 1;
1518	4					9	for (my $x = 0 ; $x < $len ; $x++ ) {
1519	132	100				189	if ($seq[$x] eq $consensus[$x]) {
		100
1520	28					15	push @{$cigars{$seq->get_nse}},$pos;
	28					31
1521	28					48	$pos++;
1522							} elsif ($seq[$x] ne $gapchar) {
1523	84					97	$pos++;
1524							}
1525							}
1526							}
1527							# duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528	1					3	for my $name (keys %cigars) {
1529	4					4	splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
	4					4
1530	4	50				2	( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
	4					5
	4					6
1531	4					4	push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
	4					4
	4					2
1532	4					4	( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
	4					4
1533	4	50				3	${$cigars{$name}}[$#{$cigars{$name}}] );
	4					6
	4					4
1534	4					5	for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
	32					39
1535	28	100	100			14	if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
	28					24
	28					42
1536	8					4	${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
	8					20
1537	4					3	splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
	4					3
	4					6
1538							}
1539							}
1540							}
1541							# collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542	1					3	for my $name (keys %cigars) {
1543	4					1	my @remove;
1544	4					4	for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
	40					46
1545	36	100	100			25	if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
	36					21
	36					55
1546	12					8	${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
	12					26
1547	8					8	unshift @remove,$x;
1548							}
1549							}
1550	4					4	for my $pos (@remove) {
1551	8					4	splice @{$cigars{$name}}, $pos, 1;
	8					11
1552							}
1553							}
1554							# join and punctuate
1555	1					2	for my $name (keys %cigars) {
1556	4					5	my ($start,$end,$str) = "";
1557	4					3	while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
	20					30
1558	16					17	$str .= ($start . "," . $end . ":");
1559							}
1560	4					10	$str =~ s/:$//;
1561	4					6	$cigars{$name} = $str;
1562							}
1563	1					7	%cigars;
1564							}
1565
1566
1567							=head2 match_line
1568
1569							Title : match_line()
1570							Usage : $line = $align->match_line()
1571							Function : Generates a match line - much like consensus string
1572							except that a line indicating the '*' for a match.
1573							Args : (optional) Match line characters ('*' by default)
1574							(optional) Strong match char (':' by default)
1575							(optional) Weak match char ('.' by default)
1576							Returns : String
1577
1578							=cut
1579
1580							sub match_line {
1581	6			6	1	13	my ($self,$matchlinechar, $strong, $weak) = @_;
1582	6		50			73	my %matchchars = ('match' => $matchlinechar \|\| '*',
			50
			50
1583							'weak' => $weak \|\| '.',
1584							'strong' => $strong \|\| ':',
1585							'mismatch' => ' ',
1586							);
1587
1588	6					8	my @seqchars;
1589							my $alphabet;
1590	6					15	foreach my $seq ( $self->each_seq ) {
1591	69					112	push @seqchars, [ split(//, uc ($seq->seq)) ];
1592	69	100				1106	$alphabet = $seq->alphabet unless defined $alphabet;
1593							}
1594	6					15	my $refseq = shift @seqchars;
1595							# let's just march down the columns
1596	6					7	my $matchline;
1597							POS:
1598	6					20	foreach my $pos ( 0..$self->length ) {
1599	3138					2447	my $refchar = $refseq->[$pos];
1600	3138					2072	my $char = $matchchars{'mismatch'};
1601	3138	100				3490	unless( defined $refchar ) {
1602	6	50				28	last if $pos == $self->length; # short circuit on last residue
1603							# this in place to handle jason's soon-to-be-committed
1604							# intron mapping code
1605	0					0	goto bottom;
1606							}
1607	3132					2875	my %col = ($refchar => 1);
1608	3132		66			9885	my $dash = ($refchar eq '-' \|\| $refchar eq '.' \|\| $refchar eq ' ');
1609	3132					2499	foreach my $seq ( @seqchars ) {
1610	26906	50				28125	next if $pos >= scalar @$seq;
1611	26906	100	100			87824	$dash = 1 if( $seq->[$pos] eq '-' \|\| $seq->[$pos] eq '.' \|\|
			66
1612							$seq->[$pos] eq ' ' );
1613	26906	50				34760	$col{$seq->[$pos]}++ if defined $seq->[$pos];
1614							}
1615	3132					4640	my @colresidues = sort keys %col;
1616
1617							# if all the values are the same
1618	3132	100				4437	if( $dash ) { $char = $matchchars{'mismatch'} }
	758	100				566
		100
1619	1721					1282	elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620							elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621							# matches for protein seqs
1622							TYPE:
1623	472					363	foreach my $type ( qw(strong weak) ) {
1624							# iterate through categories
1625	804					447	my %groups;
1626							# iterate through each of the aa in the col
1627							# look to see which groups it is in
1628	804					593	foreach my $c ( @colresidues ) {
1629	3075					1683	foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
	30455					24136
	3075					2703
1630	6808					3680	push @{$groups{$f}},$c;
	6808					7786
1631							}
1632							}
1633							GRP:
1634	804					821	foreach my $cols ( values %groups ) {
1635	3944					4326	@$cols = sort @$cols;
1636							# now we are just testing to see if two arrays
1637							# are identical w/o changing either one
1638							# have to be same len
1639	3944	100				5796	next if( scalar @$cols != scalar @colresidues );
1640							# walk down the length and check each slot
1641	196					279	for($_=0;$_ < (scalar @$cols);$_++ ) {
1642	453	50				813	next GRP if( $cols->[$_] ne $colresidues[$_] );
1643							}
1644	196					182	$char = $matchchars{$type};
1645	196					304	last TYPE;
1646							}
1647							}
1648							}
1649							bottom:
1650	3132					3874	$matchline .= $char;
1651							}
1652	6					975	return $matchline;
1653							}
1654
1655
1656							=head2 gap_line
1657
1658							Title : gap_line()
1659							Usage : $line = $align->gap_line()
1660							Function : Generates a gap line - much like consensus string
1661							except that a line where '-' represents gap
1662							Args : (optional) gap line characters ('-' by default)
1663							Returns : string
1664
1665							=cut
1666
1667							sub gap_line {
1668	14			14	1	10	my ($self,$gapchar) = @_;
1669	14		33			39	$gapchar = $gapchar \|\| $self->gap_char;
1670	14					13	my %gap_hsh; # column gaps vector
1671	14					23	foreach my $seq ( $self->each_seq ) {
1672	33					28	my $i = 0;
1673	223					758	map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
	4020					4840
1674	33					62	map {[$i++, $_]} split(//, uc ($seq->seq));
	4020					3398
1675							}
1676	14					16	my $gap_line;
1677	14					24	foreach my $pos ( 0..$self->length-1 ) {
1678	1973	100				2009	$gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1679							}
1680	14					48	return $gap_line;
1681							}
1682
1683							=head2 all_gap_line
1684
1685							Title : all_gap_line()
1686							Usage : $line = $align->all_gap_line()
1687							Function : Generates a gap line - much like consensus string
1688							except that a line where '-' represents all-gap column
1689							Args : (optional) gap line characters ('-' by default)
1690							Returns : string
1691
1692							=cut
1693
1694							sub all_gap_line {
1695	2			2	1	3	my ($self,$gapchar) = @_;
1696	2		66			6	$gapchar = $gapchar \|\| $self->gap_char;
1697	2					2	my %gap_hsh; # column gaps counter hash
1698	2					5	my @seqs = $self->each_seq;
1699	2					3	foreach my $seq ( @seqs ) {
1700	5					4	my $i = 0;
1701	7					16	map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
	54					94
1702	5					11	map {[$i++, $_]} split(//, uc ($seq->seq));
	54					50
1703							}
1704	2					3	my $gap_line;
1705	2					5	foreach my $pos ( 0..$self->length-1 ) {
1706	22	100	100			39	if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707							# gaps column
1708	1					2	$gap_line .= $self->gap_char;
1709							} else {
1710	21					18	$gap_line .= '.';
1711							}
1712							}
1713	2					5	return $gap_line;
1714							}
1715
1716							=head2 gap_col_matrix
1717
1718							Title : gap_col_matrix()
1719							Usage : my $cols = $align->gap_col_matrix()
1720							Function : Generates an array where each element in the array is a
1721							hash reference with a key of the sequence name and a
1722							value of 1 if the sequence has a gap at that column
1723							Returns : Reference to an array
1724							Args : Optional: gap line character ($aln->gap_char or '-' by default)
1725
1726							=cut
1727
1728							sub gap_col_matrix {
1729	0			0	1	0	my ( $self, $gapchar ) = @_;
1730	0		0			0	$gapchar = $gapchar \|\| $self->gap_char;
1731	0					0	my %gap_hsh; # column gaps vector
1732							my @cols;
1733	0					0	foreach my $seq ( $self->each_seq ) {
1734	0					0	my $i = 0;
1735	0					0	my $str = $seq->seq;
1736	0					0	my $len = $seq->length;
1737	0					0	my $ch;
1738	0					0	my $id = $seq->display_id;
1739	0					0	while ( $i < $len ) {
1740	0					0	$ch = substr( $str, $i, 1 );
1741	0					0	$cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1742							}
1743							}
1744	0					0	return \@cols;
1745							}
1746
1747							=head2 match
1748
1749							Title : match()
1750							Usage : $ali->match()
1751							Function : Goes through all columns and changes residues that are
1752							identical to residue in first sequence to match '.'
1753							character. Sets match_char.
1754
1755							USE WITH CARE: Most MSA formats do not support match
1756							characters in sequences, so this is mostly for output
1757							only. NEXUS format (Bio::AlignIO::nexus) can handle
1758							it.
1759							Returns : 1 on success
1760							Argument : a match character, optional, defaults to '.'
1761
1762							=cut
1763
1764							sub match {
1765	1			1	1	2	my ( $self, $match ) = @_;
1766
1767	1		50			4	$match \|\|= '.';
1768	1					1	my ($matching_char) = $match;
1769	1	50				5	$matching_char = "\\$match" if $match =~ /[\^.$\|()\[\]]/; #';
1770	1					5	$self->map_chars( $matching_char, '-' );
1771
1772	1					3	my @seqs = $self->each_seq();
1773	1	50				4	return 1 unless scalar @seqs > 1;
1774
1775	1					2	my $refseq = shift @seqs;
1776	1					2	my @refseq = split //, $refseq->seq;
1777	1					3	my $gapchar = $self->gap_char;
1778
1779	1					2	foreach my $seq (@seqs) {
1780	5					9	my @varseq = split //, $seq->seq();
1781	5					11	for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782	720	100	66			3184	$varseq[$i] = $match
			33
			66
1783							if defined $refseq[$i]
1784							&& ( $refseq[$i] =~ /[A-Za-z\*]/
1785							\|\| $refseq[$i] =~ /$gapchar/ )
1786							&& $refseq[$i] eq $varseq[$i];
1787							}
1788	5					34	$seq->seq( join '', @varseq );
1789							}
1790	1					5	$self->match_char($match);
1791	1					7	return 1;
1792							}
1793
1794
1795
1796							=head2 unmatch
1797
1798							Title : unmatch()
1799							Usage : $ali->unmatch()
1800							Function : Undoes the effect of method match. Unsets match_char.
1801							Returns : 1 on success
1802							Argument : a match character, optional, defaults to '.'
1803
1804							See L and L
1805
1806							=cut
1807
1808							sub unmatch {
1809	5			5	1	9	my ( $self, $match ) = @_;
1810
1811	5		100			23	$match \|\|= '.';
1812
1813	5					18	my @seqs = $self->each_seq();
1814	5	50				16	return 1 unless scalar @seqs > 1;
1815
1816	5					8	my $refseq = shift @seqs;
1817	5					14	my @refseq = split //, $refseq->seq;
1818	5					18	my $gapchar = $self->gap_char;
1819	5					12	foreach my $seq (@seqs) {
1820	114					170	my @varseq = split //, $seq->seq();
1821	114					204	for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822	19782	100	100			85187	$varseq[$i] = $refseq[$i]
			33
			100
1823							if defined $refseq[$i]
1824							&& ( $refseq[$i] =~ /[A-Za-z\*]/
1825							\|\| $refseq[$i] =~ /$gapchar/ )
1826							&& $varseq[$i] eq $match;
1827							}
1828	114					813	$seq->seq( join '', @varseq );
1829							}
1830	5					24	$self->match_char('');
1831	5					70	return 1;
1832							}
1833
1834
1835							=head1 MSA attributes
1836
1837							Methods for setting and reading the MSA attributes.
1838
1839							Note that the methods defining character semantics depend on the user
1840							to set them sensibly. They are needed only by certain input/output
1841							methods. Unset them by setting to an empty string ('').
1842
1843							=head2 id
1844
1845							Title : id
1846							Usage : $myalign->id("Ig")
1847							Function : Gets/sets the id field of the alignment
1848							Returns : An id string
1849							Argument : An id string (optional)
1850
1851							=cut
1852
1853							sub id {
1854	142			142	1	165	my ( $self, $name ) = @_;
1855
1856	142	100				261	if ( defined($name) ) {
1857	72					92	$self->{'_id'} = $name;
1858							}
1859
1860	142					262	return $self->{'_id'};
1861							}
1862
1863							=head2 accession
1864
1865							Title : accession
1866							Usage : $myalign->accession("PF00244")
1867							Function : Gets/sets the accession field of the alignment
1868							Returns : An acc string
1869							Argument : An acc string (optional)
1870
1871							=cut
1872
1873							sub accession {
1874	16			16	1	21	my ( $self, $acc ) = @_;
1875
1876	16	100				34	if ( defined($acc) ) {
1877	8					20	$self->{'_accession'} = $acc;
1878							}
1879
1880	16					31	return $self->{'_accession'};
1881							}
1882
1883							=head2 description
1884
1885							Title : description
1886							Usage : $myalign->description("14-3-3 proteins")
1887							Function : Gets/sets the description field of the alignment
1888							Returns : An description string
1889							Argument : An description string (optional)
1890
1891							=cut
1892
1893							sub description {
1894	25			25	1	36	my ( $self, $name ) = @_;
1895
1896	25	100				52	if ( defined($name) ) {
1897	13					26	$self->{'_description'} = $name;
1898							}
1899
1900	25					55	return $self->{'_description'};
1901							}
1902
1903							=head2 missing_char
1904
1905							Title : missing_char
1906							Usage : $myalign->missing_char("?")
1907							Function : Gets/sets the missing_char attribute of the alignment
1908							It is generally recommended to set it to 'n' or 'N'
1909							for nucleotides and to 'X' for protein.
1910							Returns : An missing_char string,
1911							Argument : An missing_char string (optional)
1912
1913							=cut
1914
1915							sub missing_char {
1916	32			32	1	39	my ( $self, $char ) = @_;
1917
1918	32	100				65	if ( defined $char ) {
1919	22	50				45	$self->throw("Single missing character, not [$char]!")
1920							if CORE::length($char) > 1;
1921	22					36	$self->{'_missing_char'} = $char;
1922							}
1923
1924	32					56	return $self->{'_missing_char'};
1925							}
1926
1927
1928							=head2 match_char
1929
1930							Title : match_char
1931							Usage : $myalign->match_char('.')
1932							Function : Gets/sets the match_char attribute of the alignment
1933							Returns : An match_char string,
1934							Argument : An match_char string (optional)
1935
1936							=cut
1937
1938							sub match_char {
1939	18			18	1	28	my ( $self, $char ) = @_;
1940
1941	18	100				42	if ( defined $char ) {
1942	9	50				19	$self->throw("Single match character, not [$char]!")
1943							if CORE::length($char) > 1;
1944	9					16	$self->{'_match_char'} = $char;
1945							}
1946
1947	18					38	return $self->{'_match_char'};
1948							}
1949
1950							=head2 gap_char
1951
1952							Title : gap_char
1953							Usage : $myalign->gap_char('-')
1954							Function : Gets/sets the gap_char attribute of the alignment
1955							Returns : An gap_char string, defaults to '-'
1956							Argument : An gap_char string (optional)
1957
1958							=cut
1959
1960							sub gap_char {
1961	11242			11242	1	7791	my ( $self, $char ) = @_;
1962
1963	11242	100	100			28552	if ( defined $char \|\| !defined $self->{'_gap_char'} ) {
1964	51	100				116	$char = '-' unless defined $char;
1965	51	50				101	$self->throw("Single gap character, not [$char]!")
1966							if CORE::length($char) > 1;
1967	51					118	$self->{'_gap_char'} = $char;
1968							}
1969	11242					15041	return $self->{'_gap_char'};
1970							}
1971
1972
1973							=head2 symbol_chars
1974
1975							Title : symbol_chars
1976							Usage : my @symbolchars = $aln->symbol_chars;
1977							Function: Returns all the seen symbols (other than gaps)
1978							Returns : array of characters that are the seen symbols
1979							Args : boolean to include the gap/missing/match characters
1980
1981							=cut
1982
1983							sub symbol_chars{
1984	5			5	1	9	my ($self,$includeextra) = @_;
1985
1986	5	100				13	unless ($self->{'_symbols'}) {
1987	3					10	foreach my $seq ($self->each_seq) {
1988	12					32	map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
	2778					2436
1989							}
1990							}
1991	5					6	my %copy = %{$self->{'_symbols'}};
	5					37
1992	5	50				20	if( ! $includeextra ) {
1993	5					14	foreach my $char ( $self->gap_char, $self->match_char,
1994							$self->missing_char) {
1995	15	100				27	delete $copy{$char} if( defined $char );
1996							}
1997							}
1998	5					35	return keys %copy;
1999							}
2000
2001							=head1 Alignment descriptors
2002
2003							These read only methods describe the MSA in various ways.
2004
2005
2006							=head2 score
2007
2008							Title : score
2009							Usage : $str = $ali->score()
2010							Function : get/set a score of the alignment
2011							Returns : a score for the alignment
2012							Argument : an optional score to set
2013
2014							=cut
2015
2016							sub score {
2017	26			26	1	903	my $self = shift;
2018	26	100				68	$self->{score} = shift if @_;
2019	26					46	return $self->{score};
2020							}
2021
2022							=head2 consensus_string
2023
2024							Title : consensus_string
2025							Usage : $str = $ali->consensus_string($threshold_percent)
2026							Function : Makes a strict consensus
2027							Returns : Consensus string
2028							Argument : Optional threshold ranging from 0 to 100.
2029							The consensus residue has to appear at least threshold %
2030							of the sequences at a given location, otherwise a '?'
2031							character will be placed at that location.
2032							(Default value = 0%)
2033
2034							=cut
2035
2036							sub consensus_string {
2037	11			11	1	2946	my $self = shift;
2038	11					11	my $threshold = shift;
2039
2040	11					14	my $out = "";
2041	11					33	my $len = $self->length - 1;
2042
2043	11					23	foreach ( 0 .. $len ) {
2044	2100					2339	$out .= $self->_consensus_aa( $_, $threshold );
2045							}
2046	11					70	return $out;
2047							}
2048
2049
2050							=head2 consensus_conservation
2051
2052							Title : consensus_conservation
2053							Usage : @conservation = $ali->consensus_conservation();
2054							Function : Conservation (as a percent) of each position of alignment
2055							Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2056							Argument :
2057
2058							=cut
2059
2060							sub consensus_conservation {
2061	1			1	1	2	my $self = shift;
2062	1					1	my @cons;
2063	1					2	my $num_sequences = $self->num_sequences;
2064	1					3	foreach my $point (0..$self->length-1) {
2065	422					447	my %hash = $self->_consensus_counts($point);
2066							# max frequency of a non-gap letter
2067	422					547	my $max = (sort {$b<=>$a} values %hash )[0];
	560					466
2068	422					566	push @cons, 100 * $max / $num_sequences;
2069							}
2070	1					41	return @cons;
2071							}
2072
2073							sub _consensus_aa {
2074	2100			2100		1430	my $self = shift;
2075	2100					1566	my $point = shift;
2076	2100		100			3713	my $threshold_percent = shift \|\| -1 ;
2077	2100					1308	my ($seq,%hash,$count,$letter,$key);
2078	2100					2023	my $gapchar = $self->gap_char;
2079	2100					2256	%hash = $self->_consensus_counts($point);
2080	2100					2442	my $number_of_sequences = $self->num_sequences();
2081	2100					2066	my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082	2100					1292	$count = -1;
2083	2100					1292	$letter = '?';
2084
2085	2100					2855	foreach $key ( sort keys %hash ) {
2086							# print "Now at $key $hash{$key}\n";
2087	4898	100	100			11407	if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088	2472					1609	$letter = $key;
2089	2472					2162	$count = $hash{$key};
2090							}
2091							}
2092	2100					3687	return $letter;
2093							}
2094
2095							# Frequency of each letter in one column
2096							sub _consensus_counts {
2097	2522			2522		1674	my $self = shift;
2098	2522					1435	my $point = shift;
2099	2522					1453	my %hash;
2100	2522					2303	my $gapchar = $self->gap_char;
2101	2522					2610	foreach my $seq ( $self->each_seq() ) {
2102	22078					24166	my $letter = substr($seq->seq,$point,1);
2103	22078	50				25203	$self->throw("--$point-----------") if $letter eq '';
2104	22078	100	100			48069	($letter eq $gapchar \|\| $letter =~ /\./) && next;
2105	17334					14802	$hash{$letter}++;
2106							}
2107	2522					5402	return %hash;
2108							}
2109
2110
2111							=head2 consensus_iupac
2112
2113							Title : consensus_iupac
2114							Usage : $str = $ali->consensus_iupac()
2115							Function : Makes a consensus using IUPAC ambiguity codes from DNA
2116							and RNA. The output is in upper case except when gaps in
2117							a column force output to be in lower case.
2118
2119							Note that if your alignment sequences contain a lot of
2120							IUPAC ambiquity codes you often have to manually set
2121							alphabet. Bio::PrimarySeq::_guess_type thinks they
2122							indicate a protein sequence.
2123							Returns : consensus string
2124							Argument : none
2125							Throws : on protein sequences
2126
2127							=cut
2128
2129							sub consensus_iupac {
2130	1			1	1	4	my $self = shift;
2131	1					2	my $out = "";
2132	1					3	my $len = $self->length - 1;
2133
2134							# only DNA and RNA sequences are valid
2135	1					2	foreach my $seq ( $self->each_seq() ) {
2136	2	50				3	$self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137							if $seq->alphabet eq 'protein';
2138							}
2139
2140							# loop over the alignment columns
2141	1					3	foreach my $count ( 0 .. $len ) {
2142	10					11	$out .= $self->_consensus_iupac($count);
2143							}
2144	1					5	return $out;
2145							}
2146
2147							sub _consensus_iupac {
2148	10			10		8	my ($self, $column) = @_;
2149	10					7	my ($string, $char, $rna);
2150
2151							#determine all residues in a column
2152	10					11	foreach my $seq ( $self->each_seq() ) {
2153	20					25	$string .= substr($seq->seq, $column, 1);
2154							}
2155	10					11	$string = uc $string;
2156
2157							# quick exit if there's an N in the string
2158	10	100				13	if ($string =~ /N/) {
2159	1	50				4	$string =~ /\W/ ? return 'n' : return 'N';
2160							}
2161							# ... or if there are only gap characters
2162	9	100				20	return '-' if $string =~ /^\W+$/;
2163
2164							# treat RNA as DNA in regexps
2165	7	50				11	if ($string =~ /U/) {
2166	0					0	$string =~ s/U/T/;
2167	0					0	$rna = 1;
2168							}
2169
2170							# the following s///'s only need to be done to the _first_ ambiguity code
2171							# as we only need to see the _range_ of characters in $string
2172
2173	7	50				9	if ($string =~ /[VDHB]/) {
2174	0					0	$string =~ s/V/AGC/;
2175	0					0	$string =~ s/D/AGT/;
2176	0					0	$string =~ s/H/ACT/;
2177	0					0	$string =~ s/B/CTG/;
2178							}
2179
2180	7	100				11	if ($string =~ /[SKYRWM]/) {
2181	1					2	$string =~ s/S/GC/;
2182	1					1	$string =~ s/K/GT/;
2183	1					2	$string =~ s/Y/CT/;
2184	1					1	$string =~ s/R/AG/;
2185	1					3	$string =~ s/W/AT/;
2186	1					2	$string =~ s/M/AC/;
2187							}
2188
2189							# and now the guts of the thing
2190
2191	7	100				23	if ($string =~ /A/) {
		50
		50
		50
2192	5					6	$char = 'A'; # A A
2193	5	50				14	if ($string =~ /G/) {
		50
		100
2194	0					0	$char = 'R'; # A and G (purines) R
2195	0	0				0	if ($string =~ /C/) {
		0
2196	0					0	$char = 'V'; # A and G and C V
2197	0	0				0	if ($string =~ /T/) {
2198	0					0	$char = 'N'; # A and G and C and T N
2199							}
2200							} elsif ($string =~ /T/) {
2201	0					0	$char = 'D'; # A and G and T D
2202							}
2203							} elsif ($string =~ /C/) {
2204	0					0	$char = 'M'; # A and C M
2205	0	0				0	if ($string =~ /T/) {
2206	0					0	$char = 'H'; # A and C and T H
2207							}
2208							} elsif ($string =~ /T/) {
2209	2					3	$char = 'W'; # A and T W
2210							}
2211							} elsif ($string =~ /C/) {
2212	0					0	$char = 'C'; # C C
2213	0	0				0	if ($string =~ /T/) {
		0
2214	0					0	$char = 'Y'; # C and T (pyrimidines) Y
2215	0	0				0	if ($string =~ /G/) {
2216	0					0	$char = 'B'; # C and T and G B
2217							}
2218							} elsif ($string =~ /G/) {
2219	0					0	$char = 'S'; # C and G S
2220							}
2221							} elsif ($string =~ /G/) {
2222	0					0	$char = 'G'; # G G
2223	0	0				0	if ($string =~ /C/) {
		0
2224	0					0	$char = 'S'; # G and C S
2225							} elsif ($string =~ /T/) {
2226	0					0	$char = 'K'; # G and T K
2227							}
2228							} elsif ($string =~ /T/) {
2229	2					1	$char = 'T'; # T T
2230							}
2231
2232	7	50	33			11	$char = 'U' if $rna and $char eq 'T';
2233	7	100				11	$char = lc $char if $string =~ /\W/;
2234
2235	7					13	return $char;
2236							}
2237
2238
2239							=head2 consensus_meta
2240
2241							Title : consensus_meta
2242							Usage : $seqmeta = $ali->consensus_meta()
2243							Function : Returns a Bio::Seq::Meta object containing the consensus
2244							strings derived from meta data analysis.
2245							Returns : Bio::Seq::Meta
2246							Argument : Bio::Seq::Meta
2247							Throws : non-MetaI object
2248
2249							=cut
2250
2251							sub consensus_meta {
2252	56			56	1	62	my ($self, $meta) = @_;
2253	56	50	33			266	if ($meta && (!ref $meta \|\| !$meta->isa('Bio::Seq::MetaI'))) {
			66
2254	0					0	$self->throw('Not a Bio::Seq::MetaI object');
2255							}
2256	56	100				111	return $self->{'_aln_meta'} = $meta if $meta;
2257	39					55	return $self->{'_aln_meta'}
2258							}
2259
2260							=head2 is_flush
2261
2262							Title : is_flush
2263							Usage : if ( $ali->is_flush() )
2264							Function : Tells you whether the alignment
2265							: is flush, i.e. all of the same length
2266							Returns : 1 or 0
2267							Argument :
2268
2269							=cut
2270
2271							sub is_flush {
2272	53			53	1	64	my ( $self, $report ) = @_;
2273	53					46	my $seq;
2274	53					49	my $length = (-1);
2275	53					50	my $temp;
2276
2277	53					97	foreach $seq ( $self->each_seq() ) {
2278	248	100				305	if ( $length == (-1) ) {
2279	53					117	$length = CORE::length( $seq->seq() );
2280	53					70	next;
2281							}
2282
2283	195					232	$temp = CORE::length( $seq->seq() );
2284	195	50				281	if ( $temp != $length ) {
2285	0	0				0	$self->warn(
2286							"expecting $length not $temp from " . $seq->display_id )
2287							if ($report);
2288	0					0	$self->debug(
2289							"expecting $length not $temp from " . $seq->display_id );
2290	0					0	$self->debug( $seq->seq() . "\n" );
2291	0					0	return 0;
2292							}
2293							}
2294
2295	53					126	return 1;
2296							}
2297
2298
2299							=head2 length
2300
2301							Title : length()
2302							Usage : $len = $ali->length()
2303							Function : Returns the maximum length of the alignment.
2304							To be sure the alignment is a block, use is_flush
2305							Returns : Integer
2306							Argument :
2307
2308							=cut
2309
2310							sub length_aln {
2311	0			0	0	0	my $self = shift;
2312	0					0	$self->deprecated("length_aln - deprecated method. Use length() instead.");
2313	0					0	$self->length(@_);
2314							}
2315
2316							sub length {
2317	2216			2216	1	1948	my $self = shift;
2318	2216					1462	my $seq;
2319	2216					1414	my $length = -1;
2320	2216					1436	my $temp;
2321
2322	2216					2757	foreach $seq ( $self->each_seq() ) {
2323	8524					9505	$temp = $seq->length();
2324	8524	100				10489	if( $temp > $length ) {
2325	2226					1840	$length = $temp;
2326							}
2327							}
2328
2329	2216					3832	return $length;
2330							}
2331
2332
2333							=head2 maxdisplayname_length
2334
2335							Title : maxdisplayname_length
2336							Usage : $ali->maxdisplayname_length()
2337							Function : Gets the maximum length of the displayname in the
2338							alignment. Used in writing out various MSA formats.
2339							Returns : integer
2340							Argument :
2341
2342							=cut
2343
2344							sub maxname_length {
2345	0			0	1	0	my $self = shift;
2346	0					0	$self->deprecated("maxname_length - deprecated method.".
2347							" Use maxdisplayname_length() instead.");
2348	0					0	$self->maxdisplayname_length();
2349							}
2350
2351							sub maxnse_length {
2352	0			0	0	0	my $self = shift;
2353	0					0	$self->deprecated("maxnse_length - deprecated method.".
2354							" Use maxnse_length() instead.");
2355	0					0	$self->maxdisplayname_length();
2356							}
2357
2358							sub maxdisplayname_length {
2359	21			21	1	26	my $self = shift;
2360	21					26	my $maxname = (-1);
2361	21					21	my ( $seq, $len );
2362
2363	21					36	foreach $seq ( $self->each_seq() ) {
2364	122					175	$len = CORE::length $self->displayname( $seq->get_nse() );
2365
2366	122	100				223	if ( $len > $maxname ) {
2367	52					59	$maxname = $len;
2368							}
2369							}
2370
2371	21					44	return $maxname;
2372							}
2373
2374							=head2 max_metaname_length
2375
2376							Title : max_metaname_length
2377							Usage : $ali->max_metaname_length()
2378							Function : Gets the maximum length of the meta name tags in the
2379							alignment for the sequences and for the alignment.
2380							Used in writing out various MSA formats.
2381							Returns : integer
2382							Argument : None
2383
2384							=cut
2385
2386							sub max_metaname_length {
2387	2			2	1	3	my $self = shift;
2388	2					4	my $maxname = (-1);
2389	2					3	my ($seq,$len);
2390
2391							# check seq meta first
2392	2					7	for $seq ( $self->each_seq() ) {
2393	17	100				42	next if !$seq->isa('Bio::Seq::MetaI' \|\| !$seq->meta_names);
2394	11					14	for my $mtag ($seq->meta_names) {
2395	0					0	$len = CORE::length $mtag;
2396	0	0				0	if( $len > $maxname ) {
2397	0					0	$maxname = $len;
2398							}
2399							}
2400							}
2401
2402							# alignment meta
2403	2					9	for my $meta ($self->consensus_meta) {
2404	2	100				6	next unless $meta;
2405	1					2	for my $name ($meta->meta_names) {
2406	1					1	$len = CORE::length $name;
2407	1	50				2	if( $len > $maxname ) {
2408	1					2	$maxname = $len;
2409							}
2410							}
2411							}
2412
2413	2					7	return $maxname;
2414							}
2415
2416							=head2 num_residues
2417
2418							Title : num_residues
2419							Usage : $no = $ali->num_residues
2420							Function : number of residues in total in the alignment
2421							Returns : integer
2422							Argument :
2423							Note : replaces no_residues()
2424
2425							=cut
2426
2427							sub num_residues {
2428	2			2	1	4	my $self = shift;
2429	2					3	my $count = 0;
2430
2431	2					4	foreach my $seq ( $self->each_seq ) {
2432	19					24	my $str = $seq->seq();
2433
2434	19					780	$count += ( $str =~ s/[A-Za-z]//g );
2435							}
2436
2437	2					8	return $count;
2438							}
2439
2440							=head2 num_sequences
2441
2442							Title : num_sequences
2443							Usage : $depth = $ali->num_sequences
2444							Function : number of sequence in the sequence alignment
2445							Returns : integer
2446							Argument : none
2447							Note : replaces no_sequences()
2448
2449							=cut
2450
2451							sub num_sequences {
2452	2553			2553	1	3707	my $self = shift;
2453	2553					2721	return scalar($self->each_seq);
2454							}
2455
2456							=head2 average_percentage_identity
2457
2458							Title : average_percentage_identity
2459							Usage : $id = $align->average_percentage_identity
2460							Function: The function uses a fast method to calculate the average
2461							percentage identity of the alignment
2462							Returns : The average percentage identity of the alignment
2463							Args : None
2464							Notes : This method implemented by Kevin Howe calculates a figure that is
2465							designed to be similar to the average pairwise identity of the
2466							alignment (identical in the absence of gaps), without having to
2467							explicitly calculate pairwise identities proposed by Richard Durbin.
2468							Validated by Ewan Birney ad Alex Bateman.
2469
2470							=cut
2471
2472							sub average_percentage_identity{
2473	9			9	1	13	my ($self,@args) = @_;
2474
2475	9					46	my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476							'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2477
2478	9					13	my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2479
2480	9	50				20	if (! $self->is_flush()) {
2481	0					0	$self->throw("All sequences in the alignment must be the same length");
2482							}
2483
2484	9					17	@seqs = $self->each_seq();
2485	9					19	$len = $self->length();
2486
2487							# load the each hash with correct keys for existence checks
2488
2489	9					27	for( my $index=0; $index < $len; $index++) {
2490	2096					1460	foreach my $letter (@alphabet) {
2491	54496					42445	$countHashes[$index]->{$letter} = 0;
2492							}
2493							}
2494	9					17	foreach my $seq (@seqs) {
2495	32					82	my @seqChars = split //, $seq->seq();
2496	32					73	for( my $column=0; $column < @seqChars; $column++ ) {
2497	7580					4330	my $char = uc($seqChars[$column]);
2498	7580	100				8313	if (exists $countHashes[$column]->{$char}) {
2499	7077					8765	$countHashes[$column]->{$char}++;
2500							}
2501							}
2502							}
2503
2504	9					16	$total = 0;
2505	9					12	$divisor = 0;
2506	9					28	for(my $column =0; $column < $len; $column++) {
2507	2096					1141	my %hash = %{$countHashes[$column]};
	2096					9015
2508	2096					2459	$subdivisor = 0;
2509	2096					4375	foreach my $res (keys %hash) {
2510	54496					35745	$total += $hash{$res}*($hash{$res} - 1);
2511	54496					36789	$subdivisor += $hash{$res};
2512							}
2513	2096					5660	$divisor += $subdivisor * ($subdivisor - 1);
2514							}
2515	9	50				2061	return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2516							}
2517
2518							=head2 percentage_identity
2519
2520							Title : percentage_identity
2521							Usage : $id = $align->percentage_identity
2522							Function: The function calculates the average percentage identity
2523							(aliased to average_percentage_identity)
2524							Returns : The average percentage identity
2525							Args : None
2526
2527							=cut
2528
2529							sub percentage_identity {
2530	5			5	1	9	my $self = shift;
2531	5					15	return $self->average_percentage_identity();
2532							}
2533
2534							=head2 overall_percentage_identity
2535
2536							Title : overall_percentage_identity
2537							Usage : $id = $align->overall_percentage_identity
2538							$id = $align->overall_percentage_identity('short')
2539							Function: The function calculates the percentage identity of
2540							the conserved columns
2541							Returns : The percentage identity of the conserved columns
2542							Args : length value to use, optional defaults to alignment length
2543							possible values: 'align', 'short', 'long'
2544
2545							The argument values 'short' and 'long' refer to shortest and longest
2546							sequence in the alignment. Method modification code by Hongyu Zhang.
2547
2548							=cut
2549
2550							sub overall_percentage_identity{
2551	17			17	1	26	my ($self, $length_measure) = @_;
2552
2553	17					28	my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
	442					435
2554
2555	17					34	my %enum = map {$_ => undef} qw (align short long);
	51					64
2556
2557							$self->throw("Unknown argument [$length_measure]")
2558	17	50	66			53	if $length_measure and not exists $enum{$length_measure};
2559	17		100			52	$length_measure \|\|= 'align';
2560
2561	17	50				29	if (! $self->is_flush()) {
2562	0					0	$self->throw("All sequences in the alignment must be the same length");
2563							}
2564
2565							# Count the residues seen at each position
2566	17					16	my $len;
2567	17					13	my $total = 0; # number of positions with identical residues
2568	17					16	my @countHashes;
2569	17					21	my @seqs = $self->each_seq;
2570	17					16	my $nof_seqs = scalar @seqs;
2571	17					29	my $aln_len = $self->length();
2572	17					22	for my $seq (@seqs) {
2573	90					152	my $seqstr = $seq->seq;
2574
2575							# Count residues for given sequence
2576	90					131	for my $column (0 .. $aln_len-1) {
2577	17788					11603	my $char = uc( substr($seqstr, $column, 1) );
2578	17788	100				18986	if ( exists $alphabet{$char} ) {
2579
2580							# This is a valid char
2581	17019	100				15158	if ( defined $countHashes[$column]->{$char} ) {
2582	12754					7520	$countHashes[$column]->{$char}++;
2583							} else {
2584	4265					3163	$countHashes[$column]->{$char} = 1;
2585							}
2586
2587	17019	100				19673	if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588							# All sequences have this same residue
2589	810					558	$total++;
2590							}
2591
2592							}
2593							}
2594
2595							# Sequence length
2596	90	100	100			286	if ($length_measure eq 'short' \|\| $length_measure eq 'long') {
2597	32					34	my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598	32	100				46	if ($length_measure eq 'short') {
		50
2599	16	100	100			53	if ( (not defined $len) \|\| ($seq_len < $len) ) {
2600	5					7	$len = $seq_len;
2601							}
2602							} elsif ($length_measure eq 'long') {
2603	16	100	100			46	if ( (not defined $len) \|\| ($seq_len > $len) ) {
2604	2					3	$len = $seq_len;
2605							}
2606							}
2607							}
2608
2609							}
2610
2611	17	100				35	if ($length_measure eq 'align') {
2612	15					15	$len = $aln_len;
2613							}
2614
2615	17					467	return ($total / $len ) * 100.0;
2616							}
2617
2618
2619
2620							=head1 Alignment positions
2621
2622							Methods to map a sequence position into an alignment column and back.
2623							column_from_residue_number() does the former. The latter is really a
2624							property of the sequence object and can done using
2625							L:
2626
2627							# select somehow a sequence from the alignment, e.g.
2628							my $seq = $aln->get_seq_by_pos(1);
2629							#$loc is undef or Bio::LocationI object
2630							my $loc = $seq->location_from_column(5);
2631
2632							=head2 column_from_residue_number
2633
2634							Title : column_from_residue_number
2635							Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636							Function: This function gives the position in the alignment
2637							(i.e. column number) of the given residue number in the
2638							sequence with the given name. For example, for the
2639							alignment
2640
2641							Seq1/91-97 AC..DEF.GH.
2642							Seq2/24-30 ACGG.RTY...
2643							Seq3/43-51 AC.DDEF.GHI
2644
2645							column_from_residue_number( "Seq1", 94 ) returns 6.
2646							column_from_residue_number( "Seq2", 25 ) returns 2.
2647							column_from_residue_number( "Seq3", 50 ) returns 10.
2648
2649							An exception is thrown if the residue number would lie
2650							outside the length of the aligment
2651							(e.g. column_from_residue_number( "Seq2", 22 )
2652
2653							Note: If the the parent sequence is represented by more than
2654							one alignment sequence and the residue number is present in
2655							them, this method finds only the first one.
2656
2657							Returns : A column number for the position in the alignment of the
2658							given residue in the given sequence (1 = first column)
2659							Args : A sequence id/name (not a name/start-end)
2660							A residue number in the whole sequence (not just that
2661							segment of it in the alignment)
2662
2663							=cut
2664
2665							sub column_from_residue_number {
2666	27			27	1	446	my ( $self, $name, $resnumber ) = @_;
2667
2668							$self->throw("No sequence with name [$name]")
2669	27	50				53	unless $self->{'_start_end_lists'}->{$name};
2670	27	50				37	$self->throw("Second argument residue number missing") unless $resnumber;
2671
2672	27					45	foreach my $seq ( $self->each_seq_with_id($name) ) {
2673	27					24	my $col;
2674	27					30	eval { $col = $seq->column_from_residue_number($resnumber); };
	27					49
2675	27	50				44	next if $@;
2676	27					47	return $col;
2677							}
2678
2679	0					0	$self->throw( "Could not find a sequence segment in $name "
2680							. "containing residue number $resnumber" );
2681
2682							}
2683
2684							=head1 Sequence names
2685
2686							Methods to manipulate the display name. The default name based on the
2687							sequence id and subsequence positions can be overridden in various
2688							ways.
2689
2690							=head2 displayname
2691
2692							Title : displayname
2693							Usage : $myalign->displayname("Ig", "IgA")
2694							Function : Gets/sets the display name of a sequence in the alignment
2695							Returns : A display name string
2696							Argument : name of the sequence
2697							displayname of the sequence (optional)
2698
2699							=cut
2700
2701							sub displayname {
2702	550			550	1	488	my ( $self, $name, $disname ) = @_;
2703
2704							$self->throw("No sequence with name [$name]")
2705	550	50				886	unless defined $self->{'_seq'}->{$name};
2706
2707	550	100	66			1261	if ( $disname and $name ) {
		100
2708	112					136	$self->{'_dis_name'}->{$name} = $disname;
2709	112					130	return $disname;
2710							}
2711							elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712	48					133	return $self->{'_dis_name'}->{$name};
2713							}
2714							else {
2715	390					653	return $name;
2716							}
2717							}
2718
2719							sub get_displayname {
2720	0			0	0	0	my $self = shift;
2721	0					0	$self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722	0					0	$self->displayname(@_);
2723							}
2724
2725							sub set_displayname {
2726	0			0	0	0	my $self = shift;
2727	0					0	$self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728	0					0	$self->displayname(@_);
2729							}
2730
2731
2732							=head2 set_displayname_count
2733
2734							Title : set_displayname_count
2735							Usage : $ali->set_displayname_count
2736							Function : Sets the names to be name_# where # is the number of
2737							times this name has been used.
2738							Returns : 1, on success
2739							Argument :
2740
2741							=cut
2742
2743							sub set_displayname_count {
2744	1			1	1	2	my $self= shift;
2745	1					2	my (@arr,$name,$seq,$count,$temp,$nse);
2746
2747	1					3	foreach $seq ( $self->each_alphabetically() ) {
2748	16					21	$nse = $seq->get_nse();
2749
2750							#name will be set when this is the second
2751							#time (or greater) is has been seen
2752
2753	16	50	66			35	if( defined $name and $name eq ($seq->id()) ) {
2754	0					0	$temp = sprintf("%s_%s",$name,$count);
2755	0					0	$self->displayname($nse,$temp);
2756	0					0	$count++;
2757							} else {
2758	16					12	$count = 1;
2759	16					18	$name = $seq->id();
2760	16					27	$temp = sprintf("%s_%s",$name,$count);
2761	16					19	$self->displayname($nse,$temp);
2762	16					16	$count++;
2763							}
2764							}
2765	1					5	return 1;
2766							}
2767
2768							=head2 set_displayname_flat
2769
2770							Title : set_displayname_flat
2771							Usage : $ali->set_displayname_flat()
2772							Function : Makes all the sequences be displayed as just their name,
2773							not name/start-end (NSE)
2774							Returns : 1
2775							Argument :
2776
2777							=cut
2778
2779							sub set_displayname_flat {
2780	11			11	1	16	my $self = shift;
2781	11					17	my ( $nse, $seq );
2782
2783	11					21	foreach $seq ( $self->each_seq() ) {
2784	79					111	$nse = $seq->get_nse();
2785	79					107	$self->displayname( $nse, $seq->id() );
2786							}
2787	11					28	return 1;
2788							}
2789
2790
2791							=head2 set_displayname_normal
2792
2793							Title : set_displayname_normal
2794							Usage : $ali->set_displayname_normal()
2795							Function : Makes all the sequences be displayed as name/start-end (NSE)
2796							Returns : 1, on success
2797							Argument :
2798
2799							=cut
2800
2801							sub set_displayname_normal {
2802	1			1	1	2	my $self = shift;
2803	1					2	my ( $nse, $seq );
2804
2805	1					3	foreach $seq ( $self->each_seq() ) {
2806	16					18	$nse = $seq->get_nse();
2807	16					18	$self->displayname( $nse, $nse );
2808							}
2809	1					5	return 1;
2810							}
2811
2812							=head2 source
2813
2814							Title : source
2815							Usage : $obj->source($newval)
2816							Function: sets the Alignment source program
2817							Example :
2818							Returns : value of source
2819							Args : newvalue (optional)
2820
2821							=cut
2822
2823							sub source {
2824	123			123	1	607	my ( $self, $value ) = @_;
2825	123	100				349	if ( defined $value ) {
2826	111					195	$self->{'_source'} = $value;
2827							}
2828	123					215	return $self->{'_source'};
2829							}
2830
2831
2832							=head2 set_displayname_safe
2833
2834							Title : set_displayname_safe
2835							Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836							Function : Assign machine-generated serial names to sequences in input order.
2837							Designed to protect names during PHYLIP runs. Assign 10-char string
2838							in the form of "S000000001" to "S999999999". Restore the original
2839							names using "restore_displayname".
2840							Returns : 1. a new $aln with system names;
2841							2. a hash ref for restoring names
2842							Argument : Number for id length (default 10)
2843
2844							=cut
2845
2846							sub set_displayname_safe {
2847	1			1	1	2	my $self = shift;
2848	1		50			5	my $idlength = shift \|\| 10;
2849	1					1	my ( $seq, %phylip_name );
2850	1					1	my $ct = 0;
2851	1					3	my $new = Bio::SimpleAlign->new();
2852	1					2	foreach $seq ( $self->each_seq() ) {
2853	14					13	$ct++;
2854	14					47	my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855	14					21	$phylip_name{$pname} = $seq->id();
2856							my $new_seq = Bio::LocatableSeq->new(
2857							-id => $pname,
2858							-seq => $seq->seq(),
2859							-alphabet => $seq->alphabet,
2860							-start => $seq->{_start},
2861							-end => $seq->{_end}
2862	14					30	);
2863	14					43	$new->add_seq($new_seq);
2864							}
2865
2866							$self->debug(
2867	1					5	"$ct seq names changed. Restore names by using restore_displayname.");
2868	1					3	return ( $new, \%phylip_name );
2869							}
2870
2871
2872							=head2 restore_displayname
2873
2874							Title : restore_displayname
2875							Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876							Function : Restore original sequence names (after running
2877							$ali->set_displayname_safe)
2878							Returns : a new $aln with names restored.
2879							Argument : a hash reference of names from "set_displayname_safe".
2880
2881							=cut
2882
2883							sub restore_displayname {
2884	1			1	1	2	my $self = shift;
2885	1					1	my $ref=shift;
2886	1					7	my %name=%$ref;
2887	1					3	my $new=Bio::SimpleAlign->new();
2888	1					3	foreach my $seq ( $self->each_seq() ) {
2889	14	50				24	$self->throw("No sequence with name") unless defined $name{$seq->id()};
2890							my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891							-seq => $seq->seq(),
2892							-alphabet => $seq->alphabet,
2893							-start => $seq->{_start},
2894							-end => $seq->{_end}
2895	14					25	);
2896	14					38	$new->add_seq($new_seq);
2897							}
2898	1					5	return $new;
2899							}
2900
2901							=head2 sort_by_start
2902
2903							Title : sort_by_start
2904							Usage : $ali->sort_by_start
2905							Function : Changes the order of the alignment to the start position of each
2906							subalignment
2907							Returns : 1 on success
2908							Argument :
2909
2910							=cut
2911
2912							sub sort_by_start {
2913	1			1	1	2	my $self = shift;
2914	1					1	my ($seq,$nse,@arr,%hash,$count);
2915	1					2	foreach $seq ( $self->each_seq() ) {
2916	3					6	$nse = $seq->get_nse;
2917	3					5	$hash{$nse} = $seq;
2918							}
2919	1					2	$count = 0;
2920	1					1	%{$self->{'_order'}} = (); # reset the hash;
	1					3
2921	1					4	foreach $nse ( sort _startend keys %hash) {
2922	3					4	$self->{'_order'}->{$count} = $nse;
2923	3					2	$count++;
2924							}
2925	1					3	1;
2926							}
2927
2928							sub _startend {
2929	2			2		5	my ($aname,$arange) = split (/[\/]/,$a);
2930	2					3	my ($bname,$brange) = split (/[\/]/,$b);
2931	2					4	my ($astart,$aend) = split(/\-/,$arange);
2932	2					3	my ($bstart,$bend) = split(/\-/,$brange);
2933	2					4	return $astart <=> $bstart;
2934							}
2935
2936							=head2 bracket_string
2937
2938							Title : bracket_string
2939							Usage : my @params = (-refseq => 'testseq',
2940							-allele1 => 'allele1',
2941							-allele2 => 'allele2',
2942							-delimiters => '{}',
2943							-separator => '/');
2944							$str = $aln->bracket_string(@params)
2945
2946							Function : When supplied with a list of parameters (see below), returns a
2947							string in BIC format. This is used for allelic comparisons.
2948							Briefly, if either allele contains a base change when compared to
2949							the refseq, the base or gap for each allele is represented in
2950							brackets in the order present in the 'alleles' parameter.
2951
2952							For the following data:
2953
2954							>testseq
2955							GGATCCATTGCTACT
2956							>allele1
2957							GGATCCATTCCTACT
2958							>allele2
2959							GGAT--ATTCCTCCT
2960
2961							the returned string with parameters 'refseq => testseq' and
2962							'alleles => [qw(allele1 allele2)]' would be:
2963
2964							GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965							Returns : BIC-formatted string
2966							Argument : Required args
2967							refseq : string (ID) of the reference sequence used
2968							as basis for comparison
2969							allele1 : string (ID) of the first allele
2970							allele2 : string (ID) of the second allele
2971							Optional args
2972							delimiters: two symbol string of left and right delimiters.
2973							Only the first two symbols are used
2974							default = '[]'
2975							separator : string used as a separator. Only the first
2976							symbol is used
2977							default = '/'
2978							Throws : On no refseq/alleles, or invalid refseq/alleles.
2979
2980							=cut
2981
2982							sub bracket_string {
2983	14			14	1	5598	my ($self, @args) = @_;
2984	14					44	my ($ref, $a1, $a2, $delim, $sep) =
2985							$self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986	14	50	33			64	$self->throw('Missing refseq/allele1/allele2') if (!$a1 \|\| !$a2 \|\| !$ref);
			33
2987	14					11	my ($ld, $rd);
2988	14	100				29	($ld, $rd) = split('', $delim, 2) if $delim;
2989	14		100			34	$ld \|\|= '[';
2990	14		100			23	$rd \|\|= ']';
2991	14		100			24	$sep \|\|= '/';
2992							my ($refseq, $allele1, $allele2) =
2993	14					13	map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
	42					46
2994	14	50	33			49	if (!$refseq \|\| !$allele1 \|\| !$allele2) {
			33
2995	0					0	$self->throw("One of your refseq/allele IDs is invalid!");
2996							}
2997	14					21	my $len = $self->length-1;
2998	14					14	my $bic = '';
2999							# loop over the alignment columns
3000	14					21	for my $column ( 0 .. $len ) {
3001	210					127	my $string;
3002							my ($compres, $res1, $res2) =
3003	210					140	map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
	630					622
3004							# are any of the allele symbols different from the refseq?
3005	210	100	100			534	$string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006							$ld.$res1.$sep.$res2.$rd;
3007	210					192	$bic .= $string;
3008							}
3009	14					34	return $bic;
3010							}
3011
3012
3013							=head2 methods implementing Bio::FeatureHolderI
3014
3015							FeatureHolderI implementation to support labeled character sets like one
3016							would get from NEXUS represented data.
3017
3018							=head2 get_SeqFeatures
3019
3020							Usage : @features = $aln->get_SeqFeatures
3021							Function: Get the feature objects held by this feature holder.
3022							Example :
3023							Returns : an array of Bio::SeqFeatureI implementing objects
3024							Args : optional filter coderef, taking a Bio::SeqFeatureI
3025							: as argument, returning TRUE if wanted, FALSE if
3026							: unwanted
3027
3028							=cut
3029
3030							sub get_SeqFeatures {
3031	4			4	1	12	my $self = shift;
3032	4					6	my $filter_cb = shift;
3033	4	50	33			17	$self->throw("Arg (filter callback) must be a coderef")
3034							unless !defined($filter_cb)
3035							or ref($filter_cb) eq 'CODE';
3036	4	100				10	if ( !defined $self->{'_as_feat'} ) {
3037	2					5	$self->{'_as_feat'} = [];
3038							}
3039	4	50				11	if ($filter_cb) {
3040	0					0	return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
	0					0
	0					0
3041							}
3042	4					4	return @{ $self->{'_as_feat'} };
	4					11
3043							}
3044
3045
3046							=head2 add_SeqFeature
3047
3048							Usage : $aln->add_SeqFeature($subfeat);
3049							Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050							(see L) is supported, but has no effect.
3051							Example :
3052							Returns : 1 on success
3053							Args : a Bio::SeqFeatureI object
3054
3055							=cut
3056
3057							sub add_SeqFeature {
3058	9			9	1	11	my ($self, @feat) = @_;
3059
3060	9	100				16	$self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3061
3062	9	50				15	if (scalar @feat > 1) {
3063	0					0	$self->deprecated(
3064							-message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065							' is deprecated and will be removed in a future version. '.
3066							'Add a single feature at a time instead.',
3067							-warn_version => 1.007,
3068							-throw_version => 1.009,
3069							);
3070							}
3071
3072	9					11	for my $feat ( @feat ) {
3073
3074	9	50				14	next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075
3076	9	50				30	if( !$feat->isa("Bio::SeqFeatureI") ) {
3077	0					0	$self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078							}
3079
3080	9					7	push @{$self->{'_as_feat'}}, $feat;
	9					15
3081							}
3082	9					12	return 1;
3083							}
3084
3085
3086							=head2 remove_SeqFeatures
3087
3088							Usage : $obj->remove_SeqFeatures
3089							Function: Removes all SeqFeatures. If you want to remove only a subset,
3090							remove that subset from the returned array, and add back the rest.
3091							Returns : The array of Bio::SeqFeatureI features that was
3092							deleted from this alignment.
3093							Args : none
3094
3095							=cut
3096
3097							sub remove_SeqFeatures {
3098	0			0	1	0	my $self = shift;
3099
3100	0	0				0	return () unless $self->{'_as_feat'};
3101	0					0	my @feats = @{$self->{'_as_feat'}};
	0					0
3102	0					0	$self->{'_as_feat'} = [];
3103	0					0	return @feats;
3104							}
3105
3106							=head2 feature_count
3107
3108							Title : feature_count
3109							Usage : $obj->feature_count()
3110							Function: Return the number of SeqFeatures attached to the alignment
3111							Returns : integer representing the number of SeqFeatures
3112							Args : None
3113
3114							=cut
3115
3116							sub feature_count {
3117	2			2	1	3	my ($self) = @_;
3118
3119	2	50				5	if (defined($self->{'_as_feat'})) {
3120	2					1	return ($#{$self->{'_as_feat'}} + 1);
	2					9
3121							} else {
3122	0					0	return 0;
3123							}
3124							}
3125
3126							=head2 get_all_SeqFeatures
3127
3128							Title : get_all_SeqFeatures
3129							Usage :
3130							Function: Get all SeqFeatures.
3131							Example :
3132							Returns : an array of Bio::SeqFeatureI implementing objects
3133							Args : none
3134							Note : Falls through to Bio::FeatureHolderI implementation.
3135
3136							=cut
3137
3138							=head2 methods for Bio::AnnotatableI
3139
3140							AnnotatableI implementation to support sequence alignments which
3141							contain annotation (NEXUS, Stockholm).
3142
3143							=head2 annotation
3144
3145							Title : annotation
3146							Usage : $ann = $aln->annotation or
3147							$aln->annotation($ann)
3148							Function: Gets or sets the annotation
3149							Returns : Bio::AnnotationCollectionI object
3150							Args : None or Bio::AnnotationCollectionI object
3151
3152							See L and L
3153							for more information
3154
3155							=cut
3156
3157							sub annotation {
3158	87			87	1	97	my ($obj,$value) = @_;
3159	87	100				189	if( defined $value ) {
		100
3160	46	50				161	$obj->throw("object of class ".ref($value)." does not implement ".
3161							"Bio::AnnotationCollectionI. Too bad.")
3162							unless $value->isa("Bio::AnnotationCollectionI");
3163	46					59	$obj->{'_annotation'} = $value;
3164							} elsif( ! defined $obj->{'_annotation'}) {
3165	21					88	$obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166							}
3167	87					149	return $obj->{'_annotation'};
3168							}
3169
3170							=head1 Deprecated methods
3171
3172							=cut
3173
3174							=head2 no_residues
3175
3176							Title : no_residues
3177							Usage : $no = $ali->no_residues
3178							Function : number of residues in total in the alignment
3179							Returns : integer
3180							Argument :
3181							Note : deprecated in favor of num_residues()
3182
3183							=cut
3184
3185							sub no_residues {
3186	0			0	1	0	my $self = shift;
3187	0					0	$self->deprecated(-warn_version => 1.0069,
3188							-throw_version => 1.0075,
3189							-message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190	0					0	$self->num_residues(@_);
3191							}
3192
3193							=head2 no_sequences
3194
3195							Title : no_sequences
3196							Usage : $depth = $ali->no_sequences
3197							Function : number of sequence in the sequence alignment
3198							Returns : integer
3199							Argument :
3200							Note : deprecated in favor of num_sequences()
3201
3202							=cut
3203
3204							sub no_sequences {
3205	0			0	1	0	my $self = shift;
3206	0					0	$self->deprecated(-warn_version => 1.0069,
3207							-throw_version => 1.0075,
3208							-message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209	0					0	$self->num_sequences(@_);
3210							}
3211
3212							=head2 mask_columns
3213
3214							Title : mask_columns
3215							Usage : $aln2 = $aln->mask_columns(20,30)
3216							Function : Masks a slice of the alignment inclusive of start and
3217							end columns, and the first column in the alignment is denoted 1.
3218							Mask beyond the length of the sequence does not do padding.
3219							Returns : A Bio::SimpleAlign object
3220							Args : Positive integer for start column, positive integer for end column,
3221							optional string value use for the mask. Example:
3222
3223							$aln2 = $aln->mask_columns(20,30,'?')
3224							Note : Masking must use a character that is not used for gaps or
3225							frameshifts. These can be adjusted using the relevant global
3226							variables, but be aware these may be (uncontrollably) modified
3227							elsewhere within BioPerl (see bug 2715)
3228
3229							=cut
3230
3231							sub mask_columns {
3232							#based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233	6			6	1	1084	my $self = shift;
3234
3235	6					11	my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236							$Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3237
3238							# coordinates are alignment-based, not sequence-based
3239	6					6	my ($start, $end, $mask_char) = @_;
3240	6	50				11	unless (defined $mask_char) { $mask_char = 'N' }
	0					0
3241
3242	6	50	33			46	$self->throw("Mask start has to be a positive integer and less than ".
			33
3243							"alignment length, not [$start]")
3244							unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245	6	50	33			35	$self->throw("Mask end has to be a positive integer and less than ".
			33
3246							"alignment length, not [$end]")
3247							unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248	6	50				9	$self->throw("Mask start [$start] has to be smaller than or equal to ".
3249							"end [$end]") unless $start <= $end;
3250	6	50	33			36	$self->throw("Mask character $mask_char has to be a single character ".
3251							"and not a gap or frameshift symbol")
3252							unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3253
3254	6					9	my $aln = $self->new;
3255	6					12	$aln->id($self->id);
3256	6					11	foreach my $seq ( $self->each_seq() ) {
3257	21					49	my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258							-alphabet => $seq->alphabet,
3259							-strand => $seq->strand,
3260							-verbose => $self->verbose);
3261
3262							# convert from 1-based alignment coords!
3263	21					48	my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264	21					124	$masked_string =~ s{[^$nonres]}{$mask_char}g;
3265	21					32	my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266	21					37	$new_seq->seq($new_dna_string);
3267	21					42	$aln->add_seq($new_seq);
3268							}
3269							# Preserve chosen mask character, it may be need later (like in 'slice')
3270	6					11	$aln->{_mask_char} = $mask_char;
3271	6					12	return $aln;
3272							}
3273
3274							1;