File Coverage

blib/lib/Bio/MUST/Core/Ali.pm

Criterion	Covered	Total	%
statement	278	279	99.6
branch	65	76	85.5
condition	14	18	77.7
subroutine	56	57	98.2
pod	33	33	100.0
total	446	463	96.3

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Ali;
2							# ABSTRACT: Multiple sequence alignment
3							# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4							# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5							$Bio::MUST::Core::Ali::VERSION = '0.212670';
6	17			17		13671	use Moose;
	17					47
	17					129
7	17			17		92080	use namespace::autoclean;
	17					44
	17					212
8
9	17			17		1661	use autodie;
	17					66
	17					154
10	17			17		91106	use feature qw(say);
	17					64
	17					1478
11
12							# use Smart::Comments;
13
14	17			17		136	use Carp;
	17					40
	17					1233
15	17			17		132	use File::Temp;
	17					39
	17					1437
16	17			17		139	use List::AllUtils qw(uniq indexes sum0);
	17					47
	17					982
17	17			17		118	use Path::Class qw(file);
	17					53
	17					837
18	17			17		112	use POSIX qw(ceil floor);
	17					46
	17					129
19	17			17		11520	use Statistics::Descriptive;
	17					373152
	17					701
20	17			17		8900	use Tie::IxHash;
	17					41782
	17					645
21
22	17			17		149	use Bio::MUST::Core::Types;
	17					44
	17					500
23	17			17		102	use Bio::MUST::Core::Constants qw(:gaps :files);
	17					44
	17					3977
24	17			17		139	use aliased 'Bio::MUST::Core::Seq';
	17					40
	17					167
25	17			17		3904	use aliased 'Bio::MUST::Core::SeqMask';
	17					48
	17					71
26
27							# TODO: add information about methods available in Ali-like objects
28
29							# ATTRIBUTES
30
31
32							has 'seqs' => (
33							traits => ['Array'],
34							is => 'ro',
35							isa => 'ArrayRef[Bio::MUST::Core::Seq]',
36							default => sub { [] },
37							writer => '_set_seqs',
38							handles => {
39							add_seq => 'push',
40							get_seq => 'get',
41							set_seq => 'set',
42							delete_seq => 'delete',
43							insert_seq => 'insert',
44							count_seqs => 'count',
45							all_seqs => 'elements',
46							first_seq => 'first',
47							filter_seqs => 'grep',
48							},
49							);
50
51
52							has 'file' => (
53							is => 'ro',
54							isa => 'Bio::MUST::Core::Types::File',
55							default => 'untitled.ali',
56							coerce => 1,
57							handles => {
58							filename => 'stringify',
59							},
60							);
61
62
63							has 'guessing' => (
64							traits => ['Bool'],
65							is => 'ro',
66							isa => 'Bool',
67							default => 1,
68							handles => {
69							dont_guess => 'unset',
70							guess => 'set',
71							},
72							);
73
74
75							with 'Bio::MUST::Core::Roles::Commentable',
76							'Bio::MUST::Core::Roles::Listable';
77							with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
78
79							# CONSTRUCTORS
80
81
82
83							sub clone {
84	1			1	1	4	my $self = shift;
85
86							return $self->new(
87							comments => [ $self->all_comments ],
88	1					48	seqs => [ map { $_->clone } $self->all_seqs ],
	250					712
89							file => file( $self->filename ),
90							guessing => $self->guessing,
91							);
92							}
93
94							# ACCESSORS
95
96
97							sub get_seq_with_id {
98	4			4	1	1985	my $self = shift;
99	4					17	my $id = shift;
100
101	4			10		200	my $seq = $self->first_seq( sub { $_->full_id eq $id } );
	10					44
102	4	100				44	carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
103							unless $seq;
104
105	4					464	return $seq;
106							}
107
108
109							sub all_new_seqs {
110	6			6	1	20	return shift->filter_seqs( sub { $_->is_new } );
	2			2		106
111							}
112
113
114							sub all_but_new_seqs {
115	3			3	1	10	return shift->filter_seqs( sub { not $_->is_new } );
	1			1		82
116							}
117
118
119							sub all_seq_ids {
120	105			105	1	3943	my $self = shift;
121	105					4420	return map { $_->seq_id } $self->all_seqs;
	1510					37655
122							}
123
124							# PROPERTIES
125
126
127							sub has_uniq_ids {
128	64			64	1	1562	my $self = shift;
129
130	64					211	my @ids = map { $_->full_id } $self->all_seq_ids;
	941					24065
131	64	100				2415	return 1 if $self->count_seqs == uniq @ids;
132	7					234	return 0;
133							}
134
135
136							sub is_protein {
137	77			77	1	2635	my $self = shift;
138	77	100		96		3277	return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
	96					534
139	5					31	return 0;
140							}
141
142
143							sub is_aligned {
144	310			310	1	587	my $self = shift;
145	310	100				10966	return 0 if not $self->guessing;
146	307	100		8588		11599	return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
	8588					16726
147	116					802	return 0;
148							}
149
150
151							sub width {
152	189			189	1	433	my $self = shift;
153	189	100				617	$self->uniformize if $self->is_aligned; # pad seqs for robustness
154	189					945	return $self->_max_seq_len;
155							}
156
157							sub _max_seq_len {
158	365			365		678	my $self = shift;
159	365					12299	my @lengths = map { $_->seq_len } $self->all_seqs;
	12841					384806
160	365		50			5034	return (List::AllUtils::max @lengths) // 0; # to avoid warnings
161							}
162
163
164							sub seq_len_stats {
165	1			1	1	13	my $self = shift;
166
167	1					44	my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
	10					27
168	1					16	my $stat = Statistics::Descriptive::Full->new;
169	1					139	$stat->add_data( \@lengths );
170	1					259	my @quantiles = map { $stat->quantile($_) } 0..4;
	5					253
171
172	1					46	return @quantiles;
173							}
174
175
176							sub perc_miss {
177	2			2	1	6	my $self = shift;
178
179	2					85	my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
	255					540
180	2	50				19	return 0.0 unless $n;
181
182	2					11	my $total = $self->width * $self->count_seqs;
183	2					19	return 100.0 * ($total - $n) / $total;
184							}
185
186							# MUTATORS
187
188
189							sub uc_seqs {
190	1			1	1	12	my $self = shift;
191
192	1					44	$_->uc for $self->all_seqs;
193	1					4	return $self;
194							}
195
196
197							sub recode_seqs { ## no critic (RequireArgUnpacking)
198	1			1	1	20	my $self = shift;
199
200	1					44	$_->recode(@_) for $self->all_seqs;
201	1					7	return $self;
202							}
203
204
205							sub degap_seqs {
206	1			1	1	8	my $self = shift;
207
208	1					39	$_->degap for $self->all_seqs;
209	1					4	return $self;
210							}
211
212
213							sub spacify_seqs {
214	176			176	1	364	my $self = shift;
215
216	176					6082	$_->spacify for $self->all_seqs;
217	176					489	return $self;
218							}
219
220
221							sub gapify_seqs { ## no critic (RequireArgUnpacking)
222	2			2	1	19	my $self = shift;
223	2					78	$_->gapify(@_) for $self->all_seqs;
224	2					8	return $self;
225							}
226
227
228							sub trim_seqs {
229	176			176	1	337	my $self = shift;
230
231	176					6208	$_->trim for $self->all_seqs;
232	176					691	return $self;
233							}
234
235
236							sub pad_seqs {
237	176			176	1	412	my $self = shift;
238
239	176					538	my $bound = $self->_max_seq_len;
240	176					5923	$_->pad_to($bound) for $self->all_seqs;
241	176					550	return $self;
242							}
243
244
245							sub uniformize {
246	175			175	1	371	my $self = shift;
247
248							# TODO: profile code (triggers?) to avoid useless multiple calls
249
250	175					624	$self->spacify_seqs;
251	175					795	$self->trim_seqs;
252	175					699	$self->pad_seqs;
253
254	175					343	return $self;
255							}
256
257
258							sub clear_new_tags {
259	1			1	1	4	my $self = shift;
260
261	1					42	$_->clear_new_tag for $self->all_seqs;
262	1					4	return $self;
263							}
264
265
266							sub shorten_ids { ## no critic (RequireArgUnpacking)
267	12			12	1	65	return shift->_change_ids_(1, @_);
268							}
269
270
271							sub restore_ids { ## no critic (RequireArgUnpacking)
272	10			10	1	44	return shift->_change_ids_(0, @_);
273							}
274
275							sub _change_ids_ {
276	22			22		59	my $self = shift;
277	22					47	my $abbr = shift;
278	22					45	my $mapper = shift;
279
280	22					868	for my $seq ($self->all_seqs) {
281	133	100				702	my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id )
282							: $mapper->long_id_for( $seq->full_id );
283	133	50				4236	$seq->set_seq_id($new_id) if $new_id;
284							} # Note: leave id alone if not found
285
286	22					115	return $self;
287							}
288
289
290							sub apply_list {
291	1			1	1	8	my $self = shift;
292	1					4	my $list = shift;
293
294							# loop through seqs from bottom to top
295							# ... and delete seq from Ali if not listed in list
296	1					42	my $seq_n = $self->count_seqs;
297
298	1					8	for (my $i = $seq_n-1; $i >= 0; $i--) {
299	250	100				8067	$self->delete_seq($i)
300							unless $list->is_listed( $self->get_seq($i)->full_id );
301							}
302
303	1					8	return $self;
304							}
305
306
307							sub _premask_check {
308	40			40		476	my $self = shift;
309	40					95	my $mask = shift;
310
311							# warn of unaligned Ali
312	40	50				160	carp '[BMC] Note: Ali does not look aligned!'
313							. ' This might not be an issue if seqs are ultra-conserved!'
314							unless $self->is_aligned;
315	40					254	$self->uniformize;
316
317							# warn of empty SeqMask
318							carp '[BMC] Note: applying this mask will result in a zero-width Ali!'
319	40	50		1845		1782	unless List::AllUtils::any { $_ } $mask->all_states;
	1845					2302
320
321							# check that SeqMask is compatible with Ali
322							# potential bugs could come from constant sites etc
323	40					14410	my $a_width = $self->width;
324	40					1594	my $m_width = $mask->mask_len;
325	40	100				182	carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!"
326							. ' This might not be an issue if the mask results from blocks.'
327							unless $a_width == $m_width;
328
329	40					919	return; # return values of before subs are ignored
330							}
331
332							before 'apply_mask' => \&_premask_check;
333
334							sub apply_mask {
335							my $self = shift;
336							my $mask = shift;
337
338							# select sites for each seq using a precomputed array slice
339							my @indexes = indexes { $_ } $mask->all_states;
340							$_->_set_seq( join q{}, ( $_->all_states )[@indexes] )
341							for $self->all_seqs;
342
343							return $self;
344							}
345
346
347							sub idealize { ## no critic (RequireArgUnpacking)
348	13			13	1	107	my $self = shift;
349	13					154	return $self->apply_mask( SeqMask->ideal_mask($self, @_) );
350							}
351
352							# MISC METHODS
353
354
355							sub gapmiss_regex {
356	73	100		73	1	1152	return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS;
357							}
358
359
360							sub map_coords {
361	1			1	1	14	my $self = shift;
362	1					4	my $id = shift;
363	1					3	my $coords_in = shift;
364
365	1					6	my $seq = $self->get_seq_with_id($id);
366	1					7	my @states = $seq->all_states;
367
368	1					3	my $count = 0;
369	1					2	my @index;
370	1					4	for my $state (@states) {
371	130	100				351	$count++ if $state !~ $GAP;
372	130					222	push @index, $count;
373							}
374
375	1					2	my @coords_out = map { $index[$_-1] } @{$coords_in};
	6					14
	1					3
376
377	1					14	return \@coords_out;
378							}
379
380							# ALIASES
381
382
383							sub height {
384	0			0	1	0	return shift->count_seqs;
385							}
386
387							# I/O METHODS
388
389
390							sub load {
391	69			69	1	26466	my $class = shift;
392	69					162	my $infile = shift;
393
394	69					397	open my $in, '<', $infile;
395
396	69					33740	my $ali = $class->new( file => $infile );
397	69					451	my $seq_id;
398							my $seq;
399
400							LINE:
401	69					15067	while (my $line = <$in>) {
402	4091					7888	chomp $line;
403
404							# skip empty lines and process comments
405	4091	100	100			26130	next LINE if $line =~ $EMPTY_LINE
406							\|\| $ali->is_comment($line);
407
408							# at each '>' char...
409	3944					14367	my ($defline) = $line =~ $DEF_LINE;
410	3944	100				8598	if ($defline) {
411
412							# add current seq to ali (if any)
413	1662	100				3365	if ($seq) {
414	1593					48380	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
415	1593					54646	$ali->add_seq($new_seq);
416	1593					2989	$seq = q{};
417							}
418
419	1662					2768	$seq_id = $defline;
420	1662					24372	next LINE;
421							}
422
423							# elongate current seq (seqs can be broken on several lines)
424	2282					10687	$seq .= $line;
425							}
426
427							# add last seq to ali (if any)
428	69	50				283	if ($seq) {
429	69					2410	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
430	69					2428	$ali->add_seq($new_seq);
431							}
432
433	69					1797	return $ali;
434							}
435
436
437							before qr{\Astore}xms => sub {
438							my $self = shift;
439
440							# perform pre-storage duties
441							carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids;
442							$self->uniformize if $self->is_aligned;
443
444							return;
445							};
446
447
448							sub store { ## no critic (RequireArgUnpacking)
449							my $self = shift;
450							my $outfile = shift;
451
452							# automatically redirect to store_fasta for non-Ali outfile names
453							return $self->store_fasta($outfile, @_) # note the currying
454							unless $outfile =~ $ALI_SUFFIX;
455
456							open my $out, '>', $outfile;
457
458							print {$out} $self->header;
459							for my $seq ($self->all_seqs) {
460							say {$out} '>' . $seq->full_id;
461							say {$out} $seq->seq;
462							}
463
464							return;
465							}
466
467
468							sub store_fasta {
469							my $self = shift;
470							my $outfile = shift;
471							my $args = shift // {}; # HashRef (should not be empty...)
472
473							my $degap = $args->{degap} // 0;
474							my $clean = $args->{clean} // 0;
475							my $gap = $args->{gapify} // 'X';
476							my $chunk = $args->{chunk} // 60;
477							my $nowrap = $chunk < 0 ? 1 : 0;
478							my $is_aligned = $self->is_aligned;
479
480							open my $out, '>', $outfile;
481
482							for my $seq ($self->all_seqs) {
483							say {$out} '>' . $seq->foreign_id;
484
485							# optionally clean and/or degap seq
486							$seq = $seq->clone if $clean \|\| $degap; # clone seq only if needed
487							$seq->gapify($gap) if $clean;
488							$seq->degap if $degap;
489
490							my $width = $seq->seq_len;
491							$chunk = $width if $nowrap; # optionally disable wrap
492
493							my $str = $seq->wrapped_str($chunk);
494							$str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned
495							print {$out} $str;
496							}
497
498							return;
499							}
500
501
502							sub temp_fasta {
503	9			9	1	25	my $self = shift;
504	9		50			31	my $args = shift // {}; # HashRef (should not be empty...)
505
506							# abbreviate ids (possibly using a custom prefix)
507	9					23	my $prefix = $args->{id_prefix};
508	9					22	delete $args->{id_prefix}; # not necessarily required
509	9					54	my $mapper = $self->std_mapper($prefix);
510	9					51	$self->shorten_ids($mapper);
511
512							# write temporary .fasta file using standard abbr_ids
513							# ...and restore long_ids afterwards
514	9					94	my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta');
515	9					5348	$self->store_fasta($out->filename, $args);
516	9					80	$self->restore_ids($mapper);
517							### filename: $out->filename
518
519	9	100				77	return wantarray ? ($out->filename, $mapper) : $out->filename;
520							}
521
522
523							# TODO: add an option to remove trailing '_' in ids?
524
525							sub load_phylip {
526	105			105	1	7455	my $class = shift;
527	105					241	my $infile = shift;
528
529	105					603	open my $in, '<', $infile;
530
531	105					37878	my $ali = $class->new( file => $infile );
532	105					254	my $seq_n;
533							my $site_n;
534	105					196	my $n = 0;
535
536							LINE:
537	105					76005	while (my $line = <$in>) {
538	8557					18371	chomp $line;
539
540							# skip empty lines
541	8557	100				44127	next LINE if $line =~ $EMPTY_LINE;
542
543							# get matrix dimensions
544	8524	100				32752	if ($line =~ $DIM_LINE) {
545	105					548	($seq_n, $site_n) = ($1, $2);
546	105					1623	next LINE;
547							}
548
549							# process regular line (identifier is optional)
550	8419					43458	my ($seq_id, $seq) = $line =~ $PHY_LINE;
551
552							# should never happen...
553	8419	50				19424	croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!"
554							unless $seq;
555
556							# delete optional spaces
557	8419					18587	$seq =~ tr/ //d;
558
559							# process first seq block
560	8419	100				278129	if ($ali->count_seqs < $seq_n) {
561
562							# seq_id are mandatory in first block
563	8089	50				17295	croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!"
564							unless $seq_id;
565
566							# store first seq chunk along with seq_id
567	8089					201665	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
568	8089					249422	$ali->add_seq($new_seq);
569							}
570
571							# process remaining seq blocks
572							else {
573	330					10763	my $curr_seq = $ali->get_seq($n);
574
575							# elongate current with partial seq chunk mistaken as seq_id
576	330	100	100			1038	if ($seq_id && $seq_id ne $curr_seq->full_id) {
577	110					3689	$curr_seq->append_seq($seq_id);
578							}
579
580							# elongate current seq with new seq chunk
581	330					11025	$curr_seq->append_seq($seq);
582							}
583
584							# prepare reading of next seq or next block
585	8419	100				86033	$n = 0 if ++$n == $seq_n;
586							}
587
588	105					643	my $width = $ali->width;
589	105	50				482	croak "[BMC] Error: unexpected site number in PHYLIP file: $width;"
590							. ' aborting!' if $width != $site_n;
591
592	105					2940	return $ali;
593							}
594
595							# PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html
596							# The information for each species follows, starting with a ten-character
597							# species name (which can include blanks and some punctuation marks), and
598							# continuing with the characters for that species. The name should be on the
599							# same line as the first character of the data for that species. (...) The
600							# name should be ten characters in length, filled out to the full ten
601							# characters by blanks if shorter. Any printable ASCII/ISO character is
602							# allowed in the name, except for parentheses ("(" and ")"), square brackets
603							# ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that
604							# in these sequences we have a blank every ten sites to make them easier to
605							# read: any such blanks are allowed. The blank line which separates the two
606							# groups of lines (the ones containing sites 1-20 and ones containing sites
607							# 21-39) may or may not be present.
608
609							# PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command
610							# The input sequence file is a standard PHYLIP file of aligned DNA or
611							# amino-acids sequences. (...) The maximum number of characters in species
612							# name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed
613							# within sequence names because the Newick tree format makes special use of
614							# these symbols. However, blanks (one or more) MUST appear at the end of each
615							# species name.
616
617							# TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf
618							# A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-",
619							# dots ".", underscores "_", or some of "/", "?", "*", "+". No space is
620							# allowed inside the names. The first name character must be a letter or an
621							# underscore. A sequence fragment may start behind position 10 after a
622							# sequence name, or anywhere in a line without a name.
623
624							# TREE-PUZZLE [data/globin.a]
625							# 7 128
626							# HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN
627							# HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN
628							# HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS
629							# HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS
630							# MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS
631							# GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS
632							# LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN
633
634							# RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf
635							# The input alignment format of RAxML is relaxed interleaved or sequential
636							# PHYLIP. "Relaxed" means that sequence names can be of variable length
637							# between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names
638							# taxon names that contain any form of whitespace character, like blanks,
639							# tabulators, and carriage returns, as well as one of the following prohibited
640							# characters: :,();[].
641
642							# PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf
643							# Taxon names may contain more than 10 characters. Avoid special characters
644							# such as ';' or ')', which will create problems when parsing trees. The best
645							# is to only use letters, digits and '_'. Sequences can be interrupted by
646							# space and tab, but not by return characters. Be sure that the lengths of the
647							# sequences are all the same, and identical to the lengths indicated in the
648							# header. Sequences can be interleaved, in which case the taxon names may or
649							# may not be repeated in each block.
650
651
652							sub store_phylip {
653							my $self = shift;
654							my $outfile = shift;
655							my $args = shift // {}; # HashRef (should not be empty...)
656
657							my $short = $args->{short} // 1;
658							my $clean = $args->{clean} // 0;
659							my $chunk = $args->{chunk} // 60;
660
661							open my $out, '>', $outfile;
662
663							# print data matrix dimensions
664							my $height = $self->count_seqs;
665							my $width = $self->width;
666							say {$out} $height . q{ } . $width;
667
668							# setup id format (this will also affect block-like structure)
669							my $format = $short ? "%-10.10s %s\n" : "%s %s\n";
670							my $method = $short ? 'full_id' : 'foreign_id';
671							my $sep = $short ? q{ } : q{};
672
673							# optionally disable wrapping
674							$chunk = $width if $chunk < 0;
675
676							# optionally clean seq
677							my @seqs = $self->all_seqs;
678							@seqs = map { $_->clone->gapify('X') } @seqs if $clean;
679
680							# output Ali in sequential or interleaved format
681							for (my $site = 0; $site < $width; $site += $chunk) {
682
683							# leave empty line between chunks (but not after last chunk)
684							print {$out} "\n" if $site;
685
686							for my $seq (@seqs) {
687
688							# print 10-chars ids only once
689							# Note: full_ids are only truncated (use IdMapper for more)
690							my $id = $site == 0 ? $seq->$method : q{};
691
692							# print seq chunks in 10-state blocks
693							# Note: We insert a space in the 11th column to ensure that more
694							# software packages can read the written files. PHYLIP itself will
695							# ignore this spurious space in the 'sequences'.
696							( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg;
697							printf {$out} $format, $id, join $sep,
698							map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1;
699							}
700							}
701
702							return;
703							}
704
705
706							sub load_stockholm {
707	2			2	1	33	my $class = shift;
708	2					5	my $infile = shift;
709
710	2					10	open my $in, '<', $infile;
711
712	2					445	my $ali = $class->new( file => $infile );
713	2					27	tie my %seq_for, 'Tie::IxHash';
714
715							LINE:
716	2					1300	while (my $line = <$in>) {
717	44					294	chomp $line;
718
719							# process GF comments
720	44	100				106	next LINE if $ali->is_comment($line, $STK_COMMENT);
721
722							# skip empty lines and other comment lines
723	27	100	100			219	next LINE if $line =~ $EMPTY_LINE
724							\|\| $line =~ $COMMENT_LINE;
725	11	100				33	last LINE if $line =~ $END_LINE;
726
727							# parse stockholm seq
728	9					46	my ($seq_id, $seq) = $line =~ $STK_SEQ;
729
730							# replace dots and gaps by stars
731	9					30	$seq =~ s/[\.\-]/*/xmsg;
732
733							# elongate seq
734	9					44	$seq_for{$seq_id} .= $seq;
735							}
736
737							# populate Ali object
738	2					16	while (my ($seq_id, $seq) = each %seq_for) {
739	9					395	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
740	9					302	$ali->add_seq($new_seq);
741							}
742
743	2					72	return $ali;
744							}
745
746
747							sub instant_store {
748	1			1	1	6	my $class = shift;
749	1					2	my $outfile = shift;
750	1		50			5	my $args = shift // {}; # HashRef (should not be empty...)
751
752	1					4	my $infile = $args->{infile};
753	1	50				6	croak '[BMC] Error: no infile specified for instant_store; aborting!'
754							unless $infile;
755
756	1					8	my $coderef = $args->{coderef};
757	1	50				4	croak '[BMC] Error: no coderef specified for instant_store; aborting!'
758							unless $coderef;
759
760	1					6	open my $in, '<', $infile;
761	1					349	open my $out, '>', $outfile;
762
763	1					273	my $seq_id;
764							my $seq;
765
766							LINE:
767	1					793	while (my $line = <$in>) {
768	92					144	chomp $line;
769
770							# skip empty lines and process comments
771	92	100	66			493	next LINE if $line =~ $EMPTY_LINE
772							\|\| $line =~ $COMMENT_LINE;
773
774							# at each '>' char...
775	90					223	my ($defline) = $line =~ $DEF_LINE;
776	90	100				154	if ($defline) {
777
778							# process current seq (if any)
779	3	100				9	if ($seq) {
780	2					80	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
781	2					3	print {$out} $coderef->($new_seq);
	2					12
782	2					78	$seq = q{};
783							}
784
785	3					9	$seq_id = $defline;
786	3					19	next LINE;
787							}
788
789							# elongate current seq (seqs can be broken on several lines)
790	87					251	$seq .= $line;
791							}
792
793							# process last seq (if any)
794	1	50				6	if ($seq) {
795	1					38	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
796	1					3	print {$out} $coderef->($new_seq);
	1					5
797							}
798
799	1					156	return;
800							}
801
802
803							sub instant_count {
804	1			1	1	443	my $class = shift;
805	1					4	my $infile = shift;
806
807	1					2	my $seq_n = 0;
808
809	1					7	open my $in, '<', $infile;
810	1	100				2860	while (<$in>) { $seq_n++ if m/^>/xms }
	502					2361
811
812	1					21	return $seq_n;
813							}
814
815							__PACKAGE__->meta->make_immutable;
816							1;
817
818							__END__
819
820							=pod
821
822							=head1 NAME
823
824							Bio::MUST::Core::Ali - Multiple sequence alignment
825
826							=head1 VERSION
827
828							version 0.212670
829
830							=head1 SYNOPSIS
831
832							#!/usr/bin/env perl
833
834							use Modern::Perl '2011';
835							# same as:
836							# use strict;
837							# use warnings;
838							# use feature qw(say);
839
840							use Bio::MUST::Core;
841							use aliased 'Bio::MUST::Core::Ali';
842
843							# read Ali form disk
844							my $ali = Ali->load('example.ali');
845
846							# get some properties
847							say 'height: ' . $ali->height; # number of seqs
848							say 'width: ' . $ali->width; # number of sites
849							say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%)
850							say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl';
851
852							# turn seqs to uppercase
853							$ali->uc_seqs;
854
855							# filter out seqs with no species associated
856							my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } );
857							use aliased 'Bio::MUST::Core::IdList';
858							my $list = IdList->new( ids => \@seqs );
859							$ali->apply_list($list);
860
861							# alternatively:
862							# $ali = Ali->new( seqs => \@seqs );
863
864							# filter out gap-rich sites
865							$ali->idealize(0.2); # min 20% non-gaps per site
866
867							# filter out non-informative sites
868							use aliased 'Bio::MUST::Core::SeqMask';
869							my $mask = SeqMask->parsimony_mask($ali);
870							$ali->apply_mask($mask);
871
872							# write down reduced Ali to disk
873							$ali->store('example-uc-genus-sp-20.ali');
874							$ali->store_fasta('example-uc-genus-sp-20.fasta');
875
876							# see below for additional methods
877							# ...
878
879							=head1 DESCRIPTION
880
881							This module implements the multiple sequence alignment (MSA) class and its
882							methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
883							Consequently, sequence ids do not absolutely need to be unique for it to
884							work (though id uniqueness helps a lot for sequence access and filtering).
885
886							An Ali knows whether it contains nucleotide or protein sequences, whether
887							those are aligned or not, as well as its Seq count and exact width. All
888							these properties are introspected on the fly, which means that, while they
889							can be expensive to compute, they are always accurate.
890
891							This is important because an Ali provides methods for inserting, deleting
892							and editing its Seq objects. Further, it does its best to maintain a semantic
893							distinction between true gap states (encoded as '*') and missing regions of
894							undetermined length (encoded as pure whitespace, for example at the end of a
895							short sequence).
896
897							It also has methods for mapping its sequence ids (for example, before export
898							to the PHYLIP format), as well as methods to selectively retain sequences
899							and sites based on L<Bio::MUST::Core::IdList> and
900							L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
901							discards shared gaps and optionally removes gap-rich sites only due to a
902							tiny number of sequences.
903
904							Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
905							meaningful whitespace and allows comment lines in the header) or be
906							imported/exported from/to several popular MSA formats, such as plain FASTA,
907							STOCKHOLM and PHYLIP.
908
909							=head1 ATTRIBUTES
910
911							=head2 seqs
912
913							ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
914
915							Most of the accessor methods described below are implemented by delegation
916							to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
917							delegation>. Their documentation thus heavily borrows from the corresponding
918							help pages.
919
920							=head2 file
921
922							L<Path::Class::File> object (optional)
923
924							This optional attribute is initialized by class methods that C<load> an Ali
925							from disk. It is meant to improve the introspection capabilities of the Ali.
926							For now, this attribute is not used by the C<store> methods, though it might
927							provide them with a default value in the future.
928
929							=head2 guessing
930
931							Boolean (optional)
932
933							By default, an Ali object tries to guess whether it is aligned or not by
934							looking for gap-like characters in any of its Seq objects (see
935							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
936
937							When this smart behavior causes issues, one can disable it by unsetting this
938							boolean attribute (see C<dont_guess> and C<guess> accessor methods).
939
940							=head2 comments
941
942							ArrayRef of strings (optional)
943
944							An Ali object is commentable, which means that it supports all the methods
945							pertaining to comment lines described in
946							L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
947
948							=head1 CONSTRUCTORS
949
950							=head2 new
951
952							Default constructor (class method) returning a new Ali.
953
954							use aliased 'Bio::MUST::Core::Ali';
955							my $ali1 = Ali->new();
956							my @seqs = $ali->all_seqs;
957							my $ali2 = Ali->new( seqs => \@seqs );
958
959							This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
960							C<file>, C<guessing> and C<comments>.
961
962							=head2 clone
963
964							Creates a deep copy (a clone) of the Ali. Returns the copy.
965
966							use aliased 'Bio::MUST::Core::Ali';
967							my $ali = Ali->load('input.ali');
968							my $ali_copy = $ali->clone;
969							# you can now mess with $ali_copy without affecting $ali
970
971							This method does not accept any arguments.
972
973							=head1 ACCESSORS
974
975							=head2 add_seq
976
977							Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
978							number of sequences of the Ali.
979
980							use aliased 'Bio::MUST::Core::Seq';
981							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
982							$ali->add_seq($new_seq);
983
984							This method accepts any number of arguments.
985
986							=head2 get_seq
987
988							Returns a sequence of the Ali by its index. You can also use negative index
989							numbers, just as with Perl's core array handling. If the specified sequence
990							does not exist, this method will return C<undef>.
991
992							my $seq = $ali->get_seq($index);
993							croak "Seq $index not found in Ali!" unless defined $seq;
994
995							This method accepts just one argument (and not an array slice).
996
997							=head2 get_seq_with_id
998
999							Returns a sequence of the Ali by its id. If the specified id is not unique,
1000							only the first matching sequence will be returned, whereas if no sequence
1001							exists for the specified id, this method will return C<undef>.
1002
1003							my $id = 'Pyrus malus_3750@658052655';
1004							my $seq = $ali->get_seq_with_id($id);
1005							croak "Seq $id not found in Ali!" unless defined $seq;
1006
1007							This method accepts just one argument.
1008
1009							=head2 set_seq
1010
1011							Given an index and a sequence, sets the specified Ali element to the
1012							sequence. This method returns the new sequence at C<$index>.
1013
1014							use aliased 'Bio::MUST::Core::Seq';
1015							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1016							$ali->set_seq($index, $new_seq);
1017
1018							This method requires two arguments.
1019
1020							=head2 delete_seq
1021
1022							Removes the sequence at the given index from the Ali. This method returns
1023							the deleted sequence. If the specified sequence does not exist, it will
1024							return C<undef> instead.
1025
1026							$ali->delete_seq($index);
1027
1028							This method requires one argument.
1029
1030							=head2 insert_seq
1031
1032							Inserts a new sequence into the Ali at the given index. This method returns
1033							the new sequence at C<$index>.
1034
1035							use aliased 'Bio::MUST::Core::Seq';
1036							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1037							$ali->insert_seq($index, $new_seq);
1038
1039							This method requires two arguments.
1040
1041							=head2 all_seqs
1042
1043							Returns all the sequences of the Ali (not an array reference).
1044
1045							my @seqs = $ali->all_seqs;
1046
1047							This method does not accept any arguments.
1048
1049							=head2 first_seq
1050
1051							Returns the first sequence of the Ali matching a given criterion, just like
1052							L<List::Util>'s C<first> function. This method requires a subroutine
1053							implementing the matching logic.
1054
1055							# emulate get_seq_with_id method
1056							my $id2find = 'seq13';
1057							my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
1058
1059							This method requires a single argument.
1060
1061							=head2 filter_seqs
1062
1063							Returns every sequence of the Ali matching a given criterion, just like
1064							Perl's core C<grep> function. This method requires a subroutine implementing
1065							the matching logic.
1066
1067							# keep only long sequences (ignoring gaps and missing states)
1068							my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
1069
1070							This method requires a single argument.
1071
1072							=head2 all_new_seqs
1073
1074							Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
1075
1076							my @new_seqs = $ali->all_new_seqs;
1077
1078							This method does not accept any arguments.
1079
1080							=head2 all_but_new_seqs
1081
1082							Returns all the sequences of the Ali except those tagged as #NEW# (not an array
1083							reference).
1084
1085							my @preexisting_seqs = $ali->all_but_new_seqs;
1086
1087							This method does not accept any arguments.
1088
1089							=head2 all_seq_ids
1090
1091							Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
1092							(not an array reference). This is only a convenience method.
1093
1094							use Test::Deeply;
1095							my @ids1 = $ali->all_seq_ids;
1096							my @ids2 = map { $_->seq_id } $ali->all_seqs;
1097							is_deeply \@ids1, \@ids2, 'should be true';
1098							my @orgs = map { $_->org } @ids1;
1099
1100							This method does not accept any arguments.
1101
1102							=head2 filename
1103
1104							Returns the stringified filename of the Ali.
1105
1106							This method does not accept any arguments.
1107
1108							=head2 guess
1109
1110							Turn on the smart detection of gaps (see C<guessing> attribute above).
1111
1112							This method does not accept any arguments.
1113
1114							=head2 dont_guess
1115
1116							Turn off the smart detection of gaps (see C<guessing> attribute above).
1117
1118							use aliased 'Bio::MUST::Core::Ali';
1119							my $ali = Ali->load('ensembl.fasta');
1120							$ali->dont_guess;
1121
1122							This method does not accept any arguments.
1123
1124							=head1 PROPERTIES
1125
1126							=head2 has_uniq_ids
1127
1128							Returns true if all the sequence ids are unique.
1129
1130							carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
1131
1132							This method does not accept any arguments.
1133
1134							=head2 is_protein
1135
1136							Returns true if any sequence of the Ali looks like a protein. See
1137							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1138
1139							say 'Your file includes nucleotide sequences' unless $ali->is_protein;
1140
1141							This method does not accept any arguments.
1142
1143							=head2 is_aligned
1144
1145							Returns true if any sequence of the Ali appears to be aligned. See
1146							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1147
1148							If the boolean attribute guessing is not set, always returns false.
1149
1150							carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
1151
1152							This method does not accept any arguments.
1153
1154							=head2 count_seqs
1155
1156							Returns the number of sequences of the Ali. The alias method C<height> is
1157							provided for convenience.
1158
1159							my $height = $ali->count_seqs;
1160
1161							This method does not accept any arguments.
1162
1163							=head2 width
1164
1165							Returns the width of the Ali (in characters). If the Ali is not aligned,
1166							returns the length of the longest sequence instead.
1167
1168							To avoid potential bugs due to caching, this method dynamically computes the
1169							Ali width at each call. Moreover, the Ali is always uniformized (see below)
1170							beforehands to ensure accurate width value. Therefore, this method is
1171							expensive and should not be called repeatedly (e.g., in a loop condition).
1172
1173							# you'd better looping through sites like this...
1174							my $width = $ali->width;
1175							for my $site (0..$width-1) {
1176							...
1177							}
1178
1179							This method does not accept any arguments.
1180
1181							=head2 seq_len_stats
1182
1183							Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
1184							The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
1185
1186							This method does not accept any arguments.
1187
1188							=head2 perc_miss
1189
1190							Returns the percentage of missing (and gap-like) character states in the Ali.
1191
1192							As this method internally calls C<Ali::width>, the remarks above also apply.
1193
1194							my $miss_level = $ali->perc_miss;
1195
1196							This method does not accept any arguments.
1197
1198							=head1 MUTATORS
1199
1200							=head2 uc_seqs
1201
1202							Turn all the sequences of the Ali to uppercase and returns it.
1203
1204							This method does not accept any arguments.
1205
1206							=head2 recode_seqs
1207
1208							Recode all the sequences of the Ali and returns it.
1209
1210							use aliased 'Bio::MUST::Core::Ali';
1211							my $ali = Ali->load('biased.ali');
1212
1213							# set up RY recoding for suppressing codon bias
1214							my %base_for = (
1215							A => 'A', G => 'A', # purines
1216							C => 'C', T => 'C', # pyrimidines
1217							);
1218
1219							my $ali_rec = $ali->recode_seqs( \%base_for );
1220							$ali_rec->store('biased_ry.ali');
1221
1222							This method requires one argument.
1223
1224							=head2 degap_seqs
1225
1226							Remove the gaps in all the sequences of the Ali and returns it.
1227
1228							This method does not accept any arguments.
1229
1230							=head2 spacify_seqs
1231
1232							Spacifies all the sequences of the Ali and returns it. See the corresponding
1233							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1234							operation.
1235
1236							This method does not accept any arguments.
1237
1238							=head2 gapify_seqs
1239
1240							Gapifies all the sequences of the Ali and returns it. See the corresponding
1241							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1242							operation.
1243
1244							This method accepts an optional argument.
1245
1246							=head2 trim_seqs
1247
1248							Trims all the sequences of the Ali and returns it. See the corresponding
1249							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1250							operation.
1251
1252							This method does not accept any arguments.
1253
1254							=head2 pad_seqs
1255
1256							Pads all the sequences of the Ali and returns it. See the corresponding
1257							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1258							operation.
1259
1260							This method does not accept any arguments.
1261
1262							=head2 uniformize
1263
1264							Performs the three gap-cleaning operations in turn on all the sequences of
1265							the Ali and returns it, which ensures that it is semantically clean and
1266							rectangular.
1267
1268							This is only a convenience method called internally by the Ali object before
1269							selected methods (such as storage-like methods). However, it might prove
1270							useful in some circumstances, hence it is not defined as private.
1271
1272							use aliased 'Bio::MUST::Core::Ali';
1273							my $ali = Ali->load('input.ali');
1274							$ali->add_seq(...);
1275							# more editing of the Ali sequences
1276							$ali->uniformize;
1277
1278							This method does not accept any arguments.
1279
1280							=head2 clear_new_tags
1281
1282							Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it.
1283
1284							use aliased 'Bio::MUST::Core::Ali';
1285							my $ali = Ali->load('input-42.ali');
1286							$ali->clear_new_tags;
1287							my @new_seqs = $ali->all_new_seqs;
1288							# array should be empty
1289
1290							This method does not accept any arguments.
1291
1292							=head2 shorten_ids
1293
1294							Replaces all the sequence ids of the Ali by their abbreviated forms as
1295							specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1296
1297							Note that this method will work only if the sequence ids have not been
1298							already shortened or modified in any way since the creation of the IdMapper.
1299							Long ids without abbreviated forms in the IdMapper are left untouched.
1300
1301							use aliased 'Bio::MUST::Core::Ali';
1302							use aliased 'Bio::MUST::Core::IdMapper';
1303							my $ali = Ali->load('input.ali');
1304							my $mapper = IdMapper->std_mapper($ali, 'lcl\|seq');
1305							$ali->shorten_ids($mapper);
1306							$ali->store_fasta('input.4blast.fasta');
1307							# makeblastdb
1308
1309							# Note: the temp_fasta method does exactly that
1310
1311							This method requires one argument.
1312
1313							=head2 restore_ids
1314
1315							Replaces all the sequence ids of the Ali by their long forms as specified by
1316							the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1317
1318							Note that this method will work only if the sequence ids have been previously
1319							abbreviated (see above) and have not been modified in any way since then.
1320							Again, abbreviated ids without long forms in the IdMapper are left untouched.
1321
1322							use aliased 'Bio::MUST::Core::IdMapper';
1323							my $mapper = IdMapper->gi_mapper($ali);
1324							$ali->shorten_ids($mapper);
1325							...
1326							$ali->restore_ids($mapper);
1327
1328							This method requires one argument.
1329
1330							=head2 apply_list
1331
1332							Selectively retains or discards sequences from the Ali based on the content
1333							of the passed L<Bio::MUST::Core::IdList> object and returns the Ali.
1334
1335							use aliased 'Bio::MUST::Core::IdList';
1336							my $list = IdList->load('interesting_seqs.idl');
1337							$ali->apply_list($list); # discard non-interesting seqs
1338
1339							This method requires one argument.
1340
1341							=head2 apply_mask
1342
1343							Selectively retains or discards sites from the Ali based on the content of
1344							the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali.
1345
1346							use aliased 'Bio::MUST::Core::SeqMask';
1347							my $variable_sites = SeqMask->variable_mask($ali);
1348							$ali->apply_mask($variable_sites); # discard constant sites
1349
1350							This method requires one argument.
1351
1352							=head2 idealize
1353
1354							Computes and applies an ideal sequence mask to the Ali and returns it. This
1355							is only a convenience method.
1356
1357							When invoked without arguments, it will discard the gaps that are
1358							universally shared by all the sequences. Otherwise, the provided argument
1359							corresponds to the threshold of the C<ideal_mask> method described in
1360							L<Bio::MUST::Core::SeqMask>.
1361
1362							use aliased 'Bio::MUST::Core::IdList';
1363							my $fast_seqs = IdList->load('fast_evolving_seqs.idl');
1364							my $seqs2keep = $fast_seqs->negative_list($ali);
1365							$ali->apply_list($seqs2keep); # discard fast-evolving seqs
1366							$ali->idealize; # discard newly shared gaps caused by fast seqs
1367
1368							use aliased 'Bio::MUST::Core::Ali';
1369							my $ali = Ali->load('hmm_based.ali');
1370							$ali->idealize(0.05); # discard insertions due to <5% of the seqs
1371
1372							This method accepts an optional argument.
1373
1374							=head1 MISC METHODS
1375
1376							=head2 gapmiss_regex
1377
1378							Returns a regular expression matching gaps and ambiguous or missing states.
1379							The exact regex returned depends on the type of sequences in the Ali (nucl. or
1380							proteins).
1381
1382							my $regex = $ali->gapmiss_regex;
1383							my $first_seq = $ali->get_seq(0)->seq;
1384							my $gapmiss_n = $first_seq =~ m/($regex)/xmsg;
1385							say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites";
1386
1387							This method does not accept any arguments.
1388
1389							=head2 map_coords
1390
1391							Converts a set of site positions from Ali coordinates to coordinates of the
1392							specified sequence (thereby ignoring positions due to gaps). Returns the
1393							converted sites in sequence coordinates as an array refrence.
1394
1395							use aliased 'Bio::MUST::Core::Ali';
1396							my $ali = Ali->load('input.ali');
1397							my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125';
1398							my $ali_coords = [ 4, 25, 73, 89, 104, 116 ];
1399							my $seq_coords = $ali->map_coords($id, $ali_coords);
1400							# $seq_coords is [ 3, 23, 59, 71, 71, 74 ]
1401
1402							This method requires two arguments: the id of a sequence and an array
1403							reference of input sites in Ali coordinates.
1404
1405							=head1 I/O METHODS
1406
1407							=head2 load
1408
1409							Class method (constructor) returning a new Ali read from disk. This method
1410							will transparently import plain FASTA files in addition to the MUST
1411							pseudo-FASTA format (ALI files).
1412
1413							use Test::Deeply;
1414							use aliased 'Bio::MUST::Core::Ali';
1415							my $ali1 = Ali->load('example.ali');
1416							my $ali2 = Ali->load('example.fasta');
1417							my @seqs1 = $ali1->all_seqs;
1418							my @seqs2 = $ali2->all_seqs;
1419							is_deeply, \@seqs1, \@seqs2, 'should be true';
1420
1421							This method requires one argument.
1422
1423							=head2 store
1424
1425							Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files).
1426
1427							Note that the ALI format is only used when the suffix of the outfile name is
1428							'.ali'. In all other cases (including lack of suffix), this method
1429							automatically forwards the call to C<store_fasta>.
1430
1431							$ali->store('output.ali');
1432							# output.ali is written in ALI format
1433							$ali->store('output.fasta');
1434							# output.fasta is written in FASTA format
1435
1436							This method requires one argument (but see C<store_fasta> in case of automatic
1437							forwarding of the method call).
1438
1439							=head2 store_fasta
1440
1441							Writes the Ali to disk in the plain FASTA format.
1442
1443							For compatibility purposes, this method automatically fetches sequence ids
1444							using the C<foreign_id> method instead of the native C<full_id> method, both
1445							described in L<Bio::MUST::Core::SeqId>.
1446
1447							$ali->store_fasta( 'output.fasta' );
1448							$ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} );
1449
1450							This method requires one argument and accepts a second optional argument
1451							controlling the output format. It is a hash reference that may contain one
1452							or more of the following keys:
1453
1454							- clean: replace all ambiguous and missing states by C<X> (default: false)
1455							- degap: boolean value controlling degapping (default: false)
1456							- chunk: line width (default is 60 chars; negative values means no wrap)
1457
1458							Finally, it is possible to fine-tune the behavior of the C<clean> option by
1459							providing another character than C<X> through the C<gapify> key. This can be
1460							useful to replace all ambiguous and missing states by gaps, as shown below:
1461
1462							$ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } );
1463
1464							=head2 temp_fasta
1465
1466							Writes a temporary copy of the Ali to disk in the plain FASTA format using
1467							numeric sequence ids and returns the name of the temporary file. This is
1468							only a convenience method.
1469
1470							In list context, returns the IdMapper object along with temporary filename.
1471
1472							my $infile = $ali->temp_fasta( { degap => 1 } );
1473							my $output = `script.sh $infile`;
1474							...
1475
1476							This method accepts the same optional argument hash as C<store_fasta>.
1477							However, an additional option (C<id_prefix>) is available to control the way
1478							abbreviated sequence ids are prefixed by the C<std_mapper> method (see
1479							L<Bio::MUST::Core::Listable>).
1480
1481							my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } );
1482							my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } );
1483
1484							=head2 load_phylip
1485
1486							Class method (constructor) returning a new Ali read from a file in the
1487							PHYLIP format. Both sequential and interleaved formats are supported.
1488
1489							The only constraint is that sequence ids MUST NOT contain whitespace and be
1490							followed by at least one whitespace character. This means that some old-school
1491							PHYLIP files not following this convention will not be processed correctly.
1492
1493							When using the interleaved format, sequence ids may or may not be repeated in
1494							each block.
1495
1496							use aliased 'Bio::MUST::Core::Ali';
1497							my $ali = Ali->load_phylip('phylip.phy');
1498							say $ali->count_seqs;
1499							say $ali->width;
1500
1501							# outputs:
1502							# 10
1503							# 709
1504
1505							This method requires one argument.
1506
1507							=head2 store_phylip
1508
1509							Writes the Ali to disk in the interleaved (or sequential) PHYLIP format.
1510
1511							To ensure maximal flexibility, this method fetches sequence ids using the
1512							native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but
1513							truncates them to 10 characters, as expected by the original PHYLIP software
1514							package. No other tinkering is carried out on the ids. Thus, if the ids
1515							contain whitespace or are not unique in their 10 first characters, it is
1516							advised to first map them using one of the constructors in
1517							L<Bio::MUST::Core::IdMapper>.
1518
1519							use aliased 'Bio::MUST::Core::Ali';
1520							use aliased 'Bio::MUST::Core::IdMapper';
1521							my $ali = Ali->load('input.ali');
1522							my $mapper = IdMapper->std_mapper($ali);
1523							$ali->shorten_ids($mapper);
1524							$ali->store_phylip( 'input.phy', { chunk => 50 } );
1525
1526							This method requires one argument and accepts a second optional argument
1527							controlling the output format. It is a hash reference that may contain one
1528							or more of the following keys:
1529
1530							- short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes)
1531							- clean: replace all ambiguous and missing states by 'X' (default: false)
1532							- chunk: line width (default: 60 chars; negative values means no wrap)
1533
1534							To store the Ali in PHYLIP sequential format, specify a negative chunk (-1).
1535
1536							=head2 load_stockholm
1537
1538							Class method (constructor) returning a new Ali read from a file in the
1539							STOCKHOLM format. =GF comments are retained (see above) but not the other
1540							comment classes (=GS, =GR and =GC).
1541
1542							use aliased 'Bio::MUST::Core::Ali';
1543							my $ali = Ali->load('upsk.stockholm');
1544							say $ali->header;
1545
1546							# outputs:
1547							# ID UPSK
1548							# SE Predicted; Infernal
1549							# SS Published; PMID 9223489
1550							# RN [1]
1551							# RM 9223489
1552							# RT The role of the pseudoknot at the 3' end of turnip yellow mosaic
1553							# RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA
1554							# RT polymerase.
1555							# RA Deiman BA, Kortlever RM, Pleij CW;
1556							# RL J Virol 1997;71:5990-5996.
1557
1558							This method requires one argument.
1559
1560							=head2 instant_store
1561
1562							Class method intended to transform a large sequence file read from disk
1563							without loading it in memory. This method will transparently process plain
1564							FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1565
1566							my $chunk = 200;
1567
1568							my $split = sub {
1569							my $seq = shift;
1570							my $base_id = ( split /\s+/xms, $seq->full_id )[0];
1571							my $max_pos = $seq->seq_len - $chunk;
1572							my $n = 0;
1573							my $out_str;
1574							for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) {
1575							$out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos,
1576							$pos + $chunk <= $max_pos ? $chunk : 2 * $chunk
1577							) . "\n";
1578							}
1579							return $out_str;
1580							};
1581
1582							use aliased 'Bio::MUST::Core::Ali';
1583
1584							Ali->instant_store(
1585							'outfile.fasta', { infile => 'infile.fasta', coderef => $split }
1586							);
1587
1588							This method requires two arguments. The sercond is a hash reference that must
1589							contain the following keys:
1590							- infile: input sequence file
1591							- coderef: subroutine implementing the transforming logic
1592
1593							=head2 instant_count
1594
1595							Class method returning the number of seqs in any sequence file read from disk
1596							without loading it in memory. This method will transparently process plain
1597							FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1598
1599							use aliased 'Bio::MUST::Core::Ali';
1600							my $seq_n = Ali->instant_count('input.ali');
1601							say $seq_n;
1602
1603							=head1 ALIASES
1604
1605							=head2 height
1606
1607							Alias for C<count_seqs> method. For API consistency.
1608
1609							=head1 AUTHOR
1610
1611							Denis BAURAIN <denis.baurain@uliege.be>
1612
1613							=head1 CONTRIBUTORS
1614
1615							=for stopwords Catherine COLSON Arnaud DI FRANCO
1616
1617							=over 4
1618
1619							=item *
1620
1621							Catherine COLSON <ccolson@doct.uliege.be>
1622
1623							=item *
1624
1625							Arnaud DI FRANCO <arnaud.difranco@gmail.com>
1626
1627							=back
1628
1629							=head1 COPYRIGHT AND LICENSE
1630
1631							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
1632
1633							This is free software; you can redistribute it and/or modify it under
1634							the same terms as the Perl 5 programming language system itself.
1635
1636							=cut