File Coverage

blib/lib/Bio/MUST/Core/Ali.pm
Criterion Covered Total %
statement 278 279 99.6
branch 65 76 85.5
condition 14 18 77.7
subroutine 56 57 98.2
pod 33 33 100.0
total 446 463 96.3


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::Ali;
2             # ABSTRACT: Multiple sequence alignment
3             # CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4             # CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5             $Bio::MUST::Core::Ali::VERSION = '0.212670';
6 17     17   13671 use Moose;
  17         47  
  17         129  
7 17     17   92080 use namespace::autoclean;
  17         44  
  17         212  
8              
9 17     17   1661 use autodie;
  17         66  
  17         154  
10 17     17   91106 use feature qw(say);
  17         64  
  17         1478  
11              
12             # use Smart::Comments;
13              
14 17     17   136 use Carp;
  17         40  
  17         1233  
15 17     17   132 use File::Temp;
  17         39  
  17         1437  
16 17     17   139 use List::AllUtils qw(uniq indexes sum0);
  17         47  
  17         982  
17 17     17   118 use Path::Class qw(file);
  17         53  
  17         837  
18 17     17   112 use POSIX qw(ceil floor);
  17         46  
  17         129  
19 17     17   11520 use Statistics::Descriptive;
  17         373152  
  17         701  
20 17     17   8900 use Tie::IxHash;
  17         41782  
  17         645  
21              
22 17     17   149 use Bio::MUST::Core::Types;
  17         44  
  17         500  
23 17     17   102 use Bio::MUST::Core::Constants qw(:gaps :files);
  17         44  
  17         3977  
24 17     17   139 use aliased 'Bio::MUST::Core::Seq';
  17         40  
  17         167  
25 17     17   3904 use aliased 'Bio::MUST::Core::SeqMask';
  17         48  
  17         71  
26              
27             # TODO: add information about methods available in Ali-like objects
28              
29             # ATTRIBUTES
30              
31              
32             has 'seqs' => (
33             traits => ['Array'],
34             is => 'ro',
35             isa => 'ArrayRef[Bio::MUST::Core::Seq]',
36             default => sub { [] },
37             writer => '_set_seqs',
38             handles => {
39             add_seq => 'push',
40             get_seq => 'get',
41             set_seq => 'set',
42             delete_seq => 'delete',
43             insert_seq => 'insert',
44             count_seqs => 'count',
45             all_seqs => 'elements',
46             first_seq => 'first',
47             filter_seqs => 'grep',
48             },
49             );
50              
51              
52             has 'file' => (
53             is => 'ro',
54             isa => 'Bio::MUST::Core::Types::File',
55             default => 'untitled.ali',
56             coerce => 1,
57             handles => {
58             filename => 'stringify',
59             },
60             );
61              
62              
63             has 'guessing' => (
64             traits => ['Bool'],
65             is => 'ro',
66             isa => 'Bool',
67             default => 1,
68             handles => {
69             dont_guess => 'unset',
70             guess => 'set',
71             },
72             );
73              
74              
75             with 'Bio::MUST::Core::Roles::Commentable',
76             'Bio::MUST::Core::Roles::Listable';
77             with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
78              
79             # CONSTRUCTORS
80              
81              
82              
83             sub clone {
84 1     1 1 4 my $self = shift;
85              
86             return $self->new(
87             comments => [ $self->all_comments ],
88 1         48 seqs => [ map { $_->clone } $self->all_seqs ],
  250         712  
89             file => file( $self->filename ),
90             guessing => $self->guessing,
91             );
92             }
93              
94             # ACCESSORS
95              
96              
97             sub get_seq_with_id {
98 4     4 1 1985 my $self = shift;
99 4         17 my $id = shift;
100              
101 4     10   200 my $seq = $self->first_seq( sub { $_->full_id eq $id } );
  10         44  
102 4 100       44 carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
103             unless $seq;
104              
105 4         464 return $seq;
106             }
107              
108              
109             sub all_new_seqs {
110 6     6 1 20 return shift->filter_seqs( sub { $_->is_new } );
  2     2   106  
111             }
112              
113              
114             sub all_but_new_seqs {
115 3     3 1 10 return shift->filter_seqs( sub { not $_->is_new } );
  1     1   82  
116             }
117              
118              
119             sub all_seq_ids {
120 105     105 1 3943 my $self = shift;
121 105         4420 return map { $_->seq_id } $self->all_seqs;
  1510         37655  
122             }
123              
124             # PROPERTIES
125              
126              
127             sub has_uniq_ids {
128 64     64 1 1562 my $self = shift;
129              
130 64         211 my @ids = map { $_->full_id } $self->all_seq_ids;
  941         24065  
131 64 100       2415 return 1 if $self->count_seqs == uniq @ids;
132 7         234 return 0;
133             }
134              
135              
136             sub is_protein {
137 77     77 1 2635 my $self = shift;
138 77 100   96   3277 return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
  96         534  
139 5         31 return 0;
140             }
141              
142              
143             sub is_aligned {
144 310     310 1 587 my $self = shift;
145 310 100       10966 return 0 if not $self->guessing;
146 307 100   8588   11599 return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
  8588         16726  
147 116         802 return 0;
148             }
149              
150              
151             sub width {
152 189     189 1 433 my $self = shift;
153 189 100       617 $self->uniformize if $self->is_aligned; # pad seqs for robustness
154 189         945 return $self->_max_seq_len;
155             }
156              
157             sub _max_seq_len {
158 365     365   678 my $self = shift;
159 365         12299 my @lengths = map { $_->seq_len } $self->all_seqs;
  12841         384806  
160 365   50     5034 return (List::AllUtils::max @lengths) // 0; # to avoid warnings
161             }
162              
163              
164             sub seq_len_stats {
165 1     1 1 13 my $self = shift;
166              
167 1         44 my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
  10         27  
168 1         16 my $stat = Statistics::Descriptive::Full->new;
169 1         139 $stat->add_data( \@lengths );
170 1         259 my @quantiles = map { $stat->quantile($_) } 0..4;
  5         253  
171              
172 1         46 return @quantiles;
173             }
174              
175              
176             sub perc_miss {
177 2     2 1 6 my $self = shift;
178              
179 2         85 my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
  255         540  
180 2 50       19 return 0.0 unless $n;
181              
182 2         11 my $total = $self->width * $self->count_seqs;
183 2         19 return 100.0 * ($total - $n) / $total;
184             }
185              
186             # MUTATORS
187              
188              
189             sub uc_seqs {
190 1     1 1 12 my $self = shift;
191              
192 1         44 $_->uc for $self->all_seqs;
193 1         4 return $self;
194             }
195              
196              
197             sub recode_seqs { ## no critic (RequireArgUnpacking)
198 1     1 1 20 my $self = shift;
199              
200 1         44 $_->recode(@_) for $self->all_seqs;
201 1         7 return $self;
202             }
203              
204              
205             sub degap_seqs {
206 1     1 1 8 my $self = shift;
207              
208 1         39 $_->degap for $self->all_seqs;
209 1         4 return $self;
210             }
211              
212              
213             sub spacify_seqs {
214 176     176 1 364 my $self = shift;
215              
216 176         6082 $_->spacify for $self->all_seqs;
217 176         489 return $self;
218             }
219              
220              
221             sub gapify_seqs { ## no critic (RequireArgUnpacking)
222 2     2 1 19 my $self = shift;
223 2         78 $_->gapify(@_) for $self->all_seqs;
224 2         8 return $self;
225             }
226              
227              
228             sub trim_seqs {
229 176     176 1 337 my $self = shift;
230              
231 176         6208 $_->trim for $self->all_seqs;
232 176         691 return $self;
233             }
234              
235              
236             sub pad_seqs {
237 176     176 1 412 my $self = shift;
238              
239 176         538 my $bound = $self->_max_seq_len;
240 176         5923 $_->pad_to($bound) for $self->all_seqs;
241 176         550 return $self;
242             }
243              
244              
245             sub uniformize {
246 175     175 1 371 my $self = shift;
247              
248             # TODO: profile code (triggers?) to avoid useless multiple calls
249              
250 175         624 $self->spacify_seqs;
251 175         795 $self->trim_seqs;
252 175         699 $self->pad_seqs;
253              
254 175         343 return $self;
255             }
256              
257              
258             sub clear_new_tags {
259 1     1 1 4 my $self = shift;
260              
261 1         42 $_->clear_new_tag for $self->all_seqs;
262 1         4 return $self;
263             }
264              
265              
266             sub shorten_ids { ## no critic (RequireArgUnpacking)
267 12     12 1 65 return shift->_change_ids_(1, @_);
268             }
269              
270              
271             sub restore_ids { ## no critic (RequireArgUnpacking)
272 10     10 1 44 return shift->_change_ids_(0, @_);
273             }
274              
275             sub _change_ids_ {
276 22     22   59 my $self = shift;
277 22         47 my $abbr = shift;
278 22         45 my $mapper = shift;
279              
280 22         868 for my $seq ($self->all_seqs) {
281 133 100       702 my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id )
282             : $mapper->long_id_for( $seq->full_id );
283 133 50       4236 $seq->set_seq_id($new_id) if $new_id;
284             } # Note: leave id alone if not found
285              
286 22         115 return $self;
287             }
288              
289              
290             sub apply_list {
291 1     1 1 8 my $self = shift;
292 1         4 my $list = shift;
293              
294             # loop through seqs from bottom to top
295             # ... and delete seq from Ali if not listed in list
296 1         42 my $seq_n = $self->count_seqs;
297              
298 1         8 for (my $i = $seq_n-1; $i >= 0; $i--) {
299 250 100       8067 $self->delete_seq($i)
300             unless $list->is_listed( $self->get_seq($i)->full_id );
301             }
302              
303 1         8 return $self;
304             }
305              
306              
307             sub _premask_check {
308 40     40   476 my $self = shift;
309 40         95 my $mask = shift;
310              
311             # warn of unaligned Ali
312 40 50       160 carp '[BMC] Note: Ali does not look aligned!'
313             . ' This might not be an issue if seqs are ultra-conserved!'
314             unless $self->is_aligned;
315 40         254 $self->uniformize;
316              
317             # warn of empty SeqMask
318             carp '[BMC] Note: applying this mask will result in a zero-width Ali!'
319 40 50   1845   1782 unless List::AllUtils::any { $_ } $mask->all_states;
  1845         2302  
320              
321             # check that SeqMask is compatible with Ali
322             # potential bugs could come from constant sites etc
323 40         14410 my $a_width = $self->width;
324 40         1594 my $m_width = $mask->mask_len;
325 40 100       182 carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!"
326             . ' This might not be an issue if the mask results from blocks.'
327             unless $a_width == $m_width;
328              
329 40         919 return; # return values of before subs are ignored
330             }
331              
332             before 'apply_mask' => \&_premask_check;
333              
334             sub apply_mask {
335             my $self = shift;
336             my $mask = shift;
337              
338             # select sites for each seq using a precomputed array slice
339             my @indexes = indexes { $_ } $mask->all_states;
340             $_->_set_seq( join q{}, ( $_->all_states )[@indexes] )
341             for $self->all_seqs;
342              
343             return $self;
344             }
345              
346              
347             sub idealize { ## no critic (RequireArgUnpacking)
348 13     13 1 107 my $self = shift;
349 13         154 return $self->apply_mask( SeqMask->ideal_mask($self, @_) );
350             }
351              
352             # MISC METHODS
353              
354              
355             sub gapmiss_regex {
356 73 100   73 1 1152 return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS;
357             }
358              
359              
360             sub map_coords {
361 1     1 1 14 my $self = shift;
362 1         4 my $id = shift;
363 1         3 my $coords_in = shift;
364              
365 1         6 my $seq = $self->get_seq_with_id($id);
366 1         7 my @states = $seq->all_states;
367              
368 1         3 my $count = 0;
369 1         2 my @index;
370 1         4 for my $state (@states) {
371 130 100       351 $count++ if $state !~ $GAP;
372 130         222 push @index, $count;
373             }
374              
375 1         2 my @coords_out = map { $index[$_-1] } @{$coords_in};
  6         14  
  1         3  
376              
377 1         14 return \@coords_out;
378             }
379              
380             # ALIASES
381              
382              
383             sub height {
384 0     0 1 0 return shift->count_seqs;
385             }
386              
387             # I/O METHODS
388              
389              
390             sub load {
391 69     69 1 26466 my $class = shift;
392 69         162 my $infile = shift;
393              
394 69         397 open my $in, '<', $infile;
395              
396 69         33740 my $ali = $class->new( file => $infile );
397 69         451 my $seq_id;
398             my $seq;
399              
400             LINE:
401 69         15067 while (my $line = <$in>) {
402 4091         7888 chomp $line;
403              
404             # skip empty lines and process comments
405 4091 100 100     26130 next LINE if $line =~ $EMPTY_LINE
406             || $ali->is_comment($line);
407              
408             # at each '>' char...
409 3944         14367 my ($defline) = $line =~ $DEF_LINE;
410 3944 100       8598 if ($defline) {
411              
412             # add current seq to ali (if any)
413 1662 100       3365 if ($seq) {
414 1593         48380 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
415 1593         54646 $ali->add_seq($new_seq);
416 1593         2989 $seq = q{};
417             }
418              
419 1662         2768 $seq_id = $defline;
420 1662         24372 next LINE;
421             }
422              
423             # elongate current seq (seqs can be broken on several lines)
424 2282         10687 $seq .= $line;
425             }
426              
427             # add last seq to ali (if any)
428 69 50       283 if ($seq) {
429 69         2410 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
430 69         2428 $ali->add_seq($new_seq);
431             }
432              
433 69         1797 return $ali;
434             }
435              
436              
437             before qr{\Astore}xms => sub {
438             my $self = shift;
439              
440             # perform pre-storage duties
441             carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids;
442             $self->uniformize if $self->is_aligned;
443              
444             return;
445             };
446              
447              
448             sub store { ## no critic (RequireArgUnpacking)
449             my $self = shift;
450             my $outfile = shift;
451              
452             # automatically redirect to store_fasta for non-Ali outfile names
453             return $self->store_fasta($outfile, @_) # note the currying
454             unless $outfile =~ $ALI_SUFFIX;
455              
456             open my $out, '>', $outfile;
457              
458             print {$out} $self->header;
459             for my $seq ($self->all_seqs) {
460             say {$out} '>' . $seq->full_id;
461             say {$out} $seq->seq;
462             }
463              
464             return;
465             }
466              
467              
468             sub store_fasta {
469             my $self = shift;
470             my $outfile = shift;
471             my $args = shift // {}; # HashRef (should not be empty...)
472              
473             my $degap = $args->{degap} // 0;
474             my $clean = $args->{clean} // 0;
475             my $gap = $args->{gapify} // 'X';
476             my $chunk = $args->{chunk} // 60;
477             my $nowrap = $chunk < 0 ? 1 : 0;
478             my $is_aligned = $self->is_aligned;
479              
480             open my $out, '>', $outfile;
481              
482             for my $seq ($self->all_seqs) {
483             say {$out} '>' . $seq->foreign_id;
484              
485             # optionally clean and/or degap seq
486             $seq = $seq->clone if $clean || $degap; # clone seq only if needed
487             $seq->gapify($gap) if $clean;
488             $seq->degap if $degap;
489              
490             my $width = $seq->seq_len;
491             $chunk = $width if $nowrap; # optionally disable wrap
492              
493             my $str = $seq->wrapped_str($chunk);
494             $str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned
495             print {$out} $str;
496             }
497              
498             return;
499             }
500              
501              
502             sub temp_fasta {
503 9     9 1 25 my $self = shift;
504 9   50     31 my $args = shift // {}; # HashRef (should not be empty...)
505              
506             # abbreviate ids (possibly using a custom prefix)
507 9         23 my $prefix = $args->{id_prefix};
508 9         22 delete $args->{id_prefix}; # not necessarily required
509 9         54 my $mapper = $self->std_mapper($prefix);
510 9         51 $self->shorten_ids($mapper);
511              
512             # write temporary .fasta file using standard abbr_ids
513             # ...and restore long_ids afterwards
514 9         94 my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta');
515 9         5348 $self->store_fasta($out->filename, $args);
516 9         80 $self->restore_ids($mapper);
517             ### filename: $out->filename
518              
519 9 100       77 return wantarray ? ($out->filename, $mapper) : $out->filename;
520             }
521              
522              
523             # TODO: add an option to remove trailing '_' in ids?
524              
525             sub load_phylip {
526 105     105 1 7455 my $class = shift;
527 105         241 my $infile = shift;
528              
529 105         603 open my $in, '<', $infile;
530              
531 105         37878 my $ali = $class->new( file => $infile );
532 105         254 my $seq_n;
533             my $site_n;
534 105         196 my $n = 0;
535              
536             LINE:
537 105         76005 while (my $line = <$in>) {
538 8557         18371 chomp $line;
539              
540             # skip empty lines
541 8557 100       44127 next LINE if $line =~ $EMPTY_LINE;
542              
543             # get matrix dimensions
544 8524 100       32752 if ($line =~ $DIM_LINE) {
545 105         548 ($seq_n, $site_n) = ($1, $2);
546 105         1623 next LINE;
547             }
548              
549             # process regular line (identifier is optional)
550 8419         43458 my ($seq_id, $seq) = $line =~ $PHY_LINE;
551              
552             # should never happen...
553 8419 50       19424 croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!"
554             unless $seq;
555              
556             # delete optional spaces
557 8419         18587 $seq =~ tr/ //d;
558              
559             # process first seq block
560 8419 100       278129 if ($ali->count_seqs < $seq_n) {
561              
562             # seq_id are mandatory in first block
563 8089 50       17295 croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!"
564             unless $seq_id;
565              
566             # store first seq chunk along with seq_id
567 8089         201665 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
568 8089         249422 $ali->add_seq($new_seq);
569             }
570              
571             # process remaining seq blocks
572             else {
573 330         10763 my $curr_seq = $ali->get_seq($n);
574              
575             # elongate current with partial seq chunk mistaken as seq_id
576 330 100 100     1038 if ($seq_id && $seq_id ne $curr_seq->full_id) {
577 110         3689 $curr_seq->append_seq($seq_id);
578             }
579              
580             # elongate current seq with new seq chunk
581 330         11025 $curr_seq->append_seq($seq);
582             }
583              
584             # prepare reading of next seq or next block
585 8419 100       86033 $n = 0 if ++$n == $seq_n;
586             }
587              
588 105         643 my $width = $ali->width;
589 105 50       482 croak "[BMC] Error: unexpected site number in PHYLIP file: $width;"
590             . ' aborting!' if $width != $site_n;
591              
592 105         2940 return $ali;
593             }
594              
595             # PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html
596             # The information for each species follows, starting with a ten-character
597             # species name (which can include blanks and some punctuation marks), and
598             # continuing with the characters for that species. The name should be on the
599             # same line as the first character of the data for that species. (...) The
600             # name should be ten characters in length, filled out to the full ten
601             # characters by blanks if shorter. Any printable ASCII/ISO character is
602             # allowed in the name, except for parentheses ("(" and ")"), square brackets
603             # ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that
604             # in these sequences we have a blank every ten sites to make them easier to
605             # read: any such blanks are allowed. The blank line which separates the two
606             # groups of lines (the ones containing sites 1-20 and ones containing sites
607             # 21-39) may or may not be present.
608              
609             # PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command
610             # The input sequence file is a standard PHYLIP file of aligned DNA or
611             # amino-acids sequences. (...) The maximum number of characters in species
612             # name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed
613             # within sequence names because the Newick tree format makes special use of
614             # these symbols. However, blanks (one or more) MUST appear at the end of each
615             # species name.
616              
617             # TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf
618             # A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-",
619             # dots ".", underscores "_", or some of "/", "?", "*", "+". No space is
620             # allowed inside the names. The first name character must be a letter or an
621             # underscore. A sequence fragment may start behind position 10 after a
622             # sequence name, or anywhere in a line without a name.
623              
624             # TREE-PUZZLE [data/globin.a]
625             # 7 128
626             # HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN
627             # HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN
628             # HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS
629             # HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS
630             # MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS
631             # GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS
632             # LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN
633              
634             # RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf
635             # The input alignment format of RAxML is relaxed interleaved or sequential
636             # PHYLIP. "Relaxed" means that sequence names can be of variable length
637             # between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names
638             # taxon names that contain any form of whitespace character, like blanks,
639             # tabulators, and carriage returns, as well as one of the following prohibited
640             # characters: :,();[].
641              
642             # PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf
643             # Taxon names may contain more than 10 characters. Avoid special characters
644             # such as ';' or ')', which will create problems when parsing trees. The best
645             # is to only use letters, digits and '_'. Sequences can be interrupted by
646             # space and tab, but not by return characters. Be sure that the lengths of the
647             # sequences are all the same, and identical to the lengths indicated in the
648             # header. Sequences can be interleaved, in which case the taxon names may or
649             # may not be repeated in each block.
650              
651              
652             sub store_phylip {
653             my $self = shift;
654             my $outfile = shift;
655             my $args = shift // {}; # HashRef (should not be empty...)
656              
657             my $short = $args->{short} // 1;
658             my $clean = $args->{clean} // 0;
659             my $chunk = $args->{chunk} // 60;
660              
661             open my $out, '>', $outfile;
662              
663             # print data matrix dimensions
664             my $height = $self->count_seqs;
665             my $width = $self->width;
666             say {$out} $height . q{ } . $width;
667              
668             # setup id format (this will also affect block-like structure)
669             my $format = $short ? "%-10.10s %s\n" : "%s %s\n";
670             my $method = $short ? 'full_id' : 'foreign_id';
671             my $sep = $short ? q{ } : q{};
672              
673             # optionally disable wrapping
674             $chunk = $width if $chunk < 0;
675              
676             # optionally clean seq
677             my @seqs = $self->all_seqs;
678             @seqs = map { $_->clone->gapify('X') } @seqs if $clean;
679              
680             # output Ali in sequential or interleaved format
681             for (my $site = 0; $site < $width; $site += $chunk) {
682              
683             # leave empty line between chunks (but not after last chunk)
684             print {$out} "\n" if $site;
685              
686             for my $seq (@seqs) {
687              
688             # print 10-chars ids only once
689             # Note: full_ids are only truncated (use IdMapper for more)
690             my $id = $site == 0 ? $seq->$method : q{};
691              
692             # print seq chunks in 10-state blocks
693             # Note: We insert a space in the 11th column to ensure that more
694             # software packages can read the written files. PHYLIP itself will
695             # ignore this spurious space in the 'sequences'.
696             ( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg;
697             printf {$out} $format, $id, join $sep,
698             map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1;
699             }
700             }
701              
702             return;
703             }
704              
705              
706             sub load_stockholm {
707 2     2 1 33 my $class = shift;
708 2         5 my $infile = shift;
709              
710 2         10 open my $in, '<', $infile;
711              
712 2         445 my $ali = $class->new( file => $infile );
713 2         27 tie my %seq_for, 'Tie::IxHash';
714              
715             LINE:
716 2         1300 while (my $line = <$in>) {
717 44         294 chomp $line;
718              
719             # process GF comments
720 44 100       106 next LINE if $ali->is_comment($line, $STK_COMMENT);
721              
722             # skip empty lines and other comment lines
723 27 100 100     219 next LINE if $line =~ $EMPTY_LINE
724             || $line =~ $COMMENT_LINE;
725 11 100       33 last LINE if $line =~ $END_LINE;
726              
727             # parse stockholm seq
728 9         46 my ($seq_id, $seq) = $line =~ $STK_SEQ;
729              
730             # replace dots and gaps by stars
731 9         30 $seq =~ s/[\.\-]/*/xmsg;
732              
733             # elongate seq
734 9         44 $seq_for{$seq_id} .= $seq;
735             }
736              
737             # populate Ali object
738 2         16 while (my ($seq_id, $seq) = each %seq_for) {
739 9         395 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
740 9         302 $ali->add_seq($new_seq);
741             }
742              
743 2         72 return $ali;
744             }
745              
746              
747             sub instant_store {
748 1     1 1 6 my $class = shift;
749 1         2 my $outfile = shift;
750 1   50     5 my $args = shift // {}; # HashRef (should not be empty...)
751              
752 1         4 my $infile = $args->{infile};
753 1 50       6 croak '[BMC] Error: no infile specified for instant_store; aborting!'
754             unless $infile;
755              
756 1         8 my $coderef = $args->{coderef};
757 1 50       4 croak '[BMC] Error: no coderef specified for instant_store; aborting!'
758             unless $coderef;
759              
760 1         6 open my $in, '<', $infile;
761 1         349 open my $out, '>', $outfile;
762              
763 1         273 my $seq_id;
764             my $seq;
765              
766             LINE:
767 1         793 while (my $line = <$in>) {
768 92         144 chomp $line;
769              
770             # skip empty lines and process comments
771 92 100 66     493 next LINE if $line =~ $EMPTY_LINE
772             || $line =~ $COMMENT_LINE;
773              
774             # at each '>' char...
775 90         223 my ($defline) = $line =~ $DEF_LINE;
776 90 100       154 if ($defline) {
777              
778             # process current seq (if any)
779 3 100       9 if ($seq) {
780 2         80 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
781 2         3 print {$out} $coderef->($new_seq);
  2         12  
782 2         78 $seq = q{};
783             }
784              
785 3         9 $seq_id = $defline;
786 3         19 next LINE;
787             }
788              
789             # elongate current seq (seqs can be broken on several lines)
790 87         251 $seq .= $line;
791             }
792              
793             # process last seq (if any)
794 1 50       6 if ($seq) {
795 1         38 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
796 1         3 print {$out} $coderef->($new_seq);
  1         5  
797             }
798              
799 1         156 return;
800             }
801              
802              
803             sub instant_count {
804 1     1 1 443 my $class = shift;
805 1         4 my $infile = shift;
806              
807 1         2 my $seq_n = 0;
808              
809 1         7 open my $in, '<', $infile;
810 1 100       2860 while (<$in>) { $seq_n++ if m/^>/xms }
  502         2361  
811              
812 1         21 return $seq_n;
813             }
814              
815             __PACKAGE__->meta->make_immutable;
816             1;
817              
818             __END__
819              
820             =pod
821              
822             =head1 NAME
823              
824             Bio::MUST::Core::Ali - Multiple sequence alignment
825              
826             =head1 VERSION
827              
828             version 0.212670
829              
830             =head1 SYNOPSIS
831              
832             #!/usr/bin/env perl
833              
834             use Modern::Perl '2011';
835             # same as:
836             # use strict;
837             # use warnings;
838             # use feature qw(say);
839              
840             use Bio::MUST::Core;
841             use aliased 'Bio::MUST::Core::Ali';
842              
843             # read Ali form disk
844             my $ali = Ali->load('example.ali');
845              
846             # get some properties
847             say 'height: ' . $ali->height; # number of seqs
848             say 'width: ' . $ali->width; # number of sites
849             say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%)
850             say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl';
851              
852             # turn seqs to uppercase
853             $ali->uc_seqs;
854              
855             # filter out seqs with no species associated
856             my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } );
857             use aliased 'Bio::MUST::Core::IdList';
858             my $list = IdList->new( ids => \@seqs );
859             $ali->apply_list($list);
860              
861             # alternatively:
862             # $ali = Ali->new( seqs => \@seqs );
863              
864             # filter out gap-rich sites
865             $ali->idealize(0.2); # min 20% non-gaps per site
866              
867             # filter out non-informative sites
868             use aliased 'Bio::MUST::Core::SeqMask';
869             my $mask = SeqMask->parsimony_mask($ali);
870             $ali->apply_mask($mask);
871              
872             # write down reduced Ali to disk
873             $ali->store('example-uc-genus-sp-20.ali');
874             $ali->store_fasta('example-uc-genus-sp-20.fasta');
875              
876             # see below for additional methods
877             # ...
878              
879             =head1 DESCRIPTION
880              
881             This module implements the multiple sequence alignment (MSA) class and its
882             methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
883             Consequently, sequence ids do not absolutely need to be unique for it to
884             work (though id uniqueness helps a lot for sequence access and filtering).
885              
886             An Ali knows whether it contains nucleotide or protein sequences, whether
887             those are aligned or not, as well as its Seq count and exact width. All
888             these properties are introspected on the fly, which means that, while they
889             can be expensive to compute, they are always accurate.
890              
891             This is important because an Ali provides methods for inserting, deleting
892             and editing its Seq objects. Further, it does its best to maintain a semantic
893             distinction between true gap states (encoded as '*') and missing regions of
894             undetermined length (encoded as pure whitespace, for example at the end of a
895             short sequence).
896              
897             It also has methods for mapping its sequence ids (for example, before export
898             to the PHYLIP format), as well as methods to selectively retain sequences
899             and sites based on L<Bio::MUST::Core::IdList> and
900             L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
901             discards shared gaps and optionally removes gap-rich sites only due to a
902             tiny number of sequences.
903              
904             Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
905             meaningful whitespace and allows comment lines in the header) or be
906             imported/exported from/to several popular MSA formats, such as plain FASTA,
907             STOCKHOLM and PHYLIP.
908              
909             =head1 ATTRIBUTES
910              
911             =head2 seqs
912              
913             ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
914              
915             Most of the accessor methods described below are implemented by delegation
916             to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
917             delegation>. Their documentation thus heavily borrows from the corresponding
918             help pages.
919              
920             =head2 file
921              
922             L<Path::Class::File> object (optional)
923              
924             This optional attribute is initialized by class methods that C<load> an Ali
925             from disk. It is meant to improve the introspection capabilities of the Ali.
926             For now, this attribute is not used by the C<store> methods, though it might
927             provide them with a default value in the future.
928              
929             =head2 guessing
930              
931             Boolean (optional)
932              
933             By default, an Ali object tries to guess whether it is aligned or not by
934             looking for gap-like characters in any of its Seq objects (see
935             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
936              
937             When this smart behavior causes issues, one can disable it by unsetting this
938             boolean attribute (see C<dont_guess> and C<guess> accessor methods).
939              
940             =head2 comments
941              
942             ArrayRef of strings (optional)
943              
944             An Ali object is commentable, which means that it supports all the methods
945             pertaining to comment lines described in
946             L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
947              
948             =head1 CONSTRUCTORS
949              
950             =head2 new
951              
952             Default constructor (class method) returning a new Ali.
953              
954             use aliased 'Bio::MUST::Core::Ali';
955             my $ali1 = Ali->new();
956             my @seqs = $ali->all_seqs;
957             my $ali2 = Ali->new( seqs => \@seqs );
958              
959             This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
960             C<file>, C<guessing> and C<comments>.
961              
962             =head2 clone
963              
964             Creates a deep copy (a clone) of the Ali. Returns the copy.
965              
966             use aliased 'Bio::MUST::Core::Ali';
967             my $ali = Ali->load('input.ali');
968             my $ali_copy = $ali->clone;
969             # you can now mess with $ali_copy without affecting $ali
970              
971             This method does not accept any arguments.
972              
973             =head1 ACCESSORS
974              
975             =head2 add_seq
976              
977             Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
978             number of sequences of the Ali.
979              
980             use aliased 'Bio::MUST::Core::Seq';
981             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
982             $ali->add_seq($new_seq);
983              
984             This method accepts any number of arguments.
985              
986             =head2 get_seq
987              
988             Returns a sequence of the Ali by its index. You can also use negative index
989             numbers, just as with Perl's core array handling. If the specified sequence
990             does not exist, this method will return C<undef>.
991              
992             my $seq = $ali->get_seq($index);
993             croak "Seq $index not found in Ali!" unless defined $seq;
994              
995             This method accepts just one argument (and not an array slice).
996              
997             =head2 get_seq_with_id
998              
999             Returns a sequence of the Ali by its id. If the specified id is not unique,
1000             only the first matching sequence will be returned, whereas if no sequence
1001             exists for the specified id, this method will return C<undef>.
1002              
1003             my $id = 'Pyrus malus_3750@658052655';
1004             my $seq = $ali->get_seq_with_id($id);
1005             croak "Seq $id not found in Ali!" unless defined $seq;
1006              
1007             This method accepts just one argument.
1008              
1009             =head2 set_seq
1010              
1011             Given an index and a sequence, sets the specified Ali element to the
1012             sequence. This method returns the new sequence at C<$index>.
1013              
1014             use aliased 'Bio::MUST::Core::Seq';
1015             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1016             $ali->set_seq($index, $new_seq);
1017              
1018             This method requires two arguments.
1019              
1020             =head2 delete_seq
1021              
1022             Removes the sequence at the given index from the Ali. This method returns
1023             the deleted sequence. If the specified sequence does not exist, it will
1024             return C<undef> instead.
1025              
1026             $ali->delete_seq($index);
1027              
1028             This method requires one argument.
1029              
1030             =head2 insert_seq
1031              
1032             Inserts a new sequence into the Ali at the given index. This method returns
1033             the new sequence at C<$index>.
1034              
1035             use aliased 'Bio::MUST::Core::Seq';
1036             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1037             $ali->insert_seq($index, $new_seq);
1038              
1039             This method requires two arguments.
1040              
1041             =head2 all_seqs
1042              
1043             Returns all the sequences of the Ali (not an array reference).
1044              
1045             my @seqs = $ali->all_seqs;
1046              
1047             This method does not accept any arguments.
1048              
1049             =head2 first_seq
1050              
1051             Returns the first sequence of the Ali matching a given criterion, just like
1052             L<List::Util>'s C<first> function. This method requires a subroutine
1053             implementing the matching logic.
1054              
1055             # emulate get_seq_with_id method
1056             my $id2find = 'seq13';
1057             my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
1058              
1059             This method requires a single argument.
1060              
1061             =head2 filter_seqs
1062              
1063             Returns every sequence of the Ali matching a given criterion, just like
1064             Perl's core C<grep> function. This method requires a subroutine implementing
1065             the matching logic.
1066              
1067             # keep only long sequences (ignoring gaps and missing states)
1068             my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
1069              
1070             This method requires a single argument.
1071              
1072             =head2 all_new_seqs
1073              
1074             Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
1075              
1076             my @new_seqs = $ali->all_new_seqs;
1077              
1078             This method does not accept any arguments.
1079              
1080             =head2 all_but_new_seqs
1081              
1082             Returns all the sequences of the Ali except those tagged as #NEW# (not an array
1083             reference).
1084              
1085             my @preexisting_seqs = $ali->all_but_new_seqs;
1086              
1087             This method does not accept any arguments.
1088              
1089             =head2 all_seq_ids
1090              
1091             Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
1092             (not an array reference). This is only a convenience method.
1093              
1094             use Test::Deeply;
1095             my @ids1 = $ali->all_seq_ids;
1096             my @ids2 = map { $_->seq_id } $ali->all_seqs;
1097             is_deeply \@ids1, \@ids2, 'should be true';
1098             my @orgs = map { $_->org } @ids1;
1099              
1100             This method does not accept any arguments.
1101              
1102             =head2 filename
1103              
1104             Returns the stringified filename of the Ali.
1105              
1106             This method does not accept any arguments.
1107              
1108             =head2 guess
1109              
1110             Turn on the smart detection of gaps (see C<guessing> attribute above).
1111              
1112             This method does not accept any arguments.
1113              
1114             =head2 dont_guess
1115              
1116             Turn off the smart detection of gaps (see C<guessing> attribute above).
1117              
1118             use aliased 'Bio::MUST::Core::Ali';
1119             my $ali = Ali->load('ensembl.fasta');
1120             $ali->dont_guess;
1121              
1122             This method does not accept any arguments.
1123              
1124             =head1 PROPERTIES
1125              
1126             =head2 has_uniq_ids
1127              
1128             Returns true if all the sequence ids are unique.
1129              
1130             carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
1131              
1132             This method does not accept any arguments.
1133              
1134             =head2 is_protein
1135              
1136             Returns true if any sequence of the Ali looks like a protein. See
1137             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1138              
1139             say 'Your file includes nucleotide sequences' unless $ali->is_protein;
1140              
1141             This method does not accept any arguments.
1142              
1143             =head2 is_aligned
1144              
1145             Returns true if any sequence of the Ali appears to be aligned. See
1146             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1147              
1148             If the boolean attribute guessing is not set, always returns false.
1149              
1150             carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
1151              
1152             This method does not accept any arguments.
1153              
1154             =head2 count_seqs
1155              
1156             Returns the number of sequences of the Ali. The alias method C<height> is
1157             provided for convenience.
1158              
1159             my $height = $ali->count_seqs;
1160              
1161             This method does not accept any arguments.
1162              
1163             =head2 width
1164              
1165             Returns the width of the Ali (in characters). If the Ali is not aligned,
1166             returns the length of the longest sequence instead.
1167              
1168             To avoid potential bugs due to caching, this method dynamically computes the
1169             Ali width at each call. Moreover, the Ali is always uniformized (see below)
1170             beforehands to ensure accurate width value. Therefore, this method is
1171             expensive and should not be called repeatedly (e.g., in a loop condition).
1172              
1173             # you'd better looping through sites like this...
1174             my $width = $ali->width;
1175             for my $site (0..$width-1) {
1176             ...
1177             }
1178              
1179             This method does not accept any arguments.
1180              
1181             =head2 seq_len_stats
1182              
1183             Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
1184             The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
1185              
1186             This method does not accept any arguments.
1187              
1188             =head2 perc_miss
1189              
1190             Returns the percentage of missing (and gap-like) character states in the Ali.
1191              
1192             As this method internally calls C<Ali::width>, the remarks above also apply.
1193              
1194             my $miss_level = $ali->perc_miss;
1195              
1196             This method does not accept any arguments.
1197              
1198             =head1 MUTATORS
1199              
1200             =head2 uc_seqs
1201              
1202             Turn all the sequences of the Ali to uppercase and returns it.
1203              
1204             This method does not accept any arguments.
1205              
1206             =head2 recode_seqs
1207              
1208             Recode all the sequences of the Ali and returns it.
1209              
1210             use aliased 'Bio::MUST::Core::Ali';
1211             my $ali = Ali->load('biased.ali');
1212              
1213             # set up RY recoding for suppressing codon bias
1214             my %base_for = (
1215             A => 'A', G => 'A', # purines
1216             C => 'C', T => 'C', # pyrimidines
1217             );
1218              
1219             my $ali_rec = $ali->recode_seqs( \%base_for );
1220             $ali_rec->store('biased_ry.ali');
1221              
1222             This method requires one argument.
1223              
1224             =head2 degap_seqs
1225              
1226             Remove the gaps in all the sequences of the Ali and returns it.
1227              
1228             This method does not accept any arguments.
1229              
1230             =head2 spacify_seqs
1231              
1232             Spacifies all the sequences of the Ali and returns it. See the corresponding
1233             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1234             operation.
1235              
1236             This method does not accept any arguments.
1237              
1238             =head2 gapify_seqs
1239              
1240             Gapifies all the sequences of the Ali and returns it. See the corresponding
1241             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1242             operation.
1243              
1244             This method accepts an optional argument.
1245              
1246             =head2 trim_seqs
1247              
1248             Trims all the sequences of the Ali and returns it. See the corresponding
1249             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1250             operation.
1251              
1252             This method does not accept any arguments.
1253              
1254             =head2 pad_seqs
1255              
1256             Pads all the sequences of the Ali and returns it. See the corresponding
1257             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1258             operation.
1259              
1260             This method does not accept any arguments.
1261              
1262             =head2 uniformize
1263              
1264             Performs the three gap-cleaning operations in turn on all the sequences of
1265             the Ali and returns it, which ensures that it is semantically clean and
1266             rectangular.
1267              
1268             This is only a convenience method called internally by the Ali object before
1269             selected methods (such as storage-like methods). However, it might prove
1270             useful in some circumstances, hence it is not defined as private.
1271              
1272             use aliased 'Bio::MUST::Core::Ali';
1273             my $ali = Ali->load('input.ali');
1274             $ali->add_seq(...);
1275             # more editing of the Ali sequences
1276             $ali->uniformize;
1277              
1278             This method does not accept any arguments.
1279              
1280             =head2 clear_new_tags
1281              
1282             Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it.
1283              
1284             use aliased 'Bio::MUST::Core::Ali';
1285             my $ali = Ali->load('input-42.ali');
1286             $ali->clear_new_tags;
1287             my @new_seqs = $ali->all_new_seqs;
1288             # array should be empty
1289              
1290             This method does not accept any arguments.
1291              
1292             =head2 shorten_ids
1293              
1294             Replaces all the sequence ids of the Ali by their abbreviated forms as
1295             specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1296              
1297             Note that this method will work only if the sequence ids have not been
1298             already shortened or modified in any way since the creation of the IdMapper.
1299             Long ids without abbreviated forms in the IdMapper are left untouched.
1300              
1301             use aliased 'Bio::MUST::Core::Ali';
1302             use aliased 'Bio::MUST::Core::IdMapper';
1303             my $ali = Ali->load('input.ali');
1304             my $mapper = IdMapper->std_mapper($ali, 'lcl|seq');
1305             $ali->shorten_ids($mapper);
1306             $ali->store_fasta('input.4blast.fasta');
1307             # makeblastdb
1308              
1309             # Note: the temp_fasta method does exactly that
1310              
1311             This method requires one argument.
1312              
1313             =head2 restore_ids
1314              
1315             Replaces all the sequence ids of the Ali by their long forms as specified by
1316             the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1317              
1318             Note that this method will work only if the sequence ids have been previously
1319             abbreviated (see above) and have not been modified in any way since then.
1320             Again, abbreviated ids without long forms in the IdMapper are left untouched.
1321              
1322             use aliased 'Bio::MUST::Core::IdMapper';
1323             my $mapper = IdMapper->gi_mapper($ali);
1324             $ali->shorten_ids($mapper);
1325             ...
1326             $ali->restore_ids($mapper);
1327              
1328             This method requires one argument.
1329              
1330             =head2 apply_list
1331              
1332             Selectively retains or discards sequences from the Ali based on the content
1333             of the passed L<Bio::MUST::Core::IdList> object and returns the Ali.
1334              
1335             use aliased 'Bio::MUST::Core::IdList';
1336             my $list = IdList->load('interesting_seqs.idl');
1337             $ali->apply_list($list); # discard non-interesting seqs
1338              
1339             This method requires one argument.
1340              
1341             =head2 apply_mask
1342              
1343             Selectively retains or discards sites from the Ali based on the content of
1344             the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali.
1345              
1346             use aliased 'Bio::MUST::Core::SeqMask';
1347             my $variable_sites = SeqMask->variable_mask($ali);
1348             $ali->apply_mask($variable_sites); # discard constant sites
1349              
1350             This method requires one argument.
1351              
1352             =head2 idealize
1353              
1354             Computes and applies an ideal sequence mask to the Ali and returns it. This
1355             is only a convenience method.
1356              
1357             When invoked without arguments, it will discard the gaps that are
1358             universally shared by all the sequences. Otherwise, the provided argument
1359             corresponds to the threshold of the C<ideal_mask> method described in
1360             L<Bio::MUST::Core::SeqMask>.
1361              
1362             use aliased 'Bio::MUST::Core::IdList';
1363             my $fast_seqs = IdList->load('fast_evolving_seqs.idl');
1364             my $seqs2keep = $fast_seqs->negative_list($ali);
1365             $ali->apply_list($seqs2keep); # discard fast-evolving seqs
1366             $ali->idealize; # discard newly shared gaps caused by fast seqs
1367              
1368             use aliased 'Bio::MUST::Core::Ali';
1369             my $ali = Ali->load('hmm_based.ali');
1370             $ali->idealize(0.05); # discard insertions due to <5% of the seqs
1371              
1372             This method accepts an optional argument.
1373              
1374             =head1 MISC METHODS
1375              
1376             =head2 gapmiss_regex
1377              
1378             Returns a regular expression matching gaps and ambiguous or missing states.
1379             The exact regex returned depends on the type of sequences in the Ali (nucl. or
1380             proteins).
1381              
1382             my $regex = $ali->gapmiss_regex;
1383             my $first_seq = $ali->get_seq(0)->seq;
1384             my $gapmiss_n = $first_seq =~ m/($regex)/xmsg;
1385             say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites";
1386              
1387             This method does not accept any arguments.
1388              
1389             =head2 map_coords
1390              
1391             Converts a set of site positions from Ali coordinates to coordinates of the
1392             specified sequence (thereby ignoring positions due to gaps). Returns the
1393             converted sites in sequence coordinates as an array refrence.
1394              
1395             use aliased 'Bio::MUST::Core::Ali';
1396             my $ali = Ali->load('input.ali');
1397             my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125';
1398             my $ali_coords = [ 4, 25, 73, 89, 104, 116 ];
1399             my $seq_coords = $ali->map_coords($id, $ali_coords);
1400             # $seq_coords is [ 3, 23, 59, 71, 71, 74 ]
1401              
1402             This method requires two arguments: the id of a sequence and an array
1403             reference of input sites in Ali coordinates.
1404              
1405             =head1 I/O METHODS
1406              
1407             =head2 load
1408              
1409             Class method (constructor) returning a new Ali read from disk. This method
1410             will transparently import plain FASTA files in addition to the MUST
1411             pseudo-FASTA format (ALI files).
1412              
1413             use Test::Deeply;
1414             use aliased 'Bio::MUST::Core::Ali';
1415             my $ali1 = Ali->load('example.ali');
1416             my $ali2 = Ali->load('example.fasta');
1417             my @seqs1 = $ali1->all_seqs;
1418             my @seqs2 = $ali2->all_seqs;
1419             is_deeply, \@seqs1, \@seqs2, 'should be true';
1420              
1421             This method requires one argument.
1422              
1423             =head2 store
1424              
1425             Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files).
1426              
1427             Note that the ALI format is only used when the suffix of the outfile name is
1428             '.ali'. In all other cases (including lack of suffix), this method
1429             automatically forwards the call to C<store_fasta>.
1430              
1431             $ali->store('output.ali');
1432             # output.ali is written in ALI format
1433             $ali->store('output.fasta');
1434             # output.fasta is written in FASTA format
1435              
1436             This method requires one argument (but see C<store_fasta> in case of automatic
1437             forwarding of the method call).
1438              
1439             =head2 store_fasta
1440              
1441             Writes the Ali to disk in the plain FASTA format.
1442              
1443             For compatibility purposes, this method automatically fetches sequence ids
1444             using the C<foreign_id> method instead of the native C<full_id> method, both
1445             described in L<Bio::MUST::Core::SeqId>.
1446              
1447             $ali->store_fasta( 'output.fasta' );
1448             $ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} );
1449              
1450             This method requires one argument and accepts a second optional argument
1451             controlling the output format. It is a hash reference that may contain one
1452             or more of the following keys:
1453              
1454             - clean: replace all ambiguous and missing states by C<X> (default: false)
1455             - degap: boolean value controlling degapping (default: false)
1456             - chunk: line width (default is 60 chars; negative values means no wrap)
1457              
1458             Finally, it is possible to fine-tune the behavior of the C<clean> option by
1459             providing another character than C<X> through the C<gapify> key. This can be
1460             useful to replace all ambiguous and missing states by gaps, as shown below:
1461              
1462             $ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } );
1463              
1464             =head2 temp_fasta
1465              
1466             Writes a temporary copy of the Ali to disk in the plain FASTA format using
1467             numeric sequence ids and returns the name of the temporary file. This is
1468             only a convenience method.
1469              
1470             In list context, returns the IdMapper object along with temporary filename.
1471              
1472             my $infile = $ali->temp_fasta( { degap => 1 } );
1473             my $output = `script.sh $infile`;
1474             ...
1475              
1476             This method accepts the same optional argument hash as C<store_fasta>.
1477             However, an additional option (C<id_prefix>) is available to control the way
1478             abbreviated sequence ids are prefixed by the C<std_mapper> method (see
1479             L<Bio::MUST::Core::Listable>).
1480              
1481             my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } );
1482             my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } );
1483              
1484             =head2 load_phylip
1485              
1486             Class method (constructor) returning a new Ali read from a file in the
1487             PHYLIP format. Both sequential and interleaved formats are supported.
1488              
1489             The only constraint is that sequence ids MUST NOT contain whitespace and be
1490             followed by at least one whitespace character. This means that some old-school
1491             PHYLIP files not following this convention will not be processed correctly.
1492              
1493             When using the interleaved format, sequence ids may or may not be repeated in
1494             each block.
1495              
1496             use aliased 'Bio::MUST::Core::Ali';
1497             my $ali = Ali->load_phylip('phylip.phy');
1498             say $ali->count_seqs;
1499             say $ali->width;
1500              
1501             # outputs:
1502             # 10
1503             # 709
1504              
1505             This method requires one argument.
1506              
1507             =head2 store_phylip
1508              
1509             Writes the Ali to disk in the interleaved (or sequential) PHYLIP format.
1510              
1511             To ensure maximal flexibility, this method fetches sequence ids using the
1512             native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but
1513             truncates them to 10 characters, as expected by the original PHYLIP software
1514             package. No other tinkering is carried out on the ids. Thus, if the ids
1515             contain whitespace or are not unique in their 10 first characters, it is
1516             advised to first map them using one of the constructors in
1517             L<Bio::MUST::Core::IdMapper>.
1518              
1519             use aliased 'Bio::MUST::Core::Ali';
1520             use aliased 'Bio::MUST::Core::IdMapper';
1521             my $ali = Ali->load('input.ali');
1522             my $mapper = IdMapper->std_mapper($ali);
1523             $ali->shorten_ids($mapper);
1524             $ali->store_phylip( 'input.phy', { chunk => 50 } );
1525              
1526             This method requires one argument and accepts a second optional argument
1527             controlling the output format. It is a hash reference that may contain one
1528             or more of the following keys:
1529              
1530             - short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes)
1531             - clean: replace all ambiguous and missing states by 'X' (default: false)
1532             - chunk: line width (default: 60 chars; negative values means no wrap)
1533              
1534             To store the Ali in PHYLIP sequential format, specify a negative chunk (-1).
1535              
1536             =head2 load_stockholm
1537              
1538             Class method (constructor) returning a new Ali read from a file in the
1539             STOCKHOLM format. =GF comments are retained (see above) but not the other
1540             comment classes (=GS, =GR and =GC).
1541              
1542             use aliased 'Bio::MUST::Core::Ali';
1543             my $ali = Ali->load('upsk.stockholm');
1544             say $ali->header;
1545              
1546             # outputs:
1547             # ID UPSK
1548             # SE Predicted; Infernal
1549             # SS Published; PMID 9223489
1550             # RN [1]
1551             # RM 9223489
1552             # RT The role of the pseudoknot at the 3' end of turnip yellow mosaic
1553             # RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA
1554             # RT polymerase.
1555             # RA Deiman BA, Kortlever RM, Pleij CW;
1556             # RL J Virol 1997;71:5990-5996.
1557              
1558             This method requires one argument.
1559              
1560             =head2 instant_store
1561              
1562             Class method intended to transform a large sequence file read from disk
1563             without loading it in memory. This method will transparently process plain
1564             FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1565              
1566             my $chunk = 200;
1567              
1568             my $split = sub {
1569             my $seq = shift;
1570             my $base_id = ( split /\s+/xms, $seq->full_id )[0];
1571             my $max_pos = $seq->seq_len - $chunk;
1572             my $n = 0;
1573             my $out_str;
1574             for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) {
1575             $out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos,
1576             $pos + $chunk <= $max_pos ? $chunk : 2 * $chunk
1577             ) . "\n";
1578             }
1579             return $out_str;
1580             };
1581              
1582             use aliased 'Bio::MUST::Core::Ali';
1583              
1584             Ali->instant_store(
1585             'outfile.fasta', { infile => 'infile.fasta', coderef => $split }
1586             );
1587              
1588             This method requires two arguments. The sercond is a hash reference that must
1589             contain the following keys:
1590             - infile: input sequence file
1591             - coderef: subroutine implementing the transforming logic
1592              
1593             =head2 instant_count
1594              
1595             Class method returning the number of seqs in any sequence file read from disk
1596             without loading it in memory. This method will transparently process plain
1597             FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1598              
1599             use aliased 'Bio::MUST::Core::Ali';
1600             my $seq_n = Ali->instant_count('input.ali');
1601             say $seq_n;
1602              
1603             =head1 ALIASES
1604              
1605             =head2 height
1606              
1607             Alias for C<count_seqs> method. For API consistency.
1608              
1609             =head1 AUTHOR
1610              
1611             Denis BAURAIN <denis.baurain@uliege.be>
1612              
1613             =head1 CONTRIBUTORS
1614              
1615             =for stopwords Catherine COLSON Arnaud DI FRANCO
1616              
1617             =over 4
1618              
1619             =item *
1620              
1621             Catherine COLSON <ccolson@doct.uliege.be>
1622              
1623             =item *
1624              
1625             Arnaud DI FRANCO <arnaud.difranco@gmail.com>
1626              
1627             =back
1628              
1629             =head1 COPYRIGHT AND LICENSE
1630              
1631             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
1632              
1633             This is free software; you can redistribute it and/or modify it under
1634             the same terms as the Perl 5 programming language system itself.
1635              
1636             =cut