File Coverage

blib/lib/Bio/MUST/Core/Ali.pm
Criterion Covered Total %
statement 278 279 99.6
branch 65 76 85.5
condition 14 18 77.7
subroutine 56 57 98.2
pod 33 33 100.0
total 446 463 96.3


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::Ali;
2             # ABSTRACT: Multiple sequence alignment
3             # CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4             # CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5             $Bio::MUST::Core::Ali::VERSION = '0.212650';
6 17     17   13250 use Moose;
  17         50  
  17         128  
7 17     17   89907 use namespace::autoclean;
  17         46  
  17         156  
8              
9 17     17   1573 use autodie;
  17         40  
  17         141  
10 17     17   90486 use feature qw(say);
  17         44  
  17         1508  
11              
12             # use Smart::Comments;
13              
14 17     17   122 use Carp;
  17         51  
  17         1195  
15 17     17   127 use File::Temp;
  17         47  
  17         1461  
16 17     17   123 use List::AllUtils qw(uniq indexes sum0);
  17         45  
  17         1003  
17 17     17   122 use Path::Class qw(file);
  17         51  
  17         807  
18 17     17   107 use POSIX qw(ceil floor);
  17         48  
  17         135  
19 17     17   11910 use Statistics::Descriptive;
  17         376433  
  17         661  
20 17     17   9199 use Tie::IxHash;
  17         42774  
  17         963  
21              
22 17     17   139 use Bio::MUST::Core::Types;
  17         50  
  17         514  
23 17     17   112 use Bio::MUST::Core::Constants qw(:gaps :files);
  17         44  
  17         3823  
24 17     17   148 use aliased 'Bio::MUST::Core::Seq';
  17         47  
  17         183  
25 17     17   4149 use aliased 'Bio::MUST::Core::SeqMask';
  17         46  
  17         73  
26              
27             # TODO: add information about methods available in Ali-like objects
28              
29             # ATTRIBUTES
30              
31              
32             has 'seqs' => (
33             traits => ['Array'],
34             is => 'ro',
35             isa => 'ArrayRef[Bio::MUST::Core::Seq]',
36             default => sub { [] },
37             writer => '_set_seqs',
38             handles => {
39             add_seq => 'push',
40             get_seq => 'get',
41             set_seq => 'set',
42             delete_seq => 'delete',
43             insert_seq => 'insert',
44             count_seqs => 'count',
45             all_seqs => 'elements',
46             first_seq => 'first',
47             filter_seqs => 'grep',
48             },
49             );
50              
51              
52             has 'file' => (
53             is => 'ro',
54             isa => 'Bio::MUST::Core::Types::File',
55             default => 'untitled.ali',
56             coerce => 1,
57             handles => {
58             filename => 'stringify',
59             },
60             );
61              
62              
63             has 'guessing' => (
64             traits => ['Bool'],
65             is => 'ro',
66             isa => 'Bool',
67             default => 1,
68             handles => {
69             dont_guess => 'unset',
70             guess => 'set',
71             },
72             );
73              
74              
75             with 'Bio::MUST::Core::Roles::Commentable',
76             'Bio::MUST::Core::Roles::Listable';
77             with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
78              
79             # CONSTRUCTORS
80              
81              
82              
83             sub clone {
84 1     1 1 4 my $self = shift;
85              
86             return $self->new(
87             comments => [ $self->all_comments ],
88 1         48 seqs => [ map { $_->clone } $self->all_seqs ],
  250         698  
89             file => file( $self->filename ),
90             guessing => $self->guessing,
91             );
92             }
93              
94             # ACCESSORS
95              
96              
97             sub get_seq_with_id {
98 4     4 1 2053 my $self = shift;
99 4         9 my $id = shift;
100              
101 4     14   204 my $seq = $self->first_seq( sub { $_->full_id eq $id } );
  14         47  
102 4 100       46 carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
103             unless $seq;
104              
105 4         433 return $seq;
106             }
107              
108              
109             sub all_new_seqs {
110 6     6 1 18 return shift->filter_seqs( sub { $_->is_new } );
  2     2   87  
111             }
112              
113              
114             sub all_but_new_seqs {
115 3     3 1 9 return shift->filter_seqs( sub { not $_->is_new } );
  1     1   44  
116             }
117              
118              
119             sub all_seq_ids {
120 105     105 1 3878 my $self = shift;
121 105         4197 return map { $_->seq_id } $self->all_seqs;
  1510         37776  
122             }
123              
124             # PROPERTIES
125              
126              
127             sub has_uniq_ids {
128 64     64 1 1689 my $self = shift;
129              
130 64         221 my @ids = map { $_->full_id } $self->all_seq_ids;
  941         23421  
131 64 100       2277 return 1 if $self->count_seqs == uniq @ids;
132 7         234 return 0;
133             }
134              
135              
136             sub is_protein {
137 77     77 1 3162 my $self = shift;
138 77 100   96   3207 return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
  96         503  
139 5         28 return 0;
140             }
141              
142              
143             sub is_aligned {
144 310     310 1 570 my $self = shift;
145 310 100       10937 return 0 if not $self->guessing;
146 307 100   8588   11833 return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
  8588         17936  
147 116         849 return 0;
148             }
149              
150              
151             sub width {
152 189     189 1 447 my $self = shift;
153 189 100       607 $self->uniformize if $self->is_aligned; # pad seqs for robustness
154 189         878 return $self->_max_seq_len;
155             }
156              
157             sub _max_seq_len {
158 365     365   685 my $self = shift;
159 365         12163 my @lengths = map { $_->seq_len } $self->all_seqs;
  12841         406292  
160 365   50     5088 return (List::AllUtils::max @lengths) // 0; # to avoid warnings
161             }
162              
163              
164             sub seq_len_stats {
165 1     1 1 13 my $self = shift;
166              
167 1         41 my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
  10         32  
168 1         23 my $stat = Statistics::Descriptive::Full->new;
169 1         160 $stat->add_data( \@lengths );
170 1         215 my @quantiles = map { $stat->quantile($_) } 0..4;
  5         279  
171              
172 1         65 return @quantiles;
173             }
174              
175              
176             sub perc_miss {
177 2     2 1 6 my $self = shift;
178              
179 2         86 my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
  255         601  
180 2 50       21 return 0.0 unless $n;
181              
182 2         9 my $total = $self->width * $self->count_seqs;
183 2         20 return 100.0 * ($total - $n) / $total;
184             }
185              
186             # MUTATORS
187              
188              
189             sub uc_seqs {
190 1     1 1 8 my $self = shift;
191              
192 1         40 $_->uc for $self->all_seqs;
193 1         4 return $self;
194             }
195              
196              
197             sub recode_seqs { ## no critic (RequireArgUnpacking)
198 1     1 1 27 my $self = shift;
199              
200 1         40 $_->recode(@_) for $self->all_seqs;
201 1         8 return $self;
202             }
203              
204              
205             sub degap_seqs {
206 1     1 1 8 my $self = shift;
207              
208 1         41 $_->degap for $self->all_seqs;
209 1         4 return $self;
210             }
211              
212              
213             sub spacify_seqs {
214 176     176 1 307 my $self = shift;
215              
216 176         5727 $_->spacify for $self->all_seqs;
217 176         609 return $self;
218             }
219              
220              
221             sub gapify_seqs { ## no critic (RequireArgUnpacking)
222 2     2 1 14 my $self = shift;
223 2         78 $_->gapify(@_) for $self->all_seqs;
224 2         6 return $self;
225             }
226              
227              
228             sub trim_seqs {
229 176     176 1 325 my $self = shift;
230              
231 176         5755 $_->trim for $self->all_seqs;
232 176         637 return $self;
233             }
234              
235              
236             sub pad_seqs {
237 176     176 1 344 my $self = shift;
238              
239 176         522 my $bound = $self->_max_seq_len;
240 176         5904 $_->pad_to($bound) for $self->all_seqs;
241 176         629 return $self;
242             }
243              
244              
245             sub uniformize {
246 175     175 1 362 my $self = shift;
247              
248             # TODO: profile code (triggers?) to avoid useless multiple calls
249              
250 175         612 $self->spacify_seqs;
251 175         664 $self->trim_seqs;
252 175         644 $self->pad_seqs;
253              
254 175         325 return $self;
255             }
256              
257              
258             sub clear_new_tags {
259 1     1 1 3 my $self = shift;
260              
261 1         41 $_->clear_new_tag for $self->all_seqs;
262 1         3 return $self;
263             }
264              
265              
266             sub shorten_ids { ## no critic (RequireArgUnpacking)
267 12     12 1 70 return shift->_change_ids_(1, @_);
268             }
269              
270              
271             sub restore_ids { ## no critic (RequireArgUnpacking)
272 10     10 1 40 return shift->_change_ids_(0, @_);
273             }
274              
275             sub _change_ids_ {
276 22     22   48 my $self = shift;
277 22         33 my $abbr = shift;
278 22         51 my $mapper = shift;
279              
280 22         834 for my $seq ($self->all_seqs) {
281 133 100       594 my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id )
282             : $mapper->long_id_for( $seq->full_id );
283 133 50       4228 $seq->set_seq_id($new_id) if $new_id;
284             } # Note: leave id alone if not found
285              
286 22         99 return $self;
287             }
288              
289              
290             sub apply_list {
291 1     1 1 10 my $self = shift;
292 1         3 my $list = shift;
293              
294             # loop through seqs from bottom to top
295             # ... and delete seq from Ali if not listed in list
296 1         45 my $seq_n = $self->count_seqs;
297              
298 1         8 for (my $i = $seq_n-1; $i >= 0; $i--) {
299 250 100       8337 $self->delete_seq($i)
300             unless $list->is_listed( $self->get_seq($i)->full_id );
301             }
302              
303 1         7 return $self;
304             }
305              
306              
307             sub _premask_check {
308 40     40   450 my $self = shift;
309 40         77 my $mask = shift;
310              
311             # warn of unaligned Ali
312 40 50       133 carp '[BMC] Note: Ali does not look aligned!'
313             . ' This might not be an issue if seqs are ultra-conserved!'
314             unless $self->is_aligned;
315 40         245 $self->uniformize;
316              
317             # warn of empty SeqMask
318             carp '[BMC] Note: applying this mask will result in a zero-width Ali!'
319 40 50   1845   1666 unless List::AllUtils::any { $_ } $mask->all_states;
  1845         2339  
320              
321             # check that SeqMask is compatible with Ali
322             # potential bugs could come from constant sites etc
323 40         13732 my $a_width = $self->width;
324 40         1480 my $m_width = $mask->mask_len;
325 40 100       208 carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!"
326             . ' This might not be an issue if the mask results from blocks.'
327             unless $a_width == $m_width;
328              
329 40         973 return; # return values of before subs are ignored
330             }
331              
332             before 'apply_mask' => \&_premask_check;
333              
334             sub apply_mask {
335             my $self = shift;
336             my $mask = shift;
337              
338             # select sites for each seq using a precomputed array slice
339             my @indexes = indexes { $_ } $mask->all_states;
340             $_->_set_seq( join q{}, ( $_->all_states )[@indexes] )
341             for $self->all_seqs;
342              
343             return $self;
344             }
345              
346              
347             sub idealize { ## no critic (RequireArgUnpacking)
348 13     13 1 94 my $self = shift;
349 13         123 return $self->apply_mask( SeqMask->ideal_mask($self, @_) );
350             }
351              
352             # MISC METHODS
353              
354              
355             sub gapmiss_regex {
356 73 100   73 1 1149 return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS;
357             }
358              
359              
360             sub map_coords {
361 1     1 1 23 my $self = shift;
362 1         2 my $id = shift;
363 1         3 my $coords_in = shift;
364              
365 1         10 my $seq = $self->get_seq_with_id($id);
366 1         5 my @states = $seq->all_states;
367              
368 1         4 my $count = 0;
369 1         4 my @index;
370 1         5 for my $state (@states) {
371 130 100       429 $count++ if $state !~ $GAP;
372 130         219 push @index, $count;
373             }
374              
375 1         14 my @coords_out = map { $index[$_-1] } @{$coords_in};
  6         16  
  1         4  
376              
377 1         16 return \@coords_out;
378             }
379              
380             # ALIASES
381              
382              
383             sub height {
384 0     0 1 0 return shift->count_seqs;
385             }
386              
387             # I/O METHODS
388              
389              
390             sub load {
391 69     69 1 26804 my $class = shift;
392 69         156 my $infile = shift;
393              
394 69         348 open my $in, '<', $infile;
395              
396 69         31930 my $ali = $class->new( file => $infile );
397 69         378 my $seq_id;
398             my $seq;
399              
400             LINE:
401 69         18289 while (my $line = <$in>) {
402 4091         7663 chomp $line;
403              
404             # skip empty lines and process comments
405 4091 100 100     24790 next LINE if $line =~ $EMPTY_LINE
406             || $ali->is_comment($line);
407              
408             # at each '>' char...
409 3944         14504 my ($defline) = $line =~ $DEF_LINE;
410 3944 100       8481 if ($defline) {
411              
412             # add current seq to ali (if any)
413 1662 100       3330 if ($seq) {
414 1593         47318 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
415 1593         55059 $ali->add_seq($new_seq);
416 1593         2895 $seq = q{};
417             }
418              
419 1662         2895 $seq_id = $defline;
420 1662         25696 next LINE;
421             }
422              
423             # elongate current seq (seqs can be broken on several lines)
424 2282         10945 $seq .= $line;
425             }
426              
427             # add last seq to ali (if any)
428 69 50       287 if ($seq) {
429 69         2361 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
430 69         2393 $ali->add_seq($new_seq);
431             }
432              
433 69         1605 return $ali;
434             }
435              
436              
437             before qr{\Astore}xms => sub {
438             my $self = shift;
439              
440             # perform pre-storage duties
441             carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids;
442             $self->uniformize if $self->is_aligned;
443              
444             return;
445             };
446              
447              
448             sub store { ## no critic (RequireArgUnpacking)
449             my $self = shift;
450             my $outfile = shift;
451              
452             # automatically redirect to store_fasta for non-Ali outfile names
453             return $self->store_fasta($outfile, @_) # note the currying
454             unless $outfile =~ $ALI_SUFFIX;
455              
456             open my $out, '>', $outfile;
457              
458             print {$out} $self->header;
459             for my $seq ($self->all_seqs) {
460             say {$out} '>' . $seq->full_id;
461             say {$out} $seq->seq;
462             }
463              
464             return;
465             }
466              
467              
468             sub store_fasta {
469             my $self = shift;
470             my $outfile = shift;
471             my $args = shift // {}; # HashRef (should not be empty...)
472              
473             my $degap = $args->{degap} // 0;
474             my $clean = $args->{clean} // 0;
475             my $gap = $args->{gapify} // 'X';
476             my $chunk = $args->{chunk} // 60;
477             my $nowrap = $chunk < 0 ? 1 : 0;
478             my $is_aligned = $self->is_aligned;
479              
480             open my $out, '>', $outfile;
481              
482             for my $seq ($self->all_seqs) {
483             say {$out} '>' . $seq->foreign_id;
484              
485             # optionally clean and/or degap seq
486             $seq = $seq->clone if $clean || $degap; # clone seq only if needed
487             $seq->gapify($gap) if $clean;
488             $seq->degap if $degap;
489              
490             my $width = $seq->seq_len;
491             $chunk = $width if $nowrap; # optionally disable wrap
492              
493             my $str = $seq->wrapped_str($chunk);
494             $str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned
495             print {$out} $str;
496             }
497              
498             return;
499             }
500              
501              
502             sub temp_fasta {
503 9     9 1 31 my $self = shift;
504 9   50     28 my $args = shift // {}; # HashRef (should not be empty...)
505              
506             # abbreviate ids (possibly using a custom prefix)
507 9         22 my $prefix = $args->{id_prefix};
508 9         16 delete $args->{id_prefix}; # not necessarily required
509 9         49 my $mapper = $self->std_mapper($prefix);
510 9         68 $self->shorten_ids($mapper);
511              
512             # write temporary .fasta file using standard abbr_ids
513             # ...and restore long_ids afterwards
514 9         110 my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta');
515 9         4999 $self->store_fasta($out->filename, $args);
516 9         86 $self->restore_ids($mapper);
517             ### filename: $out->filename
518              
519 9 100       80 return wantarray ? ($out->filename, $mapper) : $out->filename;
520             }
521              
522              
523             # TODO: add an option to remove trailing '_' in ids?
524              
525             sub load_phylip {
526 105     105 1 8041 my $class = shift;
527 105         207 my $infile = shift;
528              
529 105         614 open my $in, '<', $infile;
530              
531 105         40216 my $ali = $class->new( file => $infile );
532 105         210 my $seq_n;
533             my $site_n;
534 105         207 my $n = 0;
535              
536             LINE:
537 105         40010 while (my $line = <$in>) {
538 8557         19185 chomp $line;
539              
540             # skip empty lines
541 8557 100       46116 next LINE if $line =~ $EMPTY_LINE;
542              
543             # get matrix dimensions
544 8524 100       35714 if ($line =~ $DIM_LINE) {
545 105         621 ($seq_n, $site_n) = ($1, $2);
546 105         1670 next LINE;
547             }
548              
549             # process regular line (identifier is optional)
550 8419         46986 my ($seq_id, $seq) = $line =~ $PHY_LINE;
551              
552             # should never happen...
553 8419 50       22160 croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!"
554             unless $seq;
555              
556             # delete optional spaces
557 8419         18969 $seq =~ tr/ //d;
558              
559             # process first seq block
560 8419 100       299822 if ($ali->count_seqs < $seq_n) {
561              
562             # seq_id are mandatory in first block
563 8089 50       18198 croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!"
564             unless $seq_id;
565              
566             # store first seq chunk along with seq_id
567 8089         220680 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
568 8089         271671 $ali->add_seq($new_seq);
569             }
570              
571             # process remaining seq blocks
572             else {
573 330         10899 my $curr_seq = $ali->get_seq($n);
574              
575             # elongate current with partial seq chunk mistaken as seq_id
576 330 100 100     965 if ($seq_id && $seq_id ne $curr_seq->full_id) {
577 110         3578 $curr_seq->append_seq($seq_id);
578             }
579              
580             # elongate current seq with new seq chunk
581 330         10634 $curr_seq->append_seq($seq);
582             }
583              
584             # prepare reading of next seq or next block
585 8419 100       87783 $n = 0 if ++$n == $seq_n;
586             }
587              
588 105         630 my $width = $ali->width;
589 105 50       470 croak "[BMC] Error: unexpected site number in PHYLIP file: $width;"
590             . ' aborting!' if $width != $site_n;
591              
592 105         2733 return $ali;
593             }
594              
595             # PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html
596             # The information for each species follows, starting with a ten-character
597             # species name (which can include blanks and some punctuation marks), and
598             # continuing with the characters for that species. The name should be on the
599             # same line as the first character of the data for that species. (...) The
600             # name should be ten characters in length, filled out to the full ten
601             # characters by blanks if shorter. Any printable ASCII/ISO character is
602             # allowed in the name, except for parentheses ("(" and ")"), square brackets
603             # ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that
604             # in these sequences we have a blank every ten sites to make them easier to
605             # read: any such blanks are allowed. The blank line which separates the two
606             # groups of lines (the ones containing sites 1-20 and ones containing sites
607             # 21-39) may or may not be present.
608              
609             # PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command
610             # The input sequence file is a standard PHYLIP file of aligned DNA or
611             # amino-acids sequences. (...) The maximum number of characters in species
612             # name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed
613             # within sequence names because the Newick tree format makes special use of
614             # these symbols. However, blanks (one or more) MUST appear at the end of each
615             # species name.
616              
617             # TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf
618             # A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-",
619             # dots ".", underscores "_", or some of "/", "?", "*", "+". No space is
620             # allowed inside the names. The first name character must be a letter or an
621             # underscore. A sequence fragment may start behind position 10 after a
622             # sequence name, or anywhere in a line without a name.
623              
624             # TREE-PUZZLE [data/globin.a]
625             # 7 128
626             # HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN
627             # HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN
628             # HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS
629             # HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS
630             # MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS
631             # GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS
632             # LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN
633              
634             # RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf
635             # The input alignment format of RAxML is relaxed interleaved or sequential
636             # PHYLIP. "Relaxed" means that sequence names can be of variable length
637             # between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names
638             # taxon names that contain any form of whitespace character, like blanks,
639             # tabulators, and carriage returns, as well as one of the following prohibited
640             # characters: :,();[].
641              
642             # PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf
643             # Taxon names may contain more than 10 characters. Avoid special characters
644             # such as ';' or ')', which will create problems when parsing trees. The best
645             # is to only use letters, digits and '_'. Sequences can be interrupted by
646             # space and tab, but not by return characters. Be sure that the lengths of the
647             # sequences are all the same, and identical to the lengths indicated in the
648             # header. Sequences can be interleaved, in which case the taxon names may or
649             # may not be repeated in each block.
650              
651              
652             sub store_phylip {
653             my $self = shift;
654             my $outfile = shift;
655             my $args = shift // {}; # HashRef (should not be empty...)
656              
657             my $short = $args->{short} // 1;
658             my $clean = $args->{clean} // 0;
659             my $chunk = $args->{chunk} // 60;
660              
661             open my $out, '>', $outfile;
662              
663             # print data matrix dimensions
664             my $height = $self->count_seqs;
665             my $width = $self->width;
666             say {$out} $height . q{ } . $width;
667              
668             # setup id format (this will also affect block-like structure)
669             my $format = $short ? "%-10.10s %s\n" : "%s %s\n";
670             my $method = $short ? 'full_id' : 'foreign_id';
671             my $sep = $short ? q{ } : q{};
672              
673             # optionally disable wrapping
674             $chunk = $width if $chunk < 0;
675              
676             # output Ali in sequential or interleaved format
677             for (my $site = 0; $site < $width; $site += $chunk) {
678              
679             # leave empty line between chunks (but not after last chunk)
680             print {$out} "\n" if $site;
681              
682             for my $seq ($self->all_seqs) {
683              
684             # print 10-chars ids only once
685             # Note: full_ids are only truncated (use IdMapper for more)
686             my $id = $site == 0 ? $seq->$method : q{};
687              
688             # optionally clean seq
689             $seq = $seq->clone->gapify('X') if $clean;
690              
691             # print seq chunks in 10-state blocks
692             # Note: We insert a space in the 11th column to ensure that more
693             # software packages can read the written files. PHYLIP itself will
694             # ignore this spurious space in the 'sequences'.
695             ( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg;
696             printf {$out} $format, $id, join $sep,
697             map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1;
698             }
699             }
700              
701             return;
702             }
703              
704              
705             sub load_stockholm {
706 2     2 1 49 my $class = shift;
707 2         6 my $infile = shift;
708              
709 2         10 open my $in, '<', $infile;
710              
711 2         513 my $ali = $class->new( file => $infile );
712 2         21 tie my %seq_for, 'Tie::IxHash';
713              
714             LINE:
715 2         1337 while (my $line = <$in>) {
716 44         369 chomp $line;
717              
718             # process GF comments
719 44 100       124 next LINE if $ali->is_comment($line, $STK_COMMENT);
720              
721             # skip empty lines and other comment lines
722 27 100 100     279 next LINE if $line =~ $EMPTY_LINE
723             || $line =~ $COMMENT_LINE;
724 11 100       47 last LINE if $line =~ $END_LINE;
725              
726             # parse stockholm seq
727 9         51 my ($seq_id, $seq) = $line =~ $STK_SEQ;
728              
729             # replace dots and gaps by stars
730 9         34 $seq =~ s/[\.\-]/*/xmsg;
731              
732             # elongate seq
733 9         60 $seq_for{$seq_id} .= $seq;
734             }
735              
736             # populate Ali object
737 2         13 while (my ($seq_id, $seq) = each %seq_for) {
738 9         523 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
739 9         371 $ali->add_seq($new_seq);
740             }
741              
742 2         115 return $ali;
743             }
744              
745              
746             sub instant_store {
747 1     1 1 6 my $class = shift;
748 1         3 my $outfile = shift;
749 1   50     5 my $args = shift // {}; # HashRef (should not be empty...)
750              
751 1         7 my $infile = $args->{infile};
752 1 50       6 croak '[BMC] Error: no infile specified for instant_store; aborting!'
753             unless $infile;
754              
755 1         9 my $coderef = $args->{coderef};
756 1 50       5 croak '[BMC] Error: no coderef specified for instant_store; aborting!'
757             unless $coderef;
758              
759 1         7 open my $in, '<', $infile;
760 1         259 open my $out, '>', $outfile;
761              
762 1         293 my $seq_id;
763             my $seq;
764              
765             LINE:
766 1         968 while (my $line = <$in>) {
767 92         130 chomp $line;
768              
769             # skip empty lines and process comments
770 92 100 66     480 next LINE if $line =~ $EMPTY_LINE
771             || $line =~ $COMMENT_LINE;
772              
773             # at each '>' char...
774 90         220 my ($defline) = $line =~ $DEF_LINE;
775 90 100       164 if ($defline) {
776              
777             # process current seq (if any)
778 3 100       10 if ($seq) {
779 2         80 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
780 2         16 print {$out} $coderef->($new_seq);
  2         11  
781 2         79 $seq = q{};
782             }
783              
784 3         8 $seq_id = $defline;
785 3         16 next LINE;
786             }
787              
788             # elongate current seq (seqs can be broken on several lines)
789 87         253 $seq .= $line;
790             }
791              
792             # process last seq (if any)
793 1 50       17 if ($seq) {
794 1         45 my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
795 1         2 print {$out} $coderef->($new_seq);
  1         6  
796             }
797              
798 1         72 return;
799             }
800              
801              
802             sub instant_count {
803 1     1 1 408 my $class = shift;
804 1         3 my $infile = shift;
805              
806 1         2 my $seq_n = 0;
807              
808 1         7 open my $in, '<', $infile;
809 1 100       2867 while (<$in>) { $seq_n++ if m/^>/xms }
  502         2492  
810              
811 1         21 return $seq_n;
812             }
813              
814             __PACKAGE__->meta->make_immutable;
815             1;
816              
817             __END__
818              
819             =pod
820              
821             =head1 NAME
822              
823             Bio::MUST::Core::Ali - Multiple sequence alignment
824              
825             =head1 VERSION
826              
827             version 0.212650
828              
829             =head1 SYNOPSIS
830              
831             #!/usr/bin/env perl
832              
833             use Modern::Perl '2011';
834             # same as:
835             # use strict;
836             # use warnings;
837             # use feature qw(say);
838              
839             use Bio::MUST::Core;
840             use aliased 'Bio::MUST::Core::Ali';
841              
842             # read Ali form disk
843             my $ali = Ali->load('example.ali');
844              
845             # get some properties
846             say 'height: ' . $ali->height; # number of seqs
847             say 'width: ' . $ali->width; # number of sites
848             say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%)
849             say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl';
850              
851             # turn seqs to uppercase
852             $ali->uc_seqs;
853              
854             # filter out seqs with no species associated
855             my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } );
856             use aliased 'Bio::MUST::Core::IdList';
857             my $list = IdList->new( ids => \@seqs );
858             $ali->apply_list($list);
859              
860             # alternatively:
861             # $ali = Ali->new( seqs => \@seqs );
862              
863             # filter out gap-rich sites
864             $ali->idealize(0.2); # min 20% non-gaps per site
865              
866             # filter out non-informative sites
867             use aliased 'Bio::MUST::Core::SeqMask';
868             my $mask = SeqMask->parsimony_mask($ali);
869             $ali->apply_mask($mask);
870              
871             # write down reduced Ali to disk
872             $ali->store('example-uc-genus-sp-20.ali');
873             $ali->store_fasta('example-uc-genus-sp-20.fasta');
874              
875             # see below for additional methods
876             # ...
877              
878             =head1 DESCRIPTION
879              
880             This module implements the multiple sequence alignment (MSA) class and its
881             methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
882             Consequently, sequence ids do not absolutely need to be unique for it to
883             work (though id uniqueness helps a lot for sequence access and filtering).
884              
885             An Ali knows whether it contains nucleotide or protein sequences, whether
886             those are aligned or not, as well as its Seq count and exact width. All
887             these properties are introspected on the fly, which means that, while they
888             can be expensive to compute, they are always accurate.
889              
890             This is important because an Ali provides methods for inserting, deleting
891             and editing its Seq objects. Further, it does its best to maintain a semantic
892             distinction between true gap states (encoded as '*') and missing regions of
893             undetermined length (encoded as pure whitespace, for example at the end of a
894             short sequence).
895              
896             It also has methods for mapping its sequence ids (for example, before export
897             to the PHYLIP format), as well as methods to selectively retain sequences
898             and sites based on L<Bio::MUST::Core::IdList> and
899             L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
900             discards shared gaps and optionally removes gap-rich sites only due to a
901             tiny number of sequences.
902              
903             Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
904             meaningful whitespace and allows comment lines in the header) or be
905             imported/exported from/to several popular MSA formats, such as plain FASTA,
906             STOCKHOLM and PHYLIP.
907              
908             =head1 ATTRIBUTES
909              
910             =head2 seqs
911              
912             ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
913              
914             Most of the accessor methods described below are implemented by delegation
915             to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
916             delegation>. Their documentation thus heavily borrows from the corresponding
917             help pages.
918              
919             =head2 file
920              
921             L<Path::Class::File> object (optional)
922              
923             This optional attribute is initialized by class methods that C<load> an Ali
924             from disk. It is meant to improve the introspection capabilities of the Ali.
925             For now, this attribute is not used by the C<store> methods, though it might
926             provide them with a default value in the future.
927              
928             =head2 guessing
929              
930             Boolean (optional)
931              
932             By default, an Ali object tries to guess whether it is aligned or not by
933             looking for gap-like characters in any of its Seq objects (see
934             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
935              
936             When this smart behavior causes issues, one can disable it by unsetting this
937             boolean attribute (see C<dont_guess> and C<guess> accessor methods).
938              
939             =head2 comments
940              
941             ArrayRef of strings (optional)
942              
943             An Ali object is commentable, which means that it supports all the methods
944             pertaining to comment lines described in
945             L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
946              
947             =head1 CONSTRUCTORS
948              
949             =head2 new
950              
951             Default constructor (class method) returning a new Ali.
952              
953             use aliased 'Bio::MUST::Core::Ali';
954             my $ali1 = Ali->new();
955             my @seqs = $ali->all_seqs;
956             my $ali2 = Ali->new( seqs => \@seqs );
957              
958             This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
959             C<file>, C<guessing> and C<comments>.
960              
961             =head2 clone
962              
963             Creates a deep copy (a clone) of the Ali. Returns the copy.
964              
965             use aliased 'Bio::MUST::Core::Ali';
966             my $ali = Ali->load('input.ali');
967             my $ali_copy = $ali->clone;
968             # you can now mess with $ali_copy without affecting $ali
969              
970             This method does not accept any arguments.
971              
972             =head1 ACCESSORS
973              
974             =head2 add_seq
975              
976             Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
977             number of sequences of the Ali.
978              
979             use aliased 'Bio::MUST::Core::Seq';
980             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
981             $ali->add_seq($new_seq);
982              
983             This method accepts any number of arguments.
984              
985             =head2 get_seq
986              
987             Returns a sequence of the Ali by its index. You can also use negative index
988             numbers, just as with Perl's core array handling. If the specified sequence
989             does not exist, this method will return C<undef>.
990              
991             my $seq = $ali->get_seq($index);
992             croak "Seq $index not found in Ali!" unless defined $seq;
993              
994             This method accepts just one argument (and not an array slice).
995              
996             =head2 get_seq_with_id
997              
998             Returns a sequence of the Ali by its id. If the specified id is not unique,
999             only the first matching sequence will be returned, whereas if no sequence
1000             exists for the specified id, this method will return C<undef>.
1001              
1002             my $id = 'Pyrus malus_3750@658052655';
1003             my $seq = $ali->get_seq_with_id($id);
1004             croak "Seq $id not found in Ali!" unless defined $seq;
1005              
1006             This method accepts just one argument.
1007              
1008             =head2 set_seq
1009              
1010             Given an index and a sequence, sets the specified Ali element to the
1011             sequence. This method returns the new sequence at C<$index>.
1012              
1013             use aliased 'Bio::MUST::Core::Seq';
1014             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1015             $ali->set_seq($index, $new_seq);
1016              
1017             This method requires two arguments.
1018              
1019             =head2 delete_seq
1020              
1021             Removes the sequence at the given index from the Ali. This method returns
1022             the deleted sequence. If the specified sequence does not exist, it will
1023             return C<undef> instead.
1024              
1025             $ali->delete_seq($index);
1026              
1027             This method requires one argument.
1028              
1029             =head2 insert_seq
1030              
1031             Inserts a new sequence into the Ali at the given index. This method returns
1032             the new sequence at C<$index>.
1033              
1034             use aliased 'Bio::MUST::Core::Seq';
1035             my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1036             $ali->insert_seq($index, $new_seq);
1037              
1038             This method requires two arguments.
1039              
1040             =head2 all_seqs
1041              
1042             Returns all the sequences of the Ali (not an array reference).
1043              
1044             my @seqs = $ali->all_seqs;
1045              
1046             This method does not accept any arguments.
1047              
1048             =head2 first_seq
1049              
1050             Returns the first sequence of the Ali matching a given criterion, just like
1051             L<List::Util>'s C<first> function. This method requires a subroutine
1052             implementing the matching logic.
1053              
1054             # emulate get_seq_with_id method
1055             my $id2find = 'seq13';
1056             my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
1057              
1058             This method requires a single argument.
1059              
1060             =head2 filter_seqs
1061              
1062             Returns every sequence of the Ali matching a given criterion, just like
1063             Perl's core C<grep> function. This method requires a subroutine implementing
1064             the matching logic.
1065              
1066             # keep only long sequences (ignoring gaps and missing states)
1067             my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
1068              
1069             This method requires a single argument.
1070              
1071             =head2 all_new_seqs
1072              
1073             Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
1074              
1075             my @new_seqs = $ali->all_new_seqs;
1076              
1077             This method does not accept any arguments.
1078              
1079             =head2 all_but_new_seqs
1080              
1081             Returns all the sequences of the Ali except those tagged as #NEW# (not an array
1082             reference).
1083              
1084             my @preexisting_seqs = $ali->all_but_new_seqs;
1085              
1086             This method does not accept any arguments.
1087              
1088             =head2 all_seq_ids
1089              
1090             Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
1091             (not an array reference). This is only a convenience method.
1092              
1093             use Test::Deeply;
1094             my @ids1 = $ali->all_seq_ids;
1095             my @ids2 = map { $_->seq_id } $ali->all_seqs;
1096             is_deeply \@ids1, \@ids2, 'should be true';
1097             my @orgs = map { $_->org } @ids1;
1098              
1099             This method does not accept any arguments.
1100              
1101             =head2 filename
1102              
1103             Returns the stringified filename of the Ali.
1104              
1105             This method does not accept any arguments.
1106              
1107             =head2 guess
1108              
1109             Turn on the smart detection of gaps (see C<guessing> attribute above).
1110              
1111             This method does not accept any arguments.
1112              
1113             =head2 dont_guess
1114              
1115             Turn off the smart detection of gaps (see C<guessing> attribute above).
1116              
1117             use aliased 'Bio::MUST::Core::Ali';
1118             my $ali = Ali->load('ensembl.fasta');
1119             $ali->dont_guess;
1120              
1121             This method does not accept any arguments.
1122              
1123             =head1 PROPERTIES
1124              
1125             =head2 has_uniq_ids
1126              
1127             Returns true if all the sequence ids are unique.
1128              
1129             carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
1130              
1131             This method does not accept any arguments.
1132              
1133             =head2 is_protein
1134              
1135             Returns true if any sequence of the Ali looks like a protein. See
1136             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1137              
1138             say 'Your file includes nucleotide sequences' unless $ali->is_protein;
1139              
1140             This method does not accept any arguments.
1141              
1142             =head2 is_aligned
1143              
1144             Returns true if any sequence of the Ali appears to be aligned. See
1145             L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1146              
1147             If the boolean attribute guessing is not set, always returns false.
1148              
1149             carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
1150              
1151             This method does not accept any arguments.
1152              
1153             =head2 count_seqs
1154              
1155             Returns the number of sequences of the Ali. The alias method C<height> is
1156             provided for convenience.
1157              
1158             my $height = $ali->count_seqs;
1159              
1160             This method does not accept any arguments.
1161              
1162             =head2 width
1163              
1164             Returns the width of the Ali (in characters). If the Ali is not aligned,
1165             returns the length of the longest sequence instead.
1166              
1167             To avoid potential bugs due to caching, this method dynamically computes the
1168             Ali width at each call. Moreover, the Ali is always uniformized (see below)
1169             beforehands to ensure accurate width value. Therefore, this method is
1170             expensive and should not be called repeatedly (e.g., in a loop condition).
1171              
1172             # you'd better looping through sites like this...
1173             my $width = $ali->width;
1174             for my $site (0..$width-1) {
1175             ...
1176             }
1177              
1178             This method does not accept any arguments.
1179              
1180             =head2 seq_len_stats
1181              
1182             Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
1183             The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
1184              
1185             This method does not accept any arguments.
1186              
1187             =head2 perc_miss
1188              
1189             Returns the percentage of missing (and gap-like) character states in the Ali.
1190              
1191             As this method internally calls C<Ali::width>, the remarks above also apply.
1192              
1193             my $miss_level = $ali->perc_miss;
1194              
1195             This method does not accept any arguments.
1196              
1197             =head1 MUTATORS
1198              
1199             =head2 uc_seqs
1200              
1201             Turn all the sequences of the Ali to uppercase and returns it.
1202              
1203             This method does not accept any arguments.
1204              
1205             =head2 recode_seqs
1206              
1207             Recode all the sequences of the Ali and returns it.
1208              
1209             use aliased 'Bio::MUST::Core::Ali';
1210             my $ali = Ali->load('biased.ali');
1211              
1212             # set up RY recoding for suppressing codon bias
1213             my %base_for = (
1214             A => 'A', G => 'A', # purines
1215             C => 'C', T => 'C', # pyrimidines
1216             );
1217              
1218             my $ali_rec = $ali->recode_seqs( \%base_for );
1219             $ali_rec->store('biased_ry.ali');
1220              
1221             This method requires one argument.
1222              
1223             =head2 degap_seqs
1224              
1225             Remove the gaps in all the sequences of the Ali and returns it.
1226              
1227             This method does not accept any arguments.
1228              
1229             =head2 spacify_seqs
1230              
1231             Spacifies all the sequences of the Ali and returns it. See the corresponding
1232             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1233             operation.
1234              
1235             This method does not accept any arguments.
1236              
1237             =head2 gapify_seqs
1238              
1239             Gapifies all the sequences of the Ali and returns it. See the corresponding
1240             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1241             operation.
1242              
1243             This method accepts an optional argument.
1244              
1245             =head2 trim_seqs
1246              
1247             Trims all the sequences of the Ali and returns it. See the corresponding
1248             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1249             operation.
1250              
1251             This method does not accept any arguments.
1252              
1253             =head2 pad_seqs
1254              
1255             Pads all the sequences of the Ali and returns it. See the corresponding
1256             method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1257             operation.
1258              
1259             This method does not accept any arguments.
1260              
1261             =head2 uniformize
1262              
1263             Performs the three gap-cleaning operations in turn on all the sequences of
1264             the Ali and returns it, which ensures that it is semantically clean and
1265             rectangular.
1266              
1267             This is only a convenience method called internally by the Ali object before
1268             selected methods (such as storage-like methods). However, it might prove
1269             useful in some circumstances, hence it is not defined as private.
1270              
1271             use aliased 'Bio::MUST::Core::Ali';
1272             my $ali = Ali->load('input.ali');
1273             $ali->add_seq(...);
1274             # more editing of the Ali sequences
1275             $ali->uniformize;
1276              
1277             This method does not accept any arguments.
1278              
1279             =head2 clear_new_tags
1280              
1281             Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it.
1282              
1283             use aliased 'Bio::MUST::Core::Ali';
1284             my $ali = Ali->load('input-42.ali');
1285             $ali->clear_new_tags;
1286             my @new_seqs = $ali->all_new_seqs;
1287             # array should be empty
1288              
1289             This method does not accept any arguments.
1290              
1291             =head2 shorten_ids
1292              
1293             Replaces all the sequence ids of the Ali by their abbreviated forms as
1294             specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1295              
1296             Note that this method will work only if the sequence ids have not been
1297             already shortened or modified in any way since the creation of the IdMapper.
1298             Long ids without abbreviated forms in the IdMapper are left untouched.
1299              
1300             use aliased 'Bio::MUST::Core::Ali';
1301             use aliased 'Bio::MUST::Core::IdMapper';
1302             my $ali = Ali->load('input.ali');
1303             my $mapper = IdMapper->std_mapper($ali, 'lcl|seq');
1304             $ali->shorten_ids($mapper);
1305             $ali->store_fasta('input.4blast.fasta');
1306             # makeblastdb
1307              
1308             # Note: the temp_fasta method does exactly that
1309              
1310             This method requires one argument.
1311              
1312             =head2 restore_ids
1313              
1314             Replaces all the sequence ids of the Ali by their long forms as specified by
1315             the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1316              
1317             Note that this method will work only if the sequence ids have been previously
1318             abbreviated (see above) and have not been modified in any way since then.
1319             Again, abbreviated ids without long forms in the IdMapper are left untouched.
1320              
1321             use aliased 'Bio::MUST::Core::IdMapper';
1322             my $mapper = IdMapper->gi_mapper($ali);
1323             $ali->shorten_ids($mapper);
1324             ...
1325             $ali->restore_ids($mapper);
1326              
1327             This method requires one argument.
1328              
1329             =head2 apply_list
1330              
1331             Selectively retains or discards sequences from the Ali based on the content
1332             of the passed L<Bio::MUST::Core::IdList> object and returns the Ali.
1333              
1334             use aliased 'Bio::MUST::Core::IdList';
1335             my $list = IdList->load('interesting_seqs.idl');
1336             $ali->apply_list($list); # discard non-interesting seqs
1337              
1338             This method requires one argument.
1339              
1340             =head2 apply_mask
1341              
1342             Selectively retains or discards sites from the Ali based on the content of
1343             the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali.
1344              
1345             use aliased 'Bio::MUST::Core::SeqMask';
1346             my $variable_sites = SeqMask->variable_mask($ali);
1347             $ali->apply_mask($variable_sites); # discard constant sites
1348              
1349             This method requires one argument.
1350              
1351             =head2 idealize
1352              
1353             Computes and applies an ideal sequence mask to the Ali and returns it. This
1354             is only a convenience method.
1355              
1356             When invoked without arguments, it will discard the gaps that are
1357             universally shared by all the sequences. Otherwise, the provided argument
1358             corresponds to the threshold of the C<ideal_mask> method described in
1359             L<Bio::MUST::Core::SeqMask>.
1360              
1361             use aliased 'Bio::MUST::Core::IdList';
1362             my $fast_seqs = IdList->load('fast_evolving_seqs.idl');
1363             my $seqs2keep = $fast_seqs->negative_list($ali);
1364             $ali->apply_list($seqs2keep); # discard fast-evolving seqs
1365             $ali->idealize; # discard newly shared gaps caused by fast seqs
1366              
1367             use aliased 'Bio::MUST::Core::Ali';
1368             my $ali = Ali->load('hmm_based.ali');
1369             $ali->idealize(0.05); # discard insertions due to <5% of the seqs
1370              
1371             This method accepts an optional argument.
1372              
1373             =head1 MISC METHODS
1374              
1375             =head2 gapmiss_regex
1376              
1377             Returns a regular expression matching gaps and ambiguous or missing states.
1378             The exact regex returned depends on the type of sequences in the Ali (nucl. or
1379             proteins).
1380              
1381             my $regex = $ali->gapmiss_regex;
1382             my $first_seq = $ali->get_seq(0)->seq;
1383             my $gapmiss_n = $first_seq =~ m/($regex)/xmsg;
1384             say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites";
1385              
1386             This method does not accept any arguments.
1387              
1388             =head2 map_coords
1389              
1390             Converts a set of site positions from Ali coordinates to coordinates of the
1391             specified sequence (thereby ignoring positions due to gaps). Returns the
1392             converted sites in sequence coordinates as an array refrence.
1393              
1394             use aliased 'Bio::MUST::Core::Ali';
1395             my $ali = Ali->load('input.ali');
1396             my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125';
1397             my $ali_coords = [ 4, 25, 73, 89, 104, 116 ];
1398             my $seq_coords = $ali->map_coords($id, $ali_coords);
1399             # $seq_coords is [ 3, 23, 59, 71, 71, 74 ]
1400              
1401             This method requires two arguments: the id of a sequence and an array
1402             reference of input sites in Ali coordinates.
1403              
1404             =head1 I/O METHODS
1405              
1406             =head2 load
1407              
1408             Class method (constructor) returning a new Ali read from disk. This method
1409             will transparently import plain FASTA files in addition to the MUST
1410             pseudo-FASTA format (ALI files).
1411              
1412             use Test::Deeply;
1413             use aliased 'Bio::MUST::Core::Ali';
1414             my $ali1 = Ali->load('example.ali');
1415             my $ali2 = Ali->load('example.fasta');
1416             my @seqs1 = $ali1->all_seqs;
1417             my @seqs2 = $ali2->all_seqs;
1418             is_deeply, \@seqs1, \@seqs2, 'should be true';
1419              
1420             This method requires one argument.
1421              
1422             =head2 store
1423              
1424             Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files).
1425              
1426             Note that the ALI format is only used when the suffix of the outfile name is
1427             '.ali'. In all other cases (including lack of suffix), this method
1428             automatically forwards the call to C<store_fasta>.
1429              
1430             $ali->store('output.ali');
1431             # output.ali is written in ALI format
1432             $ali->store('output.fasta');
1433             # output.fasta is written in FASTA format
1434              
1435             This method requires one argument (but see C<store_fasta> in case of automatic
1436             forwarding of the method call).
1437              
1438             =head2 store_fasta
1439              
1440             Writes the Ali to disk in the plain FASTA format.
1441              
1442             For compatibility purposes, this method automatically fetches sequence ids
1443             using the C<foreign_id> method instead of the native C<full_id> method, both
1444             described in L<Bio::MUST::Core::SeqId>.
1445              
1446             $ali->store_fasta( 'output.fasta' );
1447             $ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} );
1448              
1449             This method requires one argument and accepts a second optional argument
1450             controlling the output format. It is a hash reference that may contain one
1451             or more of the following keys:
1452              
1453             - clean: replace all ambiguous and missing states by C<X> (default: false)
1454             - degap: boolean value controlling degapping (default: false)
1455             - chunk: line width (default is 60 chars; negative values means no wrap)
1456              
1457             Finally, it is possible to fine-tune the behavior of the C<clean> option by
1458             providing another character than C<X> through the C<gapify> key. This can be
1459             useful to replace all ambiguous and missing states by gaps, as shown below:
1460              
1461             $ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } );
1462              
1463             =head2 temp_fasta
1464              
1465             Writes a temporary copy of the Ali to disk in the plain FASTA format using
1466             numeric sequence ids and returns the name of the temporary file. This is
1467             only a convenience method.
1468              
1469             In list context, returns the IdMapper object along with temporary filename.
1470              
1471             my $infile = $ali->temp_fasta( { degap => 1 } );
1472             my $output = `script.sh $infile`;
1473             ...
1474              
1475             This method accepts the same optional argument hash as C<store_fasta>.
1476             However, an additional option (C<id_prefix>) is available to control the way
1477             abbreviated sequence ids are prefixed by the C<std_mapper> method (see
1478             L<Bio::MUST::Core::Listable>).
1479              
1480             my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } );
1481             my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } );
1482              
1483             =head2 load_phylip
1484              
1485             Class method (constructor) returning a new Ali read from a file in the
1486             PHYLIP format. Both sequential and interleaved formats are supported.
1487              
1488             The only constraint is that sequence ids MUST NOT contain whitespace and be
1489             followed by at least one whitespace character. This means that some old-school
1490             PHYLIP files not following this convention will not be processed correctly.
1491              
1492             When using the interleaved format, sequence ids may or may not be repeated in
1493             each block.
1494              
1495             use aliased 'Bio::MUST::Core::Ali';
1496             my $ali = Ali->load_phylip('phylip.phy');
1497             say $ali->count_seqs;
1498             say $ali->width;
1499              
1500             # outputs:
1501             # 10
1502             # 709
1503              
1504             This method requires one argument.
1505              
1506             =head2 store_phylip
1507              
1508             Writes the Ali to disk in the interleaved (or sequential) PHYLIP format.
1509              
1510             To ensure maximal flexibility, this method fetches sequence ids using the
1511             native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but
1512             truncates them to 10 characters, as expected by the original PHYLIP software
1513             package. No other tinkering is carried out on the ids. Thus, if the ids
1514             contain whitespace or are not unique in their 10 first characters, it is
1515             advised to first map them using one of the constructors in
1516             L<Bio::MUST::Core::IdMapper>.
1517              
1518             use aliased 'Bio::MUST::Core::Ali';
1519             use aliased 'Bio::MUST::Core::IdMapper';
1520             my $ali = Ali->load('input.ali');
1521             my $mapper = IdMapper->std_mapper($ali);
1522             $ali->shorten_ids($mapper);
1523             $ali->store_phylip( 'input.phy', { chunk => 50 } );
1524              
1525             This method requires one argument and accepts a second optional argument
1526             controlling the output format. It is a hash reference that may contain one
1527             or more of the following keys:
1528              
1529             - short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes)
1530             - clean: replace all ambiguous and missing states by 'X' (default: false)
1531             - chunk: line width (default: 60 chars; negative values means no wrap)
1532              
1533             To store the Ali in PHYLIP sequential format, specify a negative chunk (-1).
1534              
1535             =head2 load_stockholm
1536              
1537             Class method (constructor) returning a new Ali read from a file in the
1538             STOCKHOLM format. =GF comments are retained (see above) but not the other
1539             comment classes (=GS, =GR and =GC).
1540              
1541             use aliased 'Bio::MUST::Core::Ali';
1542             my $ali = Ali->load('upsk.stockholm');
1543             say $ali->header;
1544              
1545             # outputs:
1546             # ID UPSK
1547             # SE Predicted; Infernal
1548             # SS Published; PMID 9223489
1549             # RN [1]
1550             # RM 9223489
1551             # RT The role of the pseudoknot at the 3' end of turnip yellow mosaic
1552             # RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA
1553             # RT polymerase.
1554             # RA Deiman BA, Kortlever RM, Pleij CW;
1555             # RL J Virol 1997;71:5990-5996.
1556              
1557             This method requires one argument.
1558              
1559             =head2 instant_store
1560              
1561             Class method intended to transform a large sequence file read from disk
1562             without loading it in memory. This method will transparently process plain
1563             FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1564              
1565             my $chunk = 200;
1566              
1567             my $split = sub {
1568             my $seq = shift;
1569             my $base_id = ( split /\s+/xms, $seq->full_id )[0];
1570             my $max_pos = $seq->seq_len - $chunk;
1571             my $n = 0;
1572             my $out_str;
1573             for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) {
1574             $out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos,
1575             $pos + $chunk <= $max_pos ? $chunk : 2 * $chunk
1576             ) . "\n";
1577             }
1578             return $out_str;
1579             };
1580              
1581             use aliased 'Bio::MUST::Core::Ali';
1582              
1583             Ali->instant_store(
1584             'outfile.fasta', { infile => 'infile.fasta', coderef => $split }
1585             );
1586              
1587             This method requires two arguments. The sercond is a hash reference that must
1588             contain the following keys:
1589             - infile: input sequence file
1590             - coderef: subroutine implementing the transforming logic
1591              
1592             =head2 instant_count
1593              
1594             Class method returning the number of seqs in any sequence file read from disk
1595             without loading it in memory. This method will transparently process plain
1596             FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1597              
1598             use aliased 'Bio::MUST::Core::Ali';
1599             my $seq_n = Ali->instant_count('input.ali');
1600             say $seq_n;
1601              
1602             =head1 ALIASES
1603              
1604             =head2 height
1605              
1606             Alias for C<count_seqs> method. For API consistency.
1607              
1608             =head1 AUTHOR
1609              
1610             Denis BAURAIN <denis.baurain@uliege.be>
1611              
1612             =head1 CONTRIBUTORS
1613              
1614             =for stopwords Catherine COLSON Arnaud DI FRANCO
1615              
1616             =over 4
1617              
1618             =item *
1619              
1620             Catherine COLSON <ccolson@doct.uliege.be>
1621              
1622             =item *
1623              
1624             Arnaud DI FRANCO <arnaud.difranco@gmail.com>
1625              
1626             =back
1627              
1628             =head1 COPYRIGHT AND LICENSE
1629              
1630             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
1631              
1632             This is free software; you can redistribute it and/or modify it under
1633             the same terms as the Perl 5 programming language system itself.
1634              
1635             =cut