File Coverage

blib/lib/Bio/MUST/Core/Ali/Stash.pm
Criterion Covered Total %
statement 43 43 100.0
branch 4 4 100.0
condition 2 2 100.0
subroutine 12 12 100.0
pod 2 2 100.0
total 63 63 100.0


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::Ali::Stash;
2             # ABSTRACT: Thin wrapper for an indexed Ali read from disk
3             $Bio::MUST::Core::Ali::Stash::VERSION = '0.212670';
4 17     17   147 use Moose;
  17         39  
  17         142  
5 17     17   90921 use namespace::autoclean;
  17         46  
  17         190  
6              
7 17     17   1878 use autodie;
  17         52  
  17         168  
8 17     17   93838 use feature qw(say);
  17         53  
  17         1726  
9              
10 17     17   122 use Smart::Comments;
  17         42  
  17         195  
11              
12 17     17   16045 use Carp;
  17         44  
  17         1287  
13              
14 17     17   110 use Bio::MUST::Core::Types;
  17         43  
  17         444  
15 17     17   103 use Bio::MUST::Core::Constants qw(:seqids);
  17         43  
  17         3155  
16 17     17   129 use aliased 'Bio::MUST::Core::Ali';
  17         42  
  17         146  
17              
18             # ATTRIBUTES
19              
20              
21             has 'seqs' => (
               # Wrapped Ali object; read-only and mandatory at construction time.
               # The methods listed under 'handles' are delegated to it.
22             is => 'ro',
23             isa => 'Bio::MUST::Core::Ali',
24             required => 1,
25             handles => [
26             qw(count_comments all_comments get_comment
27             guessing all_seq_ids has_uniq_ids is_protein is_aligned
28             get_seq first_seq all_seqs filter_seqs count_seqs
29             gapmiss_regex
30             )
31             ], # comment-related methods needed by IdList
32             );
33              
34              
35             has 'lookup' => (
               # IdList built lazily (on first access) by _build_lookup.
               # init_arg => undef means it cannot be passed to the constructor.
               # Only index_for is delegated to it.
36             is => 'ro',
37             isa => 'Bio::MUST::Core::IdList',
38             init_arg => undef,
39             lazy => 1,
40             builder => '_build_lookup',
41             handles => [ qw(index_for) ],
42             );
43              
44             with 'Bio::MUST::Core::Roles::Aliable';
45              
46             ## no critic (ProhibitUnusedPrivateSubroutines)
47              
48             sub _build_lookup {
               # Builder for the lazy 'lookup' attribute: returns whatever the
               # new_lookup method of the wrapped 'seqs' object produces.
49 2     2   62 return shift->seqs->new_lookup;
50             }
51              
52             ## use critic
53              
54             # ACCESSORS
55              
56              
57             sub get_seq_with_id {
               # Return the seq stored under $id, using the 'lookup' index
               # (index_for) to find its position and get_seq to fetch it.
               # Takes a single id; warns and returns nothing if the id is unknown.
58 3     3 1 13 my $self = shift;
59 3         9 my $id = shift;
60              
61             # override Ali method with faster lookup-based alternative
62 3         15 my $index = $self->index_for($id);
63 3 100       18 return $self->get_seq($index)
64             if defined $index;
65              
               # unknown id: warn via carp (reported from the caller's perspective);
               # the bare return gives undef in scalar context, () in list context
66 1         24 carp "[BMC] Warning: cannot find seq with id: $id; returning undef!";
67 1         486 return;
68             }
69              
70             # I/O methods
71              
72              
73             sub load {
               # Class-method constructor: reads $infile through Ali->load, calls
               # dont_guess on the result and wraps it in a new object of $class.
               # Args: $infile (path), then an optional hash ref of options; only
               # the 'truncate_ids' key is read here.
74 2     2 1 574 my $class = shift;
75 2         5 my $infile = shift;
76 2   100     11 my $args = shift // {}; # optional HashRef of options (defaults to {})
77              
78 2         13 my $seqs = Ali->load($infile);
79 2         87 $seqs->dont_guess;
80              
               # optionally shorten the ids before the object is built
               # NOTE(review): $DEF_ID is imported from Constants (:seqids);
               # presumably it captures the first word of an id -- confirm there
81 2 100       8 if ( $args->{truncate_ids} ) {
82 1         10 my $mapper = $seqs->regex_mapper( q{}, $DEF_ID );
83 1         7 $seqs->shorten_ids($mapper);
84             }
85              
86 2         87 return $class->new(seqs => $seqs);
87             }
88              
89             __PACKAGE__->meta->make_immutable;
90             1;
91              
92             __END__
93              
94             =pod
95              
96             =head1 NAME
97              
98             Bio::MUST::Core::Ali::Stash - Thin wrapper for an indexed Ali read from disk
99              
100             =head1 VERSION
101              
102             version 0.212670
103              
104             =head1 SYNOPSIS
105              
106             #!/usr/bin/env perl
107              
108             use Modern::Perl '2011';
109             # same as:
110             # use strict;
111             # use warnings;
112             # use feature qw(say);
113              
114             use Bio::MUST::Core;
115             use aliased 'Bio::MUST::Core::Ali::Stash';
116             use aliased 'Bio::MUST::Core::IdList';
117              
118             # load database
119             my $db = Stash->load('database.fasta');
120              
121             # process OrthoFinder-like output file
122             # where each line defines a cluster followed by its member sequences
123             # cluster1: seq3 seq7 seq2
124             # cluster2: seq1 seq4 seq6 seq5
125             # ...
126              
127             open my $in, '<', 'clusters.txt';
128             while (my $line = <$in>) {
129             chomp $line;
130              
131             # extract member id list for current cluster
132             my ($cluster, @ids) = split /\s+/xms, $line;
133             $cluster =~ s/:\z//xms; # remove trailing colon (:)
134             my $list = IdList->new( ids => \@ids );
135              
136             # assemble Ali and store it as FASTA file
137             my $ali = $list->reordered_ali($db);
138             $ali->dont_guess;
139             $ali->store( $cluster . '.fasta' );
140             }
141              
142             =head1 DESCRIPTION
143              
144             This module implements a class representing a sequence database where ids are
145             indexed for faster access. To this end, it combines an internal
146             L<Bio::MUST::Core::Ali> object and a L<Bio::MUST::Core::IdList> object.
147              
148             An Ali::Stash is meant to be built from an existing ALI (or FASTA) file
149             residing on disk and cannot be altered once loaded. Its sequences are supposed
150             not to be aligned but aligned FASTA files are also processed correctly. By
151             default, the full-length sequence ids are indexed. If the first word of each
152             id (non-whitespace containing string or accession) is unique across the
153             database, it can be used instead via the option C<< truncate_ids => 1 >> of the
154             C<load> method (see the I/O METHODS section for an example).
155              
156             While this class is more efficient than the standard C<Ali>, it is way slower
157             at reading large sequence databases than specialized external programs such as
158             NCBI C<blastdbcmd> working on indexed binary files. Thus, if you need more
159             performance, have a look at the C<Blast::Database> class from the
160             L<Bio::MUST::Drivers> distribution.
161              
162             =head1 ATTRIBUTES
163              
164             =head2 seqs
165              
166             L<Bio::MUST::Core::Ali> object (required)
167              
168             This required attribute contains the L<Bio::MUST::Core::Seq> objects that
169             populate the associated sequence database file. It should be initialized
170             through the class method C<load> (see the SYNOPSIS for an example).
171              
172             For now, it provides the following methods: C<count_comments>,
173             C<all_comments>, C<get_comment>, C<guessing>, C<all_seq_ids>, C<has_uniq_ids>,
174             C<is_protein>, C<is_aligned>, C<get_seq>, C<get_seq_with_id> (see below),
175             C<first_seq>, C<all_seqs>, C<filter_seqs>, C<count_seqs> and
176             C<gapmiss_regex> (see L<Bio::MUST::Core::Ali>).
177              
178             =head2 lookup
179              
180             L<Bio::MUST::Core::IdList> object (auto)
181              
182             This attribute is automatically initialized with the list indexing the
183             sequence ids of the internal C<Ali> object. Thus, it cannot be user-specified.
184              
185             It provides the following method: C<index_for> (see
186             L<Bio::MUST::Core::IdList>). Yet, it is nearly a private method. Instead,
187             individual sequences should be accessed through the C<get_seq_with_id> method
188             (see below), while sequence batches should be recovered via user-specified
189             IdList objects (see the SYNOPSIS for an example).
190              
191             =head1 ACCESSORS
192              
193             =head2 get_seq_with_id
194              
195             Returns a sequence of the Ali::Stash by its id. Note that sequence ids are
196             assumed to be unique in the corresponding database. If no sequence exists for
197             the specified id, this method will emit a warning and return C<undef>.
198              
199             my $id = 'Pyrus malus_3750@658052655';
200             my $seq = $db->get_seq_with_id($id);
201             croak "Seq $id not found in Ali::Stash!" unless defined $seq;
202              
203             This method accepts just one argument (and not an array slice).
204              
205             It is a faster implementation of the same method from the C<Ali> class.
206              
207             =head1 I/O METHODS
208              
209             =head2 load
210              
211             Class method (constructor) returning a new Ali::Stash read from disk. As in
212             C<Ali>, this method will transparently import plain FASTA files in addition to
213             the MUST pseudo-FASTA format (ALI files).
214              
215             # load database
216             my $db = Stash->load( 'database.fasta' );
217              
218             # alternatively... (indexing only accessions)
219             my $db = Stash->load( 'database.fasta', { truncate_ids => 1 } );
220              
221             This method requires one argument and accepts a second optional argument
222             controlling the way sequence ids are processed. It is a hash reference that
223             may only contain the following key:
224              
225             - truncate_ids: consider only the first id word (accession)
226              
227             =head1 AUTHOR
228              
229             Denis BAURAIN <denis.baurain@uliege.be>
230              
231             =head1 COPYRIGHT AND LICENSE
232              
233             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
234              
235             This is free software; you can redistribute it and/or modify it under
236             the same terms as the Perl 5 programming language system itself.
237              
238             =cut