File Coverage

blib/lib/Bio/FastParsers/Uclust.pm
Criterion Covered Total %
statement 26 26 100.0
branch 4 4 100.0
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 35 36 97.2


line stmt bran cond sub pod time code
1             package Bio::FastParsers::Uclust;
2             # ABSTRACT: Front-end class for UCLUST parser
3             # CONTRIBUTOR: Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
4             $Bio::FastParsers::Uclust::VERSION = '0.201110';
5 7     7   58 use Moose;
  7         21  
  7         67  
6 7     7   50618 use namespace::autoclean;
  7         22  
  7         77  
7              
8 7     7   697 use autodie;
  7         22  
  7         67  
9              
10 7     7   39249 use Tie::IxHash;
  7         21  
  7         2456  
11              
12             extends 'Bio::FastParsers::Base';
13              
14              
15             # public attributes (inherited)
16              
17              
18             with 'Bio::FastParsers::Roles::Clusterable';
19              
20              
# Moose construction hook: parses the UCLUST (.uc) report designated by
# $self->filename and records, for each representative (centroid) sequence
# id, the ids of the member sequences of its cluster. The resulting
# insertion-ordered hash is stored through $self->_set_members_for.
#
# Takes no argument besides the invocant and returns nothing.
# Dies (through autodie) if the report cannot be opened or closed.
sub BUILD {
    my $self = shift;

    my $infile = $self->filename;
    open my $in, '<', $infile;

    # Tie::IxHash keeps keys in insertion order, so representatives come
    # back in the order in which they are first met in the report
    tie my %members_for, 'Tie::IxHash';

    LINE:
    while (my $line = <$in>) {
        chomp $line;
        my ($type, @fields) = split /\t/xms, $line;

        # a blank line (or a line made only of tabs) splits into an empty
        # list: skip it instead of comparing an undefined record type below
        next LINE unless defined $type;

        # https://www.drive5.com/usearch/manual/opt_uc.html
        # Field Description
        # -     Record type S, H, C or N (see table below).
        # 0     Cluster number (0-based).
        # 1     Sequence length (S, N and H) or cluster size (C).
        # 2     For H records, percent identity with target.
        # 3     For H records, the strand: + or - for nucleotides, . for proteins.
        # 4     Not used, parsers should ignore this field. Included for backwards compatibility.
        # 5     Not used, parsers should ignore this field. Included for backwards compatibility.
        # 6     Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
        # 7     Label of query sequence (always present).
        # 8     Label of target sequence (H records only).

        if ($type eq 'C') {
            # pushing an empty list only autovivifies the member array, so
            # that a cluster without any H record is still registered
            push @{ $members_for{ $fields[7] } }, ();
        }
        elsif ($type eq 'H') {
            # hit record: the query is a member of the target's cluster
            push @{ $members_for{ $fields[8] } }, $fields[7];
        }
        # records of any other type are not used to build the clusters
    }

    # release the handle as soon as parsing is over (autodie checks it)
    close $in;

    # store representative and member sequence ids
    $self->_set_members_for( \%members_for );

    return;
}
60              
61             __PACKAGE__->meta->make_immutable;
62             1;
63              
64             __END__
65              
66             =pod
67              
68             =head1 NAME
69              
70             Bio::FastParsers::Uclust - Front-end class for UCLUST parser
71              
72             =head1 VERSION
73              
74             version 0.201110
75              
76             =head1 SYNOPSIS
77              
78             use aliased 'Bio::FastParsers::Uclust';
79              
80             # open and parse UCLUST report
81             my $infile = 'test/uclust.uc';
82             my $report = Uclust->new( file => $infile );
83              
84             # loop through representatives to get members
85             for my $repr ( $report->all_representatives ) {
86             my $members = $report->members_for($repr);
87             # ...
88             }
89              
90             # get representatives ordered by descending cluster size
91             my @reprs = $report->all_representatives_by_cluster_size;
92              
93             # create IdMapper
94             # Note: this requires Bio::MUST::Core
95             my $mapper = $report->clust_mapper(':');
96             my @long_ids = $mapper->all_long_ids;
97              
98             # ...
99              
100             =head1 DESCRIPTION
101              
102             This module implements a parser for the output file of the UCLUST program. It
103             provides methods for getting the ids of the representative sequences (either
104             sorted by descending cluster size or not) and for obtaining the members of any
105             cluster from the id of its representative.
106              
107             It also has a method for facilitating the re-mapping of all the ids of every
108             cluster on a phylogenetic tree through a L<Bio::MUST::Core::IdMapper> object.
109              
110             =head1 ATTRIBUTES
111              
112             =head2 file
113              
114             Path to UCLUST report file to be parsed
115              
116             =head1 METHODS
117              
118             =head2 all_representatives
119              
120             Returns all the ids of the representative sequences of the clusters (not an
121             array reference).
122              
123             # $report is a Bio::FastParsers::Uclust
124             for my $repr ( $report->all_representatives ) {
125             # process $repr
126             # ...
127             }
128              
129             This method does not accept any arguments.
130              
131             =head2 all_representatives_by_cluster_size
132              
133             Returns all the ids of the representative sequences of the clusters (not an
134             array reference) sorted by descending cluster size (and then lexically by id).
135              
136             # $report is a Bio::FastParsers::Uclust
137             for my $repr ( $report->all_representatives_by_cluster_size ) {
138             # process $repr
139             # ...
140             }
141              
142             This method does not accept any arguments.
143              
144             =head2 members_for
145              
146             Returns all the ids of the member sequences of the cluster corresponding to
147             the id of the specified representative (as an array reference).
148              
149             # $report is a Bio::FastParsers::Uclust
150             for my $repr ( $report->all_representatives ) {
151             my $members = $report->members_for($repr);
152             # process $members ArrayRef
153             # ...
154             }
155              
156             This method requires one argument: the id of the representative.
157              
158             =head2 clust_mapper
159              
160             Returns a L<Bio::MUST::Core::IdMapper> object associating representative
161             sequence ids to stringified full lists of their member sequence ids (including
162             the representatives themselves).
163              
164             This method needs L<Bio::MUST::Core> to be installed on the computer.
165              
166             # $report is a Bio::FastParsers::Uclust
167             my $mapper = $report->clust_mapper(':');
168              
169             The native methods from L<Bio::MUST::Core::IdMapper> can be applied on
170             C<$mapper>, e.g., C<all_long_ids> or C<long_id_for>.
171              
172             This method accepts an optional argument: the id separator (default: C</>).
173              
174             =head1 AUTHOR
175              
176             Denis BAURAIN <denis.baurain@uliege.be>
177              
178             =head1 CONTRIBUTOR
179              
180             =for stopwords Amandine BERTRAND
181              
182             Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
183              
184             =head1 COPYRIGHT AND LICENSE
185              
186             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
187              
188             This is free software; you can redistribute it and/or modify it under
189             the same terms as the Perl 5 programming language system itself.
190              
191             =cut