File Coverage

blib/lib/Bio/FastParsers/Uclust.pm
Criterion Covered Total %
statement 26 26 100.0
branch 4 4 100.0
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 35 36 97.2


line stmt bran cond sub pod time code
1             # ABSTRACT: Front-end class for UCLUST parser
2             # CONTRIBUTOR: Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
3             $Bio::FastParsers::Uclust::VERSION = '0.221230';
4             use Moose;
5 7     7   54 use namespace::autoclean;
  7         16  
  7         51  
6 7     7   42325  
  7         18  
  7         81  
7             use autodie;
8 7     7   689  
  7         18  
  7         1160  
9             use Tie::IxHash;
10 7     7   33039  
  7         17  
  7         2060  
11             extends 'Bio::FastParsers::Base';
12              
13              
14             # public attributes (inherited)
15              
16              
17             with 'Bio::FastParsers::Roles::Clusterable';
18              
19              
20             my $self = shift;
21              
22 1     1 0 3 my $infile = $self->filename;
23             open my $in, '<', $infile;
24 1         10  
25 1         41 tie my %members_for, 'Tie::IxHash';
26              
27 1         1863 LINE:
28             while (my $line = <$in>) {
29             chomp $line;
30 1         54 my ($type, @fields) = split /\t/xms, $line;
31 73         585  
32 73         186 # https://www.drive5.com/usearch/manual/opt_uc.html
33             # Field Description
34             # - Record type S, H, C or N (see table below).
35             # 0 Cluster number (0-based).
36             # 1 Sequence length (S, N and H) or cluster size (C).
37             # 2 For H records, percent identity with target.
38             # 3 For H records, the strand: + or - for nucleotides, . for proteins.
39             # 4 Not used, parsers should ignore this field. Included for backwards compatibility.
40             # 5 Not used, parsers should ignore this field. Included for backwards compatibility.
41             # 6 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 8 in this 0-based numbering).
42             # 7 Label of query sequence (always present).
43             # 8 Label of target sequence (H records only).
44              
45             if ($type eq 'C') {
46             push @{ $members_for{ $fields[7] } }, ();
47 73 100       157 }
    100          
48 22         25 elsif ($type eq 'H') {
  22         50  
49             push @{ $members_for{ $fields[8] } }, $fields[7];
50             }
51 29         34 }
  29         73  
52              
53             # store representative and member sequence ids
54             $self->_set_members_for( \%members_for );
55              
56 1         70 return;
57             }
58 1         44  
59             __PACKAGE__->meta->make_immutable;
60             1;
61              
62              
63             =pod
64              
65             =head1 NAME
66              
67             Bio::FastParsers::Uclust - Front-end class for UCLUST parser
68              
69             =head1 VERSION
70              
71             version 0.221230
72              
73             =head1 SYNOPSIS
74              
75             use aliased 'Bio::FastParsers::Uclust';
76              
77             # open and parse UCLUST report
78             my $infile = 'test/uclust.uc';
79             my $report = Uclust->new( file => $infile );
80              
81             # loop through representatives to get members
82             for my $repr ( $report->all_representatives ) {
83             my $members = $report->members_for($repr);
84             # ...
85             }
86              
87             # get representatives ordered by descending cluster size
88             my @reprs = $report->all_representatives_by_cluster_size;
89              
90             # create IdMapper
91             # Note: this requires Bio::MUST::Core
92             my $mapper = $report->clust_mapper(':');
93             my @long_ids = $mapper->all_long_ids;
94              
95             # ...
96              
97             =head1 DESCRIPTION
98              
99             This module implements a parser for the output file of the UCLUST program. It
100             provides methods for getting the ids of the representative sequences (either
101             sorted by descending cluster size or not) and for obtaining the members of any
102             cluster from the id of its representative.
103              
104             It also has a method for facilitating the re-mapping of all the ids of every
105             cluster on a phylogenetic tree through a L<Bio::MUST::Core::IdMapper> object.
106              
107             =head1 ATTRIBUTES
108              
109             =head2 file
110              
111             Path to UCLUST report file to be parsed
112              
113             =head1 METHODS
114              
115             =head2 all_representatives
116              
117             Returns all the ids of the representative sequences of the clusters (not an
118             array reference).
119              
120             # $report is a Bio::FastParsers::Uclust
121             for my $repr ( $report->all_representatives ) {
122             # process $repr
123             # ...
124             }
125              
126             This method does not accept any arguments.
127              
128             =head2 all_representatives_by_cluster_size
129              
130             Returns all the ids of the representative sequences of the clusters (not an
131             array reference) sorted by descending cluster size (and then lexically by id).
132              
133             # $report is a Bio::FastParsers::Uclust
134             for my $repr ( $report->all_representatives_by_cluster_size ) {
135             # process $repr
136             # ...
137             }
138              
139             This method does not accept any arguments.
140              
141             =head2 members_for
142              
143             Returns all the ids of the member sequences of the cluster corresponding to
144             the id of the specified representative (as an array reference).
145              
146             # $report is a Bio::FastParsers::Uclust
147             for my $repr ( $report->all_representatives ) {
148             my $members = $report->members_for($repr);
149             # process $members ArrayRef
150             # ...
151             }
152              
153             This method requires one argument: the id of the representative.
154              
155             =head2 clust_mapper
156              
157             Returns a L<Bio::MUST::Core::IdMapper> object associating representative
158             sequence ids to stringified full lists of their member sequence ids (including
159             the representatives themselves).
160              
161             This method needs L<Bio::MUST::Core> to be installed on the computer.
162              
163             # $report is a Bio::FastParsers::Uclust
164             my $mapper = $report->clust_mapper(':');
165              
166             The native methods from L<Bio::MUST::Core::IdMapper> can be applied on
167             C<$mapper>, e.g., C<all_long_ids> or C<long_id_for>.
168              
169             This method accepts an optional argument: the id separator (default: C</>).
170              
171             =head1 AUTHOR
172              
173             Denis BAURAIN <denis.baurain@uliege.be>
174              
175             =head1 CONTRIBUTOR
176              
177             =for stopwords Amandine BERTRAND
178              
179             Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
180              
181             =head1 COPYRIGHT AND LICENSE
182              
183             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
184              
185             This is free software; you can redistribute it and/or modify it under
186             the same terms as the Perl 5 programming language system itself.
187              
188             =cut