File Coverage

blib/lib/Bio/FastParsers/CdHit.pm
Criterion Covered Total %
statement 34 34 100.0
branch 8 10 80.0
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 47 50 94.0


line stmt bran cond sub pod time code
1             package Bio::FastParsers::CdHit;
2             # ABSTRACT: Front-end class for CD-HIT parser
3             # CONTRIBUTOR: Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
4             $Bio::FastParsers::CdHit::VERSION = '0.213510';
5 7     7   60 use Moose;
  7         20  
  7         64  
6 7     7   52265 use namespace::autoclean;
  7         21  
  7         81  
7              
8 7     7   726 use autodie;
  7         20  
  7         80  
9              
10 7     7   45753 use Tie::IxHash;
  7         16945  
  7         3529  
11              
12             extends 'Bio::FastParsers::Base';
13              
14              
15             # public attributes (inherited)
16              
17              
18             with 'Bio::FastParsers::Roles::Clusterable';
19              
20              
21             sub BUILD {
22 1     1 0 5 my $self = shift;
23              
24 1         7 my $cluster_like = qr{\>Cluster \s (\d+)}xms;
25 1         6 my $repr_id_like = qr{\d+ \t \d+\w{2}\, \s \>([\w\|\.]+) .{4} \* }xms;
26 1         5 my $memb_id_like = qr{\d+ \t \d+\w{2}\, \s \>([\w\|\.]+) .{4} at .* \%}xms;
27              
28 1         10 my $infile = $self->filename;
29 1         52 open my $in, '<', $infile;
30              
31 1         2087 tie my %members_for, 'Tie::IxHash';
32              
33 1         36 my $repr_id;
34             my @members;
35              
36 1         49 while (my $line = <$in>) {
37 1157         2027 chomp $line;
38              
39 1157 100       9945 if ($line =~ $cluster_like){
    100          
    50          
40             #### cluster: $line
41 112 100       229 push @{ $members_for{$repr_id} }, @members
  111         491  
42             if $repr_id;
43 112         2942 $repr_id = q{};
44 112         402 @members = ();
45             #### $repr_id
46             #### @members
47             #### %members_for
48             }
49              
50             elsif ($line =~ $repr_id_like) {
51             #### reference sequence: $line
52 112         396 $repr_id = $1;
53             #### $repr_id
54             }
55              
56             # find other seq (array)
57             elsif ($line =~ $memb_id_like){
58             #### member sequence: $line
59 933         1996 my $memb_id = $1;
60 933         2740 push @members, $memb_id;
61             #### @members
62             }
63             }
64              
65 1 50       6 push @{ $members_for{$repr_id} }, @members
  1         9  
66             if $repr_id;
67             #### %members_for
68              
69             # store representative and member sequence ids
70 1         104 $self->_set_members_for( \%members_for );
71              
72 1         61 return;
73             }
74              
75             __PACKAGE__->meta->make_immutable;
76             1;
77              
78             __END__
79              
80             =pod
81              
82             =head1 NAME
83              
84             Bio::FastParsers::CdHit - Front-end class for CD-HIT parser
85              
86             =head1 VERSION
87              
88             version 0.213510
89              
90             =head1 SYNOPSIS
91              
92             use aliased 'Bio::FastParsers::CdHit';
93              
94             # open and parse CD-HIT report (cluster file)
95             my $infile = 'test/cdHit.out.clstr';
96             my $report = CdHit->new( file => $infile );
97              
98             # loop through representatives to get members
99             for my $repr ( $report->all_representatives ) {
100             my $members = $report->members_for($repr);
101             # ...
102             }
103              
104             # get representatives ordered by descending cluster size
105             my @reprs = $report->all_representatives_by_cluster_size;
106              
107             # create IdMapper
108             # Note: this requires Bio::MUST::Core
109             my $mapper = $report->clust_mapper(':');
110             my @long_ids = $mapper->all_long_ids;
111              
112             # ...
113              
114             =head1 DESCRIPTION
115              
116             This module implements a parser for the output file of the CD-HIT program. It
117             provides methods for getting the ids of the representative sequences (either
118             sorted by descending cluster size or not) and for obtaining the members of any
119             cluster from the id of its representative.
120              
121             It also has a method for facilitating the re-mapping of all the ids of every
122             cluster on a phylogenetic tree through a L<Bio::MUST::Core::IdMapper> object.
123              
124             =head1 ATTRIBUTES
125              
126             =head2 file
127              
128             Path to CD-HIT report file to be parsed
129              
130             =head1 METHODS
131              
132             =head2 all_representatives
133              
134             Returns the ids of the representative sequences of all clusters as a list
135             (not an array reference).
136              
137             # $report is a Bio::FastParsers::CdHit
138             for my $repr ( $report->all_representatives ) {
139             # process $repr
140             # ...
141             }
142              
143             This method does not accept any arguments.
144              
145             =head2 all_representatives_by_cluster_size
146              
147             Returns the ids of the representative sequences of all clusters as a list (not
148             an array reference), sorted by descending cluster size (and then lexically by id).
149              
150             # $report is a Bio::FastParsers::CdHit
151             for my $repr ( $report->all_representatives_by_cluster_size ) {
152             # process $repr
153             # ...
154             }
155              
156             This method does not accept any arguments.
157              
158             =head2 members_for
159              
160             Returns all the ids of the member sequences of the cluster corresponding to
161             the id of the specified representative (as an array reference).
162              
163             # $report is a Bio::FastParsers::CdHit
164             for my $repr ( $report->all_representatives ) {
165             my $members = $report->members_for($repr);
166             # process $members ArrayRef
167             # ...
168             }
169              
170             This method requires one argument: the id of the representative.
171              
172             =head2 clust_mapper
173              
174             Returns a L<Bio::MUST::Core::IdMapper> object associating representative
175             sequence ids to stringified full lists of their member sequence ids (including
176             the representatives themselves).
177              
178             This method needs L<Bio::MUST::Core> to be installed on the computer.
179              
180             # $report is a Bio::FastParsers::CdHit
181             my $mapper = $report->clust_mapper(':');
182              
183             The native methods from L<Bio::MUST::Core::IdMapper> can be applied to
184             C<$mapper>, e.g., C<all_long_ids> or C<long_id_for>.
185              
186             This method accepts an optional argument: the id separator (default: C</>).
187              
188             =head1 AUTHOR
189              
190             Denis BAURAIN <denis.baurain@uliege.be>
191              
192             =head1 CONTRIBUTOR
193              
194             =for stopwords Amandine BERTRAND
195              
196             Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
197              
198             =head1 COPYRIGHT AND LICENSE
199              
200             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
201              
202             This is free software; you can redistribute it and/or modify it under
203             the same terms as the Perl 5 programming language system itself.
204              
205             =cut