File Coverage

blib/lib/Bio/FastParsers/CdHit.pm
Criterion Covered Total %
statement 34 34 100.0
branch 8 10 80.0
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 47 50 94.0


line stmt bran cond sub pod time code
1             # ABSTRACT: Front-end class for CD-HIT parser
2             # CONTRIBUTOR: Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
3             $Bio::FastParsers::CdHit::VERSION = '0.221230';
4             use Moose;
5 7     7   50 use namespace::autoclean;
  7         16  
  7         57  
6 7     7   42217  
  7         21  
  7         61  
7             use autodie;
8 7     7   567  
  7         15  
  7         57  
9             use Tie::IxHash;
10 7     7   36735  
  7         13588  
  7         2653  
11             extends 'Bio::FastParsers::Base';
12              
13              
14             # public attributes (inherited)
15              
16              
17             with 'Bio::FastParsers::Roles::Clusterable';
18              
19              
20             my $self = shift;
21              
22 1     1 0 4 my $cluster_like = qr{\>Cluster \s (\d+)}xms;
23             my $repr_id_like = qr{\d+ \t \d+\w{2}\, \s \>([\w\|\.]+) .{4} \* }xms;
24 1         6 my $memb_id_like = qr{\d+ \t \d+\w{2}\, \s \>([\w\|\.]+) .{4} at .* \%}xms;
25 1         5  
26 1         4 my $infile = $self->filename;
27             open my $in, '<', $infile;
28 1         9  
29 1         40 tie my %members_for, 'Tie::IxHash';
30              
31 1         1679 my $repr_id;
32             my @members;
33 1         17  
34             while (my $line = <$in>) {
35             chomp $line;
36 1         55  
37 1157         1391 if ($line =~ $cluster_like){
38             #### cluster: $line
39 1157 100       7989 push @{ $members_for{$repr_id} }, @members
    100          
    50          
40             if $repr_id;
41 112 100       216 $repr_id = q{};
  111         360  
42             @members = ();
43 112         2260 #### $repr_id
44 112         351 #### @members
45             #### %members_for
46             }
47              
48             elsif ($line =~ $repr_id_like) {
49             #### reference sequence: $line
50             $repr_id = $1;
51             #### $repr_id
52 112         325 }
53              
54             # find other seq (array)
55             elsif ($line =~ $memb_id_like){
56             #### member sequence: $line
57             my $memb_id = $1;
58             push @members, $memb_id;
59 933         1669 #### @members
60 933         2161 }
61             }
62              
63             push @{ $members_for{$repr_id} }, @members
64             if $repr_id;
65 1 50       4 #### %members_for
  1         5  
66              
67             # store representative and member sequence ids
68             $self->_set_members_for( \%members_for );
69              
70 1         68 return;
71             }
72 1         44  
73             __PACKAGE__->meta->make_immutable;
74             1;
75              
76              
77             =pod
78              
79             =head1 NAME
80              
81             Bio::FastParsers::CdHit - Front-end class for CD-HIT parser
82              
83             =head1 VERSION
84              
85             version 0.221230
86              
87             =head1 SYNOPSIS
88              
89             use aliased 'Bio::FastParsers::CdHit';
90              
91             # open and parse CD-HIT report (cluster file)
92             my $infile = 'test/cdHit.out.clstr';
93             my $report = CdHit->new( file => $infile );
94              
95             # loop through representatives to get members
96             for my $repr ( $report->all_representatives ) {
97             my $members = $report->members_for($repr);
98             # ...
99             }
100              
101             # get representatives ordered by descending cluster size
102             my @reprs = $report->all_representatives_by_cluster_size;
103              
104             # create IdMapper
105             # Note: this requires Bio::MUST::Core
106             my $mapper = $report->clust_mapper(':');
107             my @long_ids = $mapper->all_long_ids;
108              
109             # ...
110              
111             =head1 DESCRIPTION
112              
113             This module implements a parser for the output file of the CD-HIT program. It
114             provides methods for getting the ids of the representative sequences (either
115             sorted by descending cluster size or not) and for obtaining the members of any
116             cluster from the id of its representative.
117              
118             It also has a method for facilitating the re-mapping of all the ids of every
119             cluster on a phylogenetic tree through a L<Bio::MUST::Core::IdMapper> object.
120              
121             =head1 ATTRIBUTES
122              
123             =head2 file
124              
125             Path to CD-HIT report file to be parsed
126              
127             =head1 METHODS
128              
129             =head2 all_representatives
130              
131             Returns all the ids of the representative sequences of the clusters (not an
132             array reference).
133              
134             # $report is a Bio::FastParsers::CdHit
135             for my $repr ( $report->all_representatives ) {
136             # process $repr
137             # ...
138             }
139              
140             This method does not accept any arguments.
141              
142             =head2 all_representatives_by_cluster_size
143              
144             Returns all the ids of the representative sequences of the clusters (not an
145             array reference) sorted by descending cluster size (and then lexically by id).
146              
147             # $report is a Bio::FastParsers::CdHit
148             for my $repr ( $report->all_representatives_by_cluster_size ) {
149             # process $repr
150             # ...
151             }
152              
153             This method does not accept any arguments.
154              
155             =head2 members_for
156              
157             Returns all the ids of the member sequences of the cluster corresponding to
158             the id of the specified representative (as an array reference).
159              
160             # $report is a Bio::FastParsers::CdHit
161             for my $repr ( $report->all_representatives ) {
162             my $members = $report->members_for($repr);
163             # process $members ArrayRef
164             # ...
165             }
166              
167             This method requires one argument: the id of the representative.
168              
169             =head2 clust_mapper
170              
171             Returns a L<Bio::MUST::Core::IdMapper> object associating representative
172             sequence ids to stringified full lists of their member sequence ids (including
173             the representatives themselves).
174              
175             This method needs L<Bio::MUST::Core> to be installed on the computer.
176              
177             # $report is a Bio::FastParsers::CdHit
178             my $mapper = $report->clust_mapper(':');
179              
180             The native methods from L<Bio::MUST::Core::IdMapper> can be called on
181             C<$mapper>, e.g., C<all_long_ids> or C<long_id_for>.
182              
183             This method accepts an optional argument: the id separator (default: C</>).
184              
185             =head1 AUTHOR
186              
187             Denis BAURAIN <denis.baurain@uliege.be>
188              
189             =head1 CONTRIBUTOR
190              
191             =for stopwords Amandine BERTRAND
192              
193             Amandine BERTRAND <amandine.bertrand@doct.uliege.be>
194              
195             =head1 COPYRIGHT AND LICENSE
196              
197             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
198              
199             This is free software; you can redistribute it and/or modify it under
200             the same terms as the Perl 5 programming language system itself.
201              
202             =cut