File Coverage

Bio/DB/Taxonomy/greengenes.pm
Criterion Covered Total %
statement 32 32 100.0
branch 7 8 87.5
condition 2 2 100.0
subroutine 4 4 100.0
pod 1 1 100.0
total 46 47 97.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Taxonomy::greengenes
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Florent Angly
7             #
8             # Copyright Florent Angly
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::DB::Taxonomy::greengenes - Use the Greengenes taxonomy
17              
18             =head1 SYNOPSIS
19              
20             use Bio::DB::Taxonomy;
21              
22             my $db = Bio::DB::Taxonomy->new(
23             -source => 'greengenes',
24             -taxofile => 'taxonomy_16S_candiv_gg_2011_1.txt'
25             );
26              
27             =head1 DESCRIPTION
28              
29             I
30              
31             Bio::DB::Taxonomy::greengenes is an implementation of Bio::DB::Taxonomy which
32             stores and accesses the Greengenes taxonomy of Bacteria and Archaea. Internally,
33             it keeps the taxonomy into memory by using Bio::DB::Taxonomy::list. As a
34             consequence, note that the IDs assigned to the taxonomy nodes, e.g. gg123, are
35             arbitrary, contrary to the pre-defined IDs that NCBI assigns to taxons.
36              
37             The latest release of the Greengene taxonomy (2011) contains about 4,600 taxa
38             and occupies about 4MB of memory once parsed into a Bio::DB::Taxonomy::greengenes
39             object. The taxonomy files taxonomy_16S_all_gg_2011_1.txt and
40             taxonomy_16S_candiv_gg_2011_1.txt that this module can use are available from
41             L.
42              
43             =head1 FEEDBACK
44              
45             =head2 Mailing Lists
46              
47             User feedback is an integral part of the evolution of this and other
48             Bioperl modules. Send your comments and suggestions preferably to
49             the Bioperl mailing list. Your participation is much appreciated.
50              
51             bioperl-l@bioperl.org - General discussion
52             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53              
54             =head2 Support
55              
56             Please direct usage questions or support issues to the mailing list:
57              
58             I
59              
60             rather than to the module maintainer directly. Many experienced and
61             reponsive experts will be able look at the problem and quickly
62             address it. Please include a thorough description of the problem
63             with code and data examples if at all possible.
64              
65             =head2 Reporting Bugs
66              
67             Report bugs to the Bioperl bug tracking system to help us keep track
68             of the bugs and their resolution. Bug reports can be submitted via
69             the web:
70              
71             https://github.com/bioperl/bioperl-live/issues
72              
73             =head1 AUTHOR - Florent Angly
74              
75             florent.angly@gmail.com
76              
77             =head1 APPENDIX
78              
79             The rest of the documentation details each of the object methods.
80             Internal methods are usually preceded with a _
81              
82             =cut
83              
84             # Let the code begin...
85              
86             package Bio::DB::Taxonomy::greengenes;
87              
88 1     1   3 use strict;
  1         1  
  1         26  
89 1     1   3 use base qw(Bio::DB::Taxonomy Bio::DB::Taxonomy::list);
  1         1  
  1         335  
90              
91             $Bio::DB::Taxonomy::list::prefix = 'gg';
92              
93              
94             =head2 new
95              
96             Title : new
97             Usage : my $obj = Bio::DB::Taxonomy::greengenes->new();
98             Function: Builds a new Bio::DB::Taxonomy::greengenes object
99             Returns : an instance of Bio::DB::Taxonomy::greengenes
100             Args : -taxofile => name of the file containing the taxonomic information,
101             typically 'taxonomy_16S_candiv_gg_2011_1.txt' (mandatory)
102              
103             =cut
104              
105             sub new {
106             # Override Bio::DB::Taxonomy
107 2     2 1 4 my($class, @args) = @_;
108 2         7 my $self = $class->SUPER::new(@args);
109 2         8 my ($taxofile) = $self->_rearrange([qw(TAXOFILE)], @args);
110            
111 2 100       5 if ( $taxofile ) {
112 1         2 $self = $self->_build_taxonomy($taxofile);
113             }
114              
115 2         11 return $self;
116             }
117              
118              
119             sub _build_taxonomy {
120 1     1   1 my ($self, $taxofile) = @_;
121              
122 1         2 my $all_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'];
123              
124 1         2 my $taxonomy = Bio::DB::Taxonomy::list->new();
125              
126 1 50       36 open my $fh, '<', $taxofile or $self->throw("Could not read file '$taxofile': $!");
127              
128             # Will skip header line: prokMSA_id taxonomy
129 1         1 my $prev_taxo_string = 'taxonomy';
130              
131 1         1 my $line;
132              
133             # Parse taxonomy lines. Example:
134             # 348902 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__Bacteroides plebeius
135 1         20 while ($line = <$fh>) {
136 51         39 chomp $line;
137 51         98 my ($prokmsa_id, $taxo_string) = split "\t", $line;
138              
139             # Skip taxonomy string already seen on previous line (much faster!)
140 51 100       84 next if $taxo_string eq $prev_taxo_string;
141 40         33 $prev_taxo_string = $taxo_string;
142              
143             # Remove ambiguous taxons, i.e. go from:
144             # k__Archaea; p__pMC2A384; c__; o__; f__; g__; s__
145             # to:
146             # k__Archaea; p__pMC2A384
147 40         194 my $names = [split /;\s*/, $taxo_string];
148 40   100     117 while ( ($names->[-1] || '') =~ m/__$/) {
149 50         101 pop @$names;
150             }
151              
152 40         30 my $nof_ranks = scalar @$names;
153 40 100       51 next if $nof_ranks < 1;
154              
155             $taxonomy->add_lineage(
156 39         56 -ranks => [ @{$all_ranks}[0..$nof_ranks-1] ],
  39         118  
157             -names => $names,
158             );
159              
160             }
161              
162 1         28 close $fh;
163              
164 1         5 return $taxonomy;
165             }
166              
167              
168             1;