File Coverage

blib/lib/Bio/LITE/Taxonomy/NCBI.pm
Criterion Covered Total %
statement 64 88 72.7
branch 14 32 43.7
condition 4 13 30.7
subroutine 12 17 70.5
pod 4 4 100.0
total 98 154 63.6


line stmt bran cond sub pod time code
1             package Bio::LITE::Taxonomy::NCBI;
2              
3             =head1 NAME
4              
5             Bio::LITE::Taxonomy::NCBI - Lightweight and efficient NCBI taxonomic manager
6              
7             =head1 SYNOPSIS
8              
9             use Bio::LITE::Taxonomy::NCBI;
10              
11             my $taxDB = Bio::LITE::Taxonomy::NCBI->new (
12             db=>"NCBI",
13             names=> "/path/to/names.dmp",
14             nodes=>"/path/to/nodes.dmp"
15             );
16              
17             my $tax = $taxDB->get_taxonomy(1442); # 1442 is a Taxid
18             my $taxid = $taxDB->get_taxid_from_name("Bacteroidetes");
19             my $term = $taxDB->get_term_at_level(1442,"family");
20              
21             my $taxDB2 = Bio::LITE::Taxonomy::NCBI-> new (
22             db=>"NCBI",
23             names=> "/path/to/names.dmp",
24             nodes=>"/path/to/nodes.dmp",
25             dict=>"/path/to/dictionary/file",
26             );
27             my $tax2 = $taxDB2->get_taxonomy_from_gi(12553);
28              
29             # Methods from Bio::LITE::Taxonomy::NCBI::Gi2taxid
30             # can also be called directly:
31              
32             my $taxid2 = $taxDB2->get_taxid(12553);
33              
34              
35             =head1 DESCRIPTION
36              
37             This module provides easy and efficient access to the NCBI taxonomy with minimal dependencies and without intermediary databases.
38              
39             This module is not part of the Bioperl bundle. For Bioperl alternatives see the L</SEE ALSO> section of this document.
40              
41             =head1 CONSTRUCTOR
42              
43             =head2 C<new>
44              
45             Creates a Bio::LITE::Taxonomy::NCBI object.
46              
47             The following parameters are needed
48              
49             =over 4
50              
51             =item names
52              
53             The location of the I<names.dmp> file. Filehandles are also allowed. I<Required>.
54              
55             =item nodes
56              
57             The location of the I<nodes.dmp> file. Filehandles are also allowed. I<Required>.
58              
59             =item synonyms
60              
61             An array reference listing the categories of synonymous names made available to methods C and C. This parameter is optional and set to C<['synonym']> by default.
62              
63             As of May 2015, meaningful values are: I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I.
64              
65             my $taxDB = Bio::LITE::Taxonomy::NCBI->new (
66             db=>"NCBI",
67             names=> "/path/to/names.dmp",
68             nodes=>"/path/to/nodes.dmp",
69             synonyms=>['anamorph','teleomorph','synonym']
70             );
71              
72             =item dict
73              
74             You can query the tree using C<GI>s directly instead of C<Taxid>s. To do this, you should provide NCBI's GI to Taxid mapper in binary format as explained in L<Bio::LITE::Taxonomy::NCBI::Gi2taxid>. I<Optional>.
75              
76             =item save_mem
77              
78             Use this option to avoid loading the binary dictionary (GI to Taxid) into memory. This will save almost 1GB of system memory, but looking up Taxids will be ~20% slower. This parameter is optional, only makes sense if you are using the C<GI> to C<Taxid> dictionary, and is I<off> by default.
79              
80             =back
81              
82             =head1 METHODS
83              
84             This module inherits from L<Bio::LITE::Taxonomy>, so all the methods explained there are accessible.
85             These methods are also available:
86              
87             =over 4
88              
89             =item get_taxonomy_from_gi
90              
91             Accepts a C<GI> as input and returns an array with its ascendants ordered from top to bottom.
92              
93             my @tax = $tax->get_taxonomy_from_gi($gi);
94             print "$_\n" for (@tax);
95              
96             If called in scalar context, returns an array reference instead of the array.
97             See L<Bio::LITE::Taxonomy>::get_taxonomy
98              
99             =item get_taxonomy_with_levels_from_gi
100              
101             The same as get_taxonomy_from_gi, but instead of returning just the ascendants it returns an array of array references. Each array reference holds the ascendant and its taxonomic level (at positions 0 and 1 respectively). This is simpler than it sounds. Check L<Bio::LITE::Taxonomy>::get_taxonomy_with_levels for more information.
102              
103             If called in scalar context, returns an array reference instead of the array.
104              
105             =item get_term_at_level_from_gi
106              
107             Given a gi and a taxonomic level as input, returns the taxon. For example,
108              
109             my $taxon = $tax->get_term_at_level_from_gi($gi,"family");
110              
111             See L<Bio::LITE::Taxonomy>::get_term_at_level.
112              
113             =back
114              
115             =head1 SEE ALSO
116              
117             L<Bio::LITE::Taxonomy::NCBI::Gi2taxid> - Module to convert NCBI's GIs to Taxids
118              
119             L
120              
121             L
122              
123             L - Bioperl alternative for NCBI taxonomies.
124              
125              
126             =head1 AUTHOR
127              
128             Miguel Pignatelli
129             Any comments or suggestions should be addressed to emepyc@gmail.com
130              
131             =head1 CONTRIBUTORS
132              
133             Denis Baurain (denis.baurain -AT- ulg.ac.be)
134              
135             =head1 LICENSE
136              
137             Copyright 2015 Miguel Pignatelli, all rights reserved.
138              
139             This library is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
140              
141             =cut
142              
143 2     2   33994 use strict;
  2         3  
  2         60  
144 2     2   7 use warnings;
  2         2  
  2         49  
145 2     2   7 use Carp qw/croak/;
  2         10  
  2         94  
146              
147             #use Bio::LITE::Taxonomy;
148             #if (do {(sprintf "%vd",$^V) =~ /5\.(\d\d)/; $1} >= 10}) {
149             # import base qw(Taxonomy);
150             #} else {
151             # import parent qw(Taxonomy);
152             #}
153 2     2   9 use base qw(Bio::LITE::Taxonomy);
  2         2  
  2         701  
154              
155             our $VERSION = 0.1;
156              
157 2     2   895 use constant FS => '\t\|\t';
  2         3  
  2         126  
158 2     2   8 use constant RS => '\t\|\n';
  2         5  
  2         1805  
159              
160             sub new {
161 1     1 1 1696 my ($class,%args) = @_;
162 1         4 my %opts;
163              
164 1 50       7 defined $args{'nodes'} or croak "Need the file nodes.dmp";
165 1 50       4 defined $args{'names'} or croak "Need the file names.dmp";
166              
167 1         7 @opts{qw/nodesFile namesFile synList/} = @args{qw/nodes names synonyms/};
168 1         4 my $self = bless \%opts, $class;
169 1         6 $self->_build_taxonomy();
170 1 50       10 if (defined $args{dict}) {
171 0         0 require Bio::LITE::Taxonomy::NCBI::Gi2taxid;
172 0   0     0 $self->{dict} = Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(dict=>$args{dict},save_mem=>$args{save_mem} || 0);
173             }
174 1         13 return $self;
175             }
176              
177             sub _build_taxonomy {
178 1     1   2 my ($self) = @_;
179 1         9 my $nodesFile = $self->{nodesFile};
180 1         2 my $tax;
181 1 50 33     18 if ((UNIVERSAL::isa($nodesFile, 'GLOB')) or (ref \$nodesFile eq 'GLOB')) {
182 0         0 $tax = $nodesFile;
183             } else {
184 1 50       38 open $tax, "<", $nodesFile or croak "$!";
185             }
186 1         9363 while (<$tax>){
187 560790         693244 chomp;
188 560790         736204 $self -> _create_node(_parse_tax_rec($_));
189             }
190 1         5 $self -> _name_nodes();
191 1 50 33     38 close $tax unless ((UNIVERSAL::isa($nodesFile, 'GLOB')) or (ref \$nodesFile eq 'GLOB'));
192             }
193              
194             sub _create_node {
195 560790     560790   969653 my ($self,$node,$parent,$level) = @_;
196 560790 100       1274400 $self->{allowed_levels}{$level} = 1 if (! defined $self->{allowed_levels}{$level});
197 560790         553750 @{$self->{nodes}->{$node}}{qw/parent level/} = ($parent,$level);
  560790         10497321  
198             }
199              
200             sub _name_nodes {
201 1     1   2 my ($self) = @_;
202 1         2 my $namesFile = $self->{namesFile};
203 1         1 my $nodesNames;
204 1 50 33     21 if ((UNIVERSAL::isa($namesFile, 'GLOB')) or (ref \$namesFile eq 'GLOB')) {
205 0         0 $nodesNames = $namesFile;
206             } else {
207 1 50       62 open $nodesNames, "<", $namesFile or croak $!;
208             }
209 1   50     2 my %wanted = map { $_ => 1 } @{$self->{synList} // ['synonym']};
  3         13  
  1         5  
210 1         3367937 while (<$nodesNames>){
211 806059         1026659 chomp;
212 806059         1130829 my ($taxId,$taxName,$comment) = _process_tax_name ($_);
213 806059 100       16799870 if ($comment eq 'scientific name'){
    100          
214 560790         510459 ${$self->{nodes}->{$taxId}}{name} = $taxName;
  560790         1663771  
215 560790         47439646 $self->{names}->{$taxName} = $taxId;
216             } elsif ($wanted{$comment}) {
217 138762         20893392 $self->{names}->{$taxName} = $taxId;
218             }
219             }
220 1         44 close $nodesNames;
221             }
222              
223             sub _parse_tax_rec {
224 560790     560790   797380 my $line = shift @_;
225 560790         3263971 return (split FS,$line)[0,1,2];
226             }
227              
228              
229             sub _process_tax_name {
230 806059     806059   1229101 my $line = shift @_;
231 806059         3446090 my @fields = split FS, $line;
232 806059         2543159 $fields[3] =~ s/\t\|$//;
233 806059         2866887 return ($fields[0],$fields[1],$fields[3]);
234             }
235              
236             sub get_taxonomy_from_gi {
237 0     0 1   my ($self,$gi) = @_;
238 0 0         croak "Undefined GI\n" unless (defined $gi);
239 0           my $taxid = $self->{dict}->get_taxid($gi);
240 0           return $self->get_taxonomy($taxid);
241             }
242              
243             sub get_taxonomy_with_levels_from_gi {
244 0     0 1   my ($self,$gi) = @_;
245 0 0         croak "Undefined GI\n" unless (defined $gi);
246 0           my $taxid = $self->{dict}->get_taxid($gi);
247 0           return $self->get_taxonomy_with_levels($taxid);
248             }
249              
250             sub get_term_at_level_from_gi {
251 0     0 1   my ($self,$gi,$level) = @_;
252 0 0         croak "Undefined GI\n" unless (defined $gi);
253 0 0         croak "Undefined Level\n" unless (defined $level);
254 0           my $taxid = $self->{dict}->get_taxid($gi);
255 0           return $self->get_term_at_level($taxid,$level);
256             }
257              
258             # Note: Use methods in Gi2taxid as if they were from here
259             sub AUTOLOAD {
260 0     0     my ($self,$args) = @_;
261 0           our $AUTOLOAD;
262 0           my $method = $AUTOLOAD;
263 0           $method =~ s/.*:://;
264 0 0         $self->{dict}->can($method) or croak "$method not defined in package __PACKAGE__\n";
265 0           return $self->{dict}->$method($args);
266             }
267              
268             # Needed so that AUTOLOAD is not called on object destruction
269 0     0     sub DESTROY { }
270              
271             1;