File Coverage

Bio/DB/Taxonomy/list.pm
Criterion Covered Total %
statement 107 133 80.4
branch 42 64 65.6
condition 18 38 47.3
subroutine 9 10 90.0
pod 7 7 100.0
total 183 252 72.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Taxonomy::list
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Sendu Bala
7             #
8             # Copyright Sendu Bala
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14              
15             =head1 NAME
16              
17             Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18             that accepts lists of words to build a database
19              
20             =head1 SYNOPSIS
21              
22             use Bio::DB::Taxonomy;
23              
24             my $db = Bio::DB::Taxonomy->new( -source => 'list' );
25              
26             my @ranks = ('superkingdom', 'class', 'genus', 'species');
27             my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28             $db->add_lineage(-names => \@names, -ranks => \@ranks);
29              
30             @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31             $db->add_lineage(-names => \@names, -ranks => \@ranks);
32              
33             =head1 DESCRIPTION
34              
35             This is an implementation which uses supplied lists of words to create a
36             database from which you can extract Bio::Taxon objects.
37              
38             =head1 TODO
39              
40             It is possible this module could do something like store the data it builds
41             up to disc. Would that be useful?
42             At any rate, this is why the module is called 'list' and not 'in_memory' or
43             similar.
44              
45             =head1 FEEDBACK
46              
47             =head2 Mailing Lists
48              
49             User feedback is an integral part of the evolution of this and other
50             Bioperl modules. Send your comments and suggestions preferably to
51             the Bioperl mailing list. Your participation is much appreciated.
52              
53             bioperl-l@bioperl.org - General discussion
54             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55              
56             =head2 Support
57              
58             Please direct usage questions or support issues to the mailing list:
59              
60             I
61              
62             rather than to the module maintainer directly. Many experienced and
63             reponsive experts will be able look at the problem and quickly
64             address it. Please include a thorough description of the problem
65             with code and data examples if at all possible.
66              
67             =head2 Reporting Bugs
68              
69             Report bugs to the Bioperl bug tracking system to help us keep track
70             of the bugs and their resolution. Bug reports can be submitted via
71             the web:
72              
73             https://github.com/bioperl/bioperl-live/issues
74              
75             =head1 AUTHOR - Sendu Bala
76              
77             Email bix@sendu.me.uk
78              
79             =head1 APPENDIX
80              
81             The rest of the documentation details each of the object methods.
82             Internal methods are usually preceded with a _
83              
84             =cut
85              
86             # Let the code begin...
87              
88              
89             package Bio::DB::Taxonomy::list;
90              
91 34     34   121 use strict;
  34         40  
  34         870  
92 34     34   770 use Bio::Taxon;
  34         33  
  34         723  
93              
94 34     34   98 use base qw(Bio::DB::Taxonomy);
  34         40  
  34         34247  
95              
96             our $prefix = 'list';
97              
98              
99             =head2 new
100              
101             Title : new
102             Usage : my $obj = Bio::DB::Taxonomy::list->new();
103             Function: Builds a new Bio::DB::Taxonomy::list object
104             Returns : an instance of Bio::DB::Taxonomy::list
105             Args : optional, as per the add_lineage() method.
106              
107             =cut
108              
109             sub new {
110 252     252 1 566 my ($class, @args) = @_;
111 252         887 my $self = $class->SUPER::new(@args);
112 252         590 my %args = @args;
113 252         378 delete $args{'-source'};
114              
115 252 100       1128 $self->add_lineage(%args) if %args;
116            
117 252         1142 return $self;
118             }
119              
120              
121             =head2 add_lineage
122              
123             Title : add_lineage
124             Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125             my @ranks = ('superkingdom', 'class', 'genus', 'species');
126             $db->add_lineage( -names => \@names, -ranks => \@ranks );
127             Function: Add a lineage to the database, where the lineage is described by
128             a list of scientific names in the order root->leaf. The rank of each
129             name can optionally be described by supplying an additional list
130             of rank names in the same order (eg. superkingdom->species).
131             Returns : 1 for success
132             Args : -names => [] : array ref of scientific names, REQUIRED
133             -ranks => [] : array ref of rank names, same order as above, OPTIONAL
134              
135             =cut
136              
137             sub add_lineage {
138 331     331 1 556 my ($self, @args) = @_;
139 331         1185 my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140 331 50 33     1752 $self->throw("-names must be supplied and its value must be an array reference")
141             unless $names && ref($names) eq 'ARRAY';
142              
143 331         508 my $names_idx = scalar @$names - 1;
144              
145 331 100       617 if ($ranks) {
146 39 50       51 $self->throw("-ranks must be an array reference")
147             unless ref($ranks) eq 'ARRAY';
148 39 50       48 $self->throw("The -names and -ranks lists must be of equal length")
149             unless $names_idx == scalar @$ranks - 1;
150             }
151            
152             # This is non-trivial because names are not guaranteed unique in a taxonomy,
153             # and neither are name&rank combinations. Furthermore, different name&rank
154             # combinations can actually refer to the same taxon, eg. when one time
155             # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
156             #
157             # name&rank&ancestor could well be unique (or good enough 99.9999% of the
158             # time), but we have the added complication that lineages could sometimes be
159             # supplied with differing numbers of taxa. Ideally we want to realise that
160             # the first of these two lineages shares all its nodes with the second:
161             # ('Mammalia', 'Homo', 'Homo sapiens')
162             # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
163             #
164             # Clearly with limited information we can't do a perfect job, but we can try
165             # and do a reasonable one. So, let's just do the trivial implementation now
166             # and see how bad it is! (assumes ranks are unique except for 'no rank')
167            
168 331         470 my $ancestors = $self->{ancestors};
169 331         603 my $node_data = $self->{node_data};
170 331         364 my $name_to_id = $self->{name_to_id};
171 331         360 my $children = $self->{children};
172              
173 331         354 my $my_ancestor_id = '';
174 331         279 my @node_ids;
175 331         869 for my $i (0 .. $names_idx) {
176 3010         2614 my $name = $names->[$i];
177 3010         2167 my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
178              
179             # This is a new node with a new id if we haven't seen this name before.
180             # It's also always a new node if this is the first lineage going into
181             # the db.
182             #
183             # We need to handle, however, situations in the future where we try to
184             # merge in a new lineage but we have non-unique names in the lineage
185             # and possible missing classes in some lineages, e.g.
186             # '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187             # merged with
188             # '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189             # but still need the 'tree' to be correct
190              
191             # Look for a node that is consistent with this lineage
192 3010         1870 my $node_id;
193 3010         4869 SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
  3010         6442  
194              
195             # Taxa are the same if it they have the same ancestor or none
196 557   100     774 my $this_ancestor_id = $ancestors->{$same_id} || '';
197 557 100       647 if ($my_ancestor_id eq $this_ancestor_id) {
198 282         153 $node_id = $same_id;
199 282         235 last SAME_NAMED;
200             }
201            
202             # Compare children
203 275 100       290 next if $i >= $names_idx; # this taxon has no child
204 223         155 my $my_child_name = $names->[$i + 1];
205             #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206 223         129 for my $this_child_id (keys %{$children->{$same_id}}) {
  223         267  
207 234 100       349 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208 5 50       6 if ($my_ancestor_id) {
209 5         3 my @s_ancestors;
210 5         12 while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211 5 50       6 if ($my_ancestor_id eq $this_ancestor_id) {
212 0         0 $my_ancestor_id = $ancestors->{$same_id};
213 0         0 push @node_ids, @s_ancestors, $my_ancestor_id;
214 0         0 $node_id = $same_id;
215 0         0 last SAME_NAMED;
216             }
217 5         16 unshift @s_ancestors, $this_ancestor_id;
218             }
219             } else {
220             # This new lineage (@$names) doesn't start at the
221             # same root as the existing lineages. Assuming
222             # '$name' corresponds to node $same_id");
223 0         0 $node_id = $same_id;
224 0         0 last SAME_NAMED;
225             }
226             }
227             }
228             }
229            
230 3010 100       4211 if (not defined $node_id) {
231             # This is a new node. Add it to the database, using the prefix 'list'
232             # for its ID to distinguish it from the IDs from other taxonomies.
233 2728         2413 my $next_num = ++$self->{node_ids};
234 2728         2586 $node_id = $prefix.$next_num;
235 2728         1727 push @{$self->{name_to_id}->{$name}}, $node_id;
  2728         5019  
236 2728         4742 $self->{node_data}->{$node_id}->[0] = $name;
237             }
238              
239 3010 100 100     4974 if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240             # Save rank if node in database has no rank but the current node has one
241 114         100 $self->{node_data}->{$node_id}->[1] = $rank;
242             }
243              
244 3010 100       3561 if ($my_ancestor_id) {
245 2679 50 66     4786 if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246 0         0 $self->throw("The lineage '".join(', ', @$names)."' and a ".
247             "previously stored lineage share a node name but have ".
248             "different ancestries for that node. Can't cope!");
249             }
250 2679         3223 $self->{ancestors}->{$node_id} = $my_ancestor_id;
251             }
252            
253 3010         2074 $my_ancestor_id = $node_id;
254 3010         3557 push @node_ids, $node_id;
255             }
256            
257             # Go through the lineage in reverse so we can remember the children
258 331         977 for (my $i = $names_idx - 1; $i >= 0; $i--) {
259 2679         5942 $self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
260             }
261 331         1459 return 1;
262             }
263              
264              
265             =head2 Bio::DB::Taxonomy Interface implementation
266              
267             =head2 get_num_taxa
268              
269             Title : get_num_taxa
270             Usage : my $num = $db->get_num_taxa();
271             Function: Get the number of taxa stored in the database.
272             Returns : A number
273             Args : None
274              
275             =cut
276              
277             sub get_num_taxa {
278 0     0 1 0 my ($self) = @_;
279 0   0     0 return $self->{node_ids} || 0;
280             }
281              
282              
283             =head2 get_taxon
284              
285             Title : get_taxon
286             Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287             Function: Get a Bio::Taxon object from the database.
288             Returns : Bio::Taxon object
289             Args : A single value which is the ID of the taxon to retrieve
290             OR named args, as follows:
291             -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292             'list' pre-fixed ids unique to the list database).
293             OR
294             -name => String (to query by a taxonomy name). A given taxon name
295             can match different taxonomy objects. When that is the
296             case, a warning is displayed and the first matching taxon
297             is reported. See get_taxonids() to get all matching taxon
298             IDs.
299             OR
300             -names => Array ref of lineage names, like in add_lineage(). To
301             overcome the limitations of -name, you can use -names to
302             provide the full lineage of the taxon you want and get a
303             unique, unambiguous taxon object.
304              
305             =cut
306              
307             sub get_taxon {
308 6877     6877 1 10174 my ($self, @args) = @_;
309              
310 6877         5125 my $taxonid;
311 6877 100       11003 if (scalar @args == 1) {
312             # Argument is a taxon ID
313 2145         2451 $taxonid = $args[0];
314             } else {
315             # Got named arguments
316 4732         3555 my ($name, $names);
317 4732         13349 ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318 4732 100       10090 if ($name) {
319 2         4 $names = [$name];
320             }
321 4732 100       6943 if ($names) {
322 2         2 $name = $names->[-1];
323              
324 2         5 my @taxonids = $self->get_taxonids($name);
325 2         3 $taxonid = $taxonids[0];
326              
327             # Use provided lineage to find correct ID amongst several matching IDs
328 2 50 33     6 if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329 0         0 for my $query_taxonid (@taxonids) {
330 0         0 my $matched = 1;
331 0         0 my $db_ancestor = $self->get_taxon($query_taxonid);
332 0         0 for (my $i = $#$names-1; $i >= 0; $i--) {
333 0         0 my $query_ancestor_name = $names->[$i];
334 0         0 $db_ancestor = $db_ancestor->ancestor;
335 0         0 my $db_ancestor_name = '';
336 0 0       0 if ($db_ancestor) {
337 0         0 $db_ancestor_name = $db_ancestor->node_name;
338             }
339 0 0       0 if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340 0         0 $matched = 0;
341 0         0 last; # done testing this taxonid
342             }
343             }
344 0 0       0 if ($matched == 1) {
345 0         0 @taxonids = [$query_taxonid];
346 0         0 $taxonid = $query_taxonid;
347 0         0 last; # done testing all taxonids
348             }
349             }
350             }
351              
352             # Warn if several taxon IDs matched
353 2 50       6 if (scalar @taxonids > 1) {
354 0 0       0 $self->warn("There were multiple ids (@taxonids) matching '$name',".
355             " using '$taxonid'") if scalar @taxonids > 1;
356             }
357              
358             }
359             }
360            
361             # Now that we have the taxon ID, retrieve the corresponding Taxon object
362 6877         5095 my $taxon;
363 6877         8775 my $node = $self->{node_data}->{$taxonid};
364 6877 100       10387 if ($node) {
365 6873         7809 my ($sci_name, $rank) = @$node;
366 6873         19560 $taxon = Bio::Taxon->new(
367             -name => $sci_name,
368             -object_id => $taxonid, # not an ncbi taxid, simply an object id
369             );
370              
371 6873 100       11191 if ($rank) {
372 53         81 $taxon->rank($rank);
373             }
374              
375             # we can't use -dbh or the db_handle() method ourselves or we'll go
376             # infinite on the merge attempt
377 6873         7465 $taxon->{'db_handle'} = $self;
378            
379 6873         14527 $self->_handle_internal_id($taxon, 1);
380             }
381              
382 6877         14636 return $taxon;
383             }
384              
385             *get_Taxonomy_Node = \&get_taxon;
386              
387              
388             =head2 get_taxonids
389              
390             Title : get_taxonids
391             Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392             Function: Searches for a taxonid (generated by the list module) based on a
393             query string. Note that multiple taxonids can match to the same
394             supplied name.
395             Returns : array of integer ids in list context, one of these in scalar context
396             Args : string representing taxon's name
397              
398             =cut
399              
400             sub get_taxonids {
401 257     257 1 1297 my ($self, $name) = @_;
402 257 100       581 return wantarray() ? @{$self->{name_to_id}->{$name} || []} : $self->{name_to_id}->{$name}->[0];
  257 50       1061  
403             }
404              
405             *get_taxonid = \&get_taxonids;
406              
407              
408             =head2 ancestor
409              
410             Title : ancestor
411             Usage : my $ancestor_taxon = $db->ancestor($taxon)
412             Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413             database.
414             Returns : Bio::Taxon
415             Args : Bio::Taxon (that was retrieved from this database)
416              
417             =cut
418              
419             sub ancestor {
420 4476     4476 1 4243 my ($self, $taxon) = @_;
421 4476 50       6786 $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
422 4476 50 33     18752 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423 4476 50 33     6720 $self->throw("The supplied Taxon must belong to this database")
424             unless $taxon->db_handle && $taxon->db_handle eq $self;
425 4476   33     6937 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
426            
427 4476   100     14412 my $ancestor_id = $self->{ancestors}->{$id} || return;
428 2120         3303 return $self->get_taxon($ancestor_id);
429             }
430              
431              
432             =head2 each_Descendent
433              
434             Title : each_Descendent
435             Usage : my @taxa = $db->each_Descendent($taxon);
436             Function: Get all the descendents of the supplied Taxon (but not their
437             descendents, ie. not a recursive fetchall).
438             Returns : Array of Bio::Taxon objects
439             Args : Bio::Taxon (that was retrieved from this database)
440              
441             =cut
442              
443             sub each_Descendent {
444 18     18 1 20 my ($self, $taxon) = @_;
445 18 50 33     75 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446 18 50 33     22 $self->throw("The supplied Taxon must belong to this database")
447             unless $taxon->db_handle && $taxon->db_handle eq $self;
448 18   33     23 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
449            
450 18         15 my @children;
451 18         14 while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
  38         102  
452 20   50     27 push @children, ($self->get_taxon($child_id) || next);
453             }
454            
455 18         42 return @children;
456             }
457              
458              
459             1;