File Coverage

Bio/DB/Taxonomy/list.pm
Criterion Covered Total %
statement 107 133 80.4
branch 42 64 65.6
condition 18 38 47.3
subroutine 9 10 90.0
pod 7 7 100.0
total 183 252 72.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Taxonomy::list
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Sendu Bala
7             #
8             # Copyright Sendu Bala
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14              
15             =head1 NAME
16              
17             Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18             that accepts lists of words to build a database
19              
20             =head1 SYNOPSIS
21              
22             use Bio::DB::Taxonomy;
23              
24             my $db = Bio::DB::Taxonomy->new( -source => 'list' );
25              
26             my @ranks = ('superkingdom', 'class', 'genus', 'species');
27             my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28             $db->add_lineage(-names => \@names, -ranks => \@ranks);
29              
30             @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31             $db->add_lineage(-names => \@names, -ranks => \@ranks);
32              
33             =head1 DESCRIPTION
34              
35             This is an implementation which uses supplied lists of words to create a
36             database from which you can extract Bio::Taxon objects.
37              
38             =head1 TODO
39              
40             It is possible this module could do something like store the data it builds
41             up to disc. Would that be useful?
42             At any rate, this is why the module is called 'list' and not 'in_memory' or
43             similar.
44              
45             =head1 FEEDBACK
46              
47             =head2 Mailing Lists
48              
49             User feedback is an integral part of the evolution of this and other
50             Bioperl modules. Send your comments and suggestions preferably to
51             the Bioperl mailing list. Your participation is much appreciated.
52              
53             bioperl-l@bioperl.org - General discussion
54             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55              
56             =head2 Support
57              
58             Please direct usage questions or support issues to the mailing list:
59              
60             I
61              
62             rather than to the module maintainer directly. Many experienced and
63             reponsive experts will be able look at the problem and quickly
64             address it. Please include a thorough description of the problem
65             with code and data examples if at all possible.
66              
67             =head2 Reporting Bugs
68              
69             Report bugs to the Bioperl bug tracking system to help us keep track
70             of the bugs and their resolution. Bug reports can be submitted via
71             the web:
72              
73             https://github.com/bioperl/bioperl-live/issues
74              
75             =head1 AUTHOR - Sendu Bala
76              
77             Email bix@sendu.me.uk
78              
79             =head1 APPENDIX
80              
81             The rest of the documentation details each of the object methods.
82             Internal methods are usually preceded with a _
83              
84             =cut
85              
86             # Let the code begin...
87              
88              
89             package Bio::DB::Taxonomy::list;
90              
91 34     34   191 use strict;
  34         54  
  34         912  
92 34     34   689 use Bio::Taxon;
  34         56  
  34         822  
93              
94 34     34   137 use base qw(Bio::DB::Taxonomy);
  34         61  
  34         37078  
95              
96             our $prefix = 'list';
97              
98              
99             =head2 new
100              
101             Title : new
102             Usage : my $obj = Bio::DB::Taxonomy::list->new();
103             Function: Builds a new Bio::DB::Taxonomy::list object
104             Returns : an instance of Bio::DB::Taxonomy::list
105             Args : optional, as per the add_lineage() method.
106              
107             =cut
108              
109             sub new {
110 254     254 1 927 my ($class, @args) = @_;
111 254         1114 my $self = $class->SUPER::new(@args);
112 254         767 my %args = @args;
113 254         629 delete $args{'-source'};
114              
115 254 100       1454 $self->add_lineage(%args) if %args;
116            
117 254         1244 return $self;
118             }
119              
120              
121             =head2 add_lineage
122              
123             Title : add_lineage
124             Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125             my @ranks = ('superkingdom', 'class', 'genus', 'species');
126             $db->add_lineage( -names => \@names, -ranks => \@ranks );
127             Function: Add a lineage to the database, where the lineage is described by
128             a list of scientific names in the order root->leaf. The rank of each
129             name can optionally be described by supplying an additional list
130             of rank names in the same order (eg. superkingdom->species).
131             Returns : 1 for success
132             Args : -names => [] : array ref of scientific names, REQUIRED
133             -ranks => [] : array ref of rank names, same order as above, OPTIONAL
134              
135             =cut
136              
137             sub add_lineage {
138 333     333 1 889 my ($self, @args) = @_;
139 333         1377 my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140 333 50 33     1943 $self->throw("-names must be supplied and its value must be an array reference")
141             unless $names && ref($names) eq 'ARRAY';
142              
143 333         785 my $names_idx = scalar @$names - 1;
144              
145 333 100       863 if ($ranks) {
146 39 50       54 $self->throw("-ranks must be an array reference")
147             unless ref($ranks) eq 'ARRAY';
148 39 50       55 $self->throw("The -names and -ranks lists must be of equal length")
149             unless $names_idx == scalar @$ranks - 1;
150             }
151            
152             # This is non-trivial because names are not guaranteed unique in a taxonomy,
153             # and neither are name&rank combinations. Furthermore, different name&rank
154             # combinations can actually refer to the same taxon, eg. when one time
155             # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
156             #
157             # name&rank&ancestor could well be unique (or good enough 99.9999% of the
158             # time), but we have the added complication that lineages could sometimes be
159             # supplied with differing numbers of taxa. Ideally we want to realise that
160             # the first of these two lineages shares all its nodes with the second:
161             # ('Mammalia', 'Homo', 'Homo sapiens')
162             # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
163             #
164             # Clearly with limited information we can't do a perfect job, but we can try
165             # and do a reasonable one. So, let's just do the trivial implementation now
166             # and see how bad it is! (assumes ranks are unique except for 'no rank')
167            
168 333         661 my $ancestors = $self->{ancestors};
169 333         534 my $node_data = $self->{node_data};
170 333         526 my $name_to_id = $self->{name_to_id};
171 333         532 my $children = $self->{children};
172              
173 333         593 my $my_ancestor_id = '';
174 333         509 my @node_ids;
175 333         1036 for my $i (0 .. $names_idx) {
176 3023         3592 my $name = $names->[$i];
177 3023         3089 my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
178              
179             # This is a new node with a new id if we haven't seen this name before.
180             # It's also always a new node if this is the first lineage going into
181             # the db.
182             #
183             # We need to handle, however, situations in the future where we try to
184             # merge in a new lineage but we have non-unique names in the lineage
185             # and possible missing classes in some lineages, e.g.
186             # '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187             # merged with
188             # '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189             # but still need the 'tree' to be correct
190              
191             # Look for a node that is consistent with this lineage
192 3023         2681 my $node_id;
193 3023         2876 SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
  3023         6502  
194              
195             # Taxa are the same if it they have the same ancestor or none
196 557   100     792 my $this_ancestor_id = $ancestors->{$same_id} || '';
197 557 100       639 if ($my_ancestor_id eq $this_ancestor_id) {
198 282         244 $node_id = $same_id;
199 282         271 last SAME_NAMED;
200             }
201            
202             # Compare children
203 275 100       302 next if $i >= $names_idx; # this taxon has no child
204 223         212 my $my_child_name = $names->[$i + 1];
205             #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206 223         183 for my $this_child_id (keys %{$children->{$same_id}}) {
  223         311  
207 234 100       372 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208 5 50       8 if ($my_ancestor_id) {
209 5         5 my @s_ancestors;
210 5         11 while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211 5 50       7 if ($my_ancestor_id eq $this_ancestor_id) {
212 0         0 $my_ancestor_id = $ancestors->{$same_id};
213 0         0 push @node_ids, @s_ancestors, $my_ancestor_id;
214 0         0 $node_id = $same_id;
215 0         0 last SAME_NAMED;
216             }
217 5         15 unshift @s_ancestors, $this_ancestor_id;
218             }
219             } else {
220             # This new lineage (@$names) doesn't start at the
221             # same root as the existing lineages. Assuming
222             # '$name' corresponds to node $same_id");
223 0         0 $node_id = $same_id;
224 0         0 last SAME_NAMED;
225             }
226             }
227             }
228             }
229            
230 3023 100       4677 if (not defined $node_id) {
231             # This is a new node. Add it to the database, using the prefix 'list'
232             # for its ID to distinguish it from the IDs from other taxonomies.
233 2741         3575 my $next_num = ++$self->{node_ids};
234 2741         3507 $node_id = $prefix.$next_num;
235 2741         2642 push @{$self->{name_to_id}->{$name}}, $node_id;
  2741         5853  
236 2741         7388 $self->{node_data}->{$node_id}->[0] = $name;
237             }
238              
239 3023 100 100     5037 if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240             # Save rank if node in database has no rank but the current node has one
241 114         127 $self->{node_data}->{$node_id}->[1] = $rank;
242             }
243              
244 3023 100       4106 if ($my_ancestor_id) {
245 2690 50 66     4847 if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246 0         0 $self->throw("The lineage '".join(', ', @$names)."' and a ".
247             "previously stored lineage share a node name but have ".
248             "different ancestries for that node. Can't cope!");
249             }
250 2690         5103 $self->{ancestors}->{$node_id} = $my_ancestor_id;
251             }
252            
253 3023         3101 $my_ancestor_id = $node_id;
254 3023         4688 push @node_ids, $node_id;
255             }
256            
257             # Go through the lineage in reverse so we can remember the children
258 333         1106 for (my $i = $names_idx - 1; $i >= 0; $i--) {
259 2690         6757 $self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
260             }
261 333         1527 return 1;
262             }
263              
264              
265             =head2 Bio::DB::Taxonomy Interface implementation
266              
267             =head2 get_num_taxa
268              
269             Title : get_num_taxa
270             Usage : my $num = $db->get_num_taxa();
271             Function: Get the number of taxa stored in the database.
272             Returns : A number
273             Args : None
274              
275             =cut
276              
277             sub get_num_taxa {
278 0     0 1 0 my ($self) = @_;
279 0   0     0 return $self->{node_ids} || 0;
280             }
281              
282              
283             =head2 get_taxon
284              
285             Title : get_taxon
286             Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287             Function: Get a Bio::Taxon object from the database.
288             Returns : Bio::Taxon object
289             Args : A single value which is the ID of the taxon to retrieve
290             OR named args, as follows:
291             -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292             'list' pre-fixed ids unique to the list database).
293             OR
294             -name => String (to query by a taxonomy name). A given taxon name
295             can match different taxonomy objects. When that is the
296             case, a warning is displayed and the first matching taxon
297             is reported. See get_taxonids() to get all matching taxon
298             IDs.
299             OR
300             -names => Array ref of lineage names, like in add_lineage(). To
301             overcome the limitations of -name, you can use -names to
302             provide the full lineage of the taxon you want and get a
303             unique, unambiguous taxon object.
304              
305             =cut
306              
307             sub get_taxon {
308 6914     6914 1 13900 my ($self, @args) = @_;
309              
310 6914         7934 my $taxonid;
311 6914 100       11093 if (scalar @args == 1) {
312             # Argument is a taxon ID
313 2156         2837 $taxonid = $args[0];
314             } else {
315             # Got named arguments
316 4758         6025 my ($name, $names);
317 4758         14156 ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318 4758 100       10764 if ($name) {
319 2         6 $names = [$name];
320             }
321 4758 100       7789 if ($names) {
322 2         3 $name = $names->[-1];
323              
324 2         9 my @taxonids = $self->get_taxonids($name);
325 2         4 $taxonid = $taxonids[0];
326              
327             # Use provided lineage to find correct ID amongst several matching IDs
328 2 50 33     6 if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329 0         0 for my $query_taxonid (@taxonids) {
330 0         0 my $matched = 1;
331 0         0 my $db_ancestor = $self->get_taxon($query_taxonid);
332 0         0 for (my $i = $#$names-1; $i >= 0; $i--) {
333 0         0 my $query_ancestor_name = $names->[$i];
334 0         0 $db_ancestor = $db_ancestor->ancestor;
335 0         0 my $db_ancestor_name = '';
336 0 0       0 if ($db_ancestor) {
337 0         0 $db_ancestor_name = $db_ancestor->node_name;
338             }
339 0 0       0 if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340 0         0 $matched = 0;
341 0         0 last; # done testing this taxonid
342             }
343             }
344 0 0       0 if ($matched == 1) {
345 0         0 @taxonids = [$query_taxonid];
346 0         0 $taxonid = $query_taxonid;
347 0         0 last; # done testing all taxonids
348             }
349             }
350             }
351              
352             # Warn if several taxon IDs matched
353 2 50       9 if (scalar @taxonids > 1) {
354 0 0       0 $self->warn("There were multiple ids (@taxonids) matching '$name',".
355             " using '$taxonid'") if scalar @taxonids > 1;
356             }
357              
358             }
359             }
360            
361             # Now that we have the taxon ID, retrieve the corresponding Taxon object
362 6914         7504 my $taxon;
363 6914         10702 my $node = $self->{node_data}->{$taxonid};
364 6914 100       11086 if ($node) {
365 6910         11054 my ($sci_name, $rank) = @$node;
366 6910         19204 $taxon = Bio::Taxon->new(
367             -name => $sci_name,
368             -object_id => $taxonid, # not an ncbi taxid, simply an object id
369             );
370              
371 6910 100       12583 if ($rank) {
372 53         100 $taxon->rank($rank);
373             }
374              
375             # we can't use -dbh or the db_handle() method ourselves or we'll go
376             # infinite on the merge attempt
377 6910         9792 $taxon->{'db_handle'} = $self;
378            
379 6910         15257 $self->_handle_internal_id($taxon, 1);
380             }
381              
382 6914         17557 return $taxon;
383             }
384              
385             *get_Taxonomy_Node = \&get_taxon;
386              
387              
388             =head2 get_taxonids
389              
390             Title : get_taxonids
391             Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392             Function: Searches for a taxonid (generated by the list module) based on a
393             query string. Note that multiple taxonids can match to the same
394             supplied name.
395             Returns : array of integer ids in list context, one of these in scalar context
396             Args : string representing taxon's name
397              
398             =cut
399              
400             sub get_taxonids {
401 259     259 1 1857 my ($self, $name) = @_;
402 259 100       741 return wantarray() ? @{$self->{name_to_id}->{$name} || []} : $self->{name_to_id}->{$name}->[0];
  259 50       1218  
403             }
404              
405             *get_taxonid = \&get_taxonids;
406              
407              
408             =head2 ancestor
409              
410             Title : ancestor
411             Usage : my $ancestor_taxon = $db->ancestor($taxon)
412             Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413             database.
414             Returns : Bio::Taxon
415             Args : Bio::Taxon (that was retrieved from this database)
416              
417             =cut
418              
419             sub ancestor {
420 4500     4500 1 6578 my ($self, $taxon) = @_;
421 4500 50       7419 $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
422 4500 50 33     17189 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423 4500 50 33     8067 $self->throw("The supplied Taxon must belong to this database")
424             unless $taxon->db_handle && $taxon->db_handle eq $self;
425 4500   33     8313 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
426            
427 4500   100     15262 my $ancestor_id = $self->{ancestors}->{$id} || return;
428 2131         3646 return $self->get_taxon($ancestor_id);
429             }
430              
431              
432             =head2 each_Descendent
433              
434             Title : each_Descendent
435             Usage : my @taxa = $db->each_Descendent($taxon);
436             Function: Get all the descendents of the supplied Taxon (but not their
437             descendents, ie. not a recursive fetchall).
438             Returns : Array of Bio::Taxon objects
439             Args : Bio::Taxon (that was retrieved from this database)
440              
441             =cut
442              
443             sub each_Descendent {
444 18     18 1 37 my ($self, $taxon) = @_;
445 18 50 33     83 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446 18 50 33     35 $self->throw("The supplied Taxon must belong to this database")
447             unless $taxon->db_handle && $taxon->db_handle eq $self;
448 18   33     34 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
449            
450 18         23 my @children;
451 18         21 while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
  38         121  
452 20   50     41 push @children, ($self->get_taxon($child_id) || next);
453             }
454            
455 18         54 return @children;
456             }
457              
458              
459             1;