File Coverage

Bio/DB/Taxonomy/list.pm
Criterion Covered Total %
statement 107 133 80.4
branch 42 64 65.6
condition 18 38 47.3
subroutine 9 10 90.0
pod 7 7 100.0
total 183 252 72.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Taxonomy::list
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Sendu Bala
7             #
8             # Copyright Sendu Bala
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14              
15             =head1 NAME
16              
17             Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18             that accepts lists of words to build a database
19              
20             =head1 SYNOPSIS
21              
22             use Bio::DB::Taxonomy;
23              
24             my $db = Bio::DB::Taxonomy->new( -source => 'list' );
25              
26             my @ranks = ('superkingdom', 'class', 'genus', 'species');
27             my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28             $db->add_lineage(-names => \@names, -ranks => \@ranks);
29              
30             @names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31             $db->add_lineage(-names => \@names, -ranks => \@ranks);
32              
33             =head1 DESCRIPTION
34              
35             This is an implementation which uses supplied lists of words to create a
36             database from which you can extract Bio::Taxon objects.
37              
38             =head1 TODO
39              
40             It is possible this module could do something like store the data it builds
41             up to disc. Would that be useful?
42             At any rate, this is why the module is called 'list' and not 'in_memory' or
43             similar.
44              
45             =head1 FEEDBACK
46              
47             =head2 Mailing Lists
48              
49             User feedback is an integral part of the evolution of this and other
50             Bioperl modules. Send your comments and suggestions preferably to
51             the Bioperl mailing list. Your participation is much appreciated.
52              
53             bioperl-l@bioperl.org - General discussion
54             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55              
56             =head2 Support
57              
58             Please direct usage questions or support issues to the mailing list:
59              
60             I
61              
62             rather than to the module maintainer directly. Many experienced and
63             reponsive experts will be able look at the problem and quickly
64             address it. Please include a thorough description of the problem
65             with code and data examples if at all possible.
66              
67             =head2 Reporting Bugs
68              
69             Report bugs to the Bioperl bug tracking system to help us keep track
70             of the bugs and their resolution. Bug reports can be submitted via
71             the web:
72              
73             https://github.com/bioperl/bioperl-live/issues
74              
75             =head1 AUTHOR - Sendu Bala
76              
77             Email bix@sendu.me.uk
78              
79             =head1 APPENDIX
80              
81             The rest of the documentation details each of the object methods.
82             Internal methods are usually preceded with a _
83              
84             =cut
85              
86             # Let the code begin...
87              
88              
89             package Bio::DB::Taxonomy::list;
90              
91 34     34   123 use strict;
  34         40  
  34         836  
92 34     34   709 use Bio::Taxon;
  34         35  
  34         703  
93              
94 34     34   97 use base qw(Bio::DB::Taxonomy);
  34         34  
  34         34051  
95              
96             our $prefix = 'list';
97              
98              
99             =head2 new
100              
101             Title : new
102             Usage : my $obj = Bio::DB::Taxonomy::list->new();
103             Function: Builds a new Bio::DB::Taxonomy::list object
104             Returns : an instance of Bio::DB::Taxonomy::list
105             Args : optional, as per the add_lineage() method.
106              
107             =cut
108              
109             sub new {
110 252     252 1 524 my ($class, @args) = @_;
111 252         1197 my $self = $class->SUPER::new(@args);
112 252         587 my %args = @args;
113 252         355 delete $args{'-source'};
114              
115 252 100       1107 $self->add_lineage(%args) if %args;
116            
117 252         1085 return $self;
118             }
119              
120              
121             =head2 add_lineage
122              
123             Title : add_lineage
124             Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125             my @ranks = ('superkingdom', 'class', 'genus', 'species');
126             $db->add_lineage( -names => \@names, -ranks => \@ranks );
127             Function: Add a lineage to the database, where the lineage is described by
128             a list of scientific names in the order root->leaf. The rank of each
129             name can optionally be described by supplying an additional list
130             of rank names in the same order (eg. superkingdom->species).
131             Returns : 1 for success
132             Args : -names => [] : array ref of scientific names, REQUIRED
133             -ranks => [] : array ref of rank names, same order as above, OPTIONAL
134              
135             =cut
136              
137             sub add_lineage {
138 331     331 1 543 my ($self, @args) = @_;
139 331         1171 my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140 331 50 33     1691 $self->throw("-names must be supplied and its value must be an array reference")
141             unless $names && ref($names) eq 'ARRAY';
142              
143 331         549 my $names_idx = scalar @$names - 1;
144              
145 331 100       610 if ($ranks) {
146 39 50       47 $self->throw("-ranks must be an array reference")
147             unless ref($ranks) eq 'ARRAY';
148 39 50       47 $self->throw("The -names and -ranks lists must be of equal length")
149             unless $names_idx == scalar @$ranks - 1;
150             }
151            
152             # This is non-trivial because names are not guaranteed unique in a taxonomy,
153             # and neither are name&rank combinations. Furthermore, different name&rank
154             # combinations can actually refer to the same taxon, eg. when one time
155             # 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
156             #
157             # name&rank&ancestor could well be unique (or good enough 99.9999% of the
158             # time), but we have the added complication that lineages could sometimes be
159             # supplied with differing numbers of taxa. Ideally we want to realise that
160             # the first of these two lineages shares all its nodes with the second:
161             # ('Mammalia', 'Homo', 'Homo sapiens')
162             # ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
163             #
164             # Clearly with limited information we can't do a perfect job, but we can try
165             # and do a reasonable one. So, let's just do the trivial implementation now
166             # and see how bad it is! (assumes ranks are unique except for 'no rank')
167            
168 331         459 my $ancestors = $self->{ancestors};
169 331         629 my $node_data = $self->{node_data};
170 331         380 my $name_to_id = $self->{name_to_id};
171 331         366 my $children = $self->{children};
172              
173 331         418 my $my_ancestor_id = '';
174 331         292 my @node_ids;
175 331         771 for my $i (0 .. $names_idx) {
176 3010         2545 my $name = $names->[$i];
177 3010         2122 my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
178              
179             # This is a new node with a new id if we haven't seen this name before.
180             # It's also always a new node if this is the first lineage going into
181             # the db.
182             #
183             # We need to handle, however, situations in the future where we try to
184             # merge in a new lineage but we have non-unique names in the lineage
185             # and possible missing classes in some lineages, e.g.
186             # '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187             # merged with
188             # '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189             # but still need the 'tree' to be correct
190              
191             # Look for a node that is consistent with this lineage
192 3010         1849 my $node_id;
193 3010         1955 SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
  3010         6367  
194              
195             # Taxa are the same if it they have the same ancestor or none
196 557   100     797 my $this_ancestor_id = $ancestors->{$same_id} || '';
197 557 100       611 if ($my_ancestor_id eq $this_ancestor_id) {
198 282         162 $node_id = $same_id;
199 282         235 last SAME_NAMED;
200             }
201            
202             # Compare children
203 275 100       300 next if $i >= $names_idx; # this taxon has no child
204 223         149 my $my_child_name = $names->[$i + 1];
205             #while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206 223         113 for my $this_child_id (keys %{$children->{$same_id}}) {
  223         347  
207 234 100       363 if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208 5 50       6 if ($my_ancestor_id) {
209 5         5 my @s_ancestors;
210 5         7 while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211 5 50       7 if ($my_ancestor_id eq $this_ancestor_id) {
212 0         0 $my_ancestor_id = $ancestors->{$same_id};
213 0         0 push @node_ids, @s_ancestors, $my_ancestor_id;
214 0         0 $node_id = $same_id;
215 0         0 last SAME_NAMED;
216             }
217 5         14 unshift @s_ancestors, $this_ancestor_id;
218             }
219             } else {
220             # This new lineage (@$names) doesn't start at the
221             # same root as the existing lineages. Assuming
222             # '$name' corresponds to node $same_id");
223 0         0 $node_id = $same_id;
224 0         0 last SAME_NAMED;
225             }
226             }
227             }
228             }
229            
230 3010 100       4267 if (not defined $node_id) {
231             # This is a new node. Add it to the database, using the prefix 'list'
232             # for its ID to distinguish it from the IDs from other taxonomies.
233 2728         2397 my $next_num = ++$self->{node_ids};
234 2728         2371 $node_id = $prefix.$next_num;
235 2728         1670 push @{$self->{name_to_id}->{$name}}, $node_id;
  2728         4855  
236 2728         4634 $self->{node_data}->{$node_id}->[0] = $name;
237             }
238              
239 3010 100 100     4782 if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240             # Save rank if node in database has no rank but the current node has one
241 114         99 $self->{node_data}->{$node_id}->[1] = $rank;
242             }
243              
244 3010 100       3477 if ($my_ancestor_id) {
245 2679 50 66     4510 if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246 0         0 $self->throw("The lineage '".join(', ', @$names)."' and a ".
247             "previously stored lineage share a node name but have ".
248             "different ancestries for that node. Can't cope!");
249             }
250 2679         3126 $self->{ancestors}->{$node_id} = $my_ancestor_id;
251             }
252            
253 3010         2096 $my_ancestor_id = $node_id;
254 3010         3573 push @node_ids, $node_id;
255             }
256            
257             # Go through the lineage in reverse so we can remember the children
258 331         976 for (my $i = $names_idx - 1; $i >= 0; $i--) {
259 2679         5875 $self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
260             }
261 331         1365 return 1;
262             }
263              
264              
265             =head2 Bio::DB::Taxonomy Interface implementation
266              
267             =head2 get_num_taxa
268              
269             Title : get_num_taxa
270             Usage : my $num = $db->get_num_taxa();
271             Function: Get the number of taxa stored in the database.
272             Returns : A number
273             Args : None
274              
275             =cut
276              
277             sub get_num_taxa {
278 0     0 1 0 my ($self) = @_;
279 0   0     0 return $self->{node_ids} || 0;
280             }
281              
282              
283             =head2 get_taxon
284              
285             Title : get_taxon
286             Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287             Function: Get a Bio::Taxon object from the database.
288             Returns : Bio::Taxon object
289             Args : A single value which is the ID of the taxon to retrieve
290             OR named args, as follows:
291             -taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292             'list' pre-fixed ids unique to the list database).
293             OR
294             -name => String (to query by a taxonomy name). A given taxon name
295             can match different taxonomy objects. When that is the
296             case, a warning is displayed and the first matching taxon
297             is reported. See get_taxonids() to get all matching taxon
298             IDs.
299             OR
300             -names => Array ref of lineage names, like in add_lineage(). To
301             overcome the limitations of -name, you can use -names to
302             provide the full lineage of the taxon you want and get a
303             unique, unambiguous taxon object.
304              
305             =cut
306              
307             sub get_taxon {
308 6877     6877 1 10434 my ($self, @args) = @_;
309              
310 6877         5107 my $taxonid;
311 6877 100       10567 if (scalar @args == 1) {
312             # Argument is a taxon ID
313 2145         2142 $taxonid = $args[0];
314             } else {
315             # Got named arguments
316 4732         3570 my ($name, $names);
317 4732         12648 ($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318 4732 100       9658 if ($name) {
319 2         3 $names = [$name];
320             }
321 4732 100       7016 if ($names) {
322 2         3 $name = $names->[-1];
323              
324 2         6 my @taxonids = $self->get_taxonids($name);
325 2         2 $taxonid = $taxonids[0];
326              
327             # Use provided lineage to find correct ID amongst several matching IDs
328 2 50 33     7 if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329 0         0 for my $query_taxonid (@taxonids) {
330 0         0 my $matched = 1;
331 0         0 my $db_ancestor = $self->get_taxon($query_taxonid);
332 0         0 for (my $i = $#$names-1; $i >= 0; $i--) {
333 0         0 my $query_ancestor_name = $names->[$i];
334 0         0 $db_ancestor = $db_ancestor->ancestor;
335 0         0 my $db_ancestor_name = '';
336 0 0       0 if ($db_ancestor) {
337 0         0 $db_ancestor_name = $db_ancestor->node_name;
338             }
339 0 0       0 if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340 0         0 $matched = 0;
341 0         0 last; # done testing this taxonid
342             }
343             }
344 0 0       0 if ($matched == 1) {
345 0         0 @taxonids = [$query_taxonid];
346 0         0 $taxonid = $query_taxonid;
347 0         0 last; # done testing all taxonids
348             }
349             }
350             }
351              
352             # Warn if several taxon IDs matched
353 2 50       7 if (scalar @taxonids > 1) {
354 0 0       0 $self->warn("There were multiple ids (@taxonids) matching '$name',".
355             " using '$taxonid'") if scalar @taxonids > 1;
356             }
357              
358             }
359             }
360            
361             # Now that we have the taxon ID, retrieve the corresponding Taxon object
362 6877         5522 my $taxon;
363 6877         8662 my $node = $self->{node_data}->{$taxonid};
364 6877 100       9796 if ($node) {
365 6873         7541 my ($sci_name, $rank) = @$node;
366 6873         18886 $taxon = Bio::Taxon->new(
367             -name => $sci_name,
368             -object_id => $taxonid, # not an ncbi taxid, simply an object id
369             );
370              
371 6873 100       10748 if ($rank) {
372 53         93 $taxon->rank($rank);
373             }
374              
375             # we can't use -dbh or the db_handle() method ourselves or we'll go
376             # infinite on the merge attempt
377 6873         7678 $taxon->{'db_handle'} = $self;
378            
379 6873         13874 $self->_handle_internal_id($taxon, 1);
380             }
381              
382 6877         14629 return $taxon;
383             }
384              
385             *get_Taxonomy_Node = \&get_taxon;
386              
387              
388             =head2 get_taxonids
389              
390             Title : get_taxonids
391             Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392             Function: Searches for a taxonid (generated by the list module) based on a
393             query string. Note that multiple taxonids can match to the same
394             supplied name.
395             Returns : array of integer ids in list context, one of these in scalar context
396             Args : string representing taxon's name
397              
398             =cut
399              
400             sub get_taxonids {
401 257     257 1 1286 my ($self, $name) = @_;
402 257 100       609 return wantarray() ? @{$self->{name_to_id}->{$name} || []} : $self->{name_to_id}->{$name}->[0];
  257 50       1017  
403             }
404              
405             *get_taxonid = \&get_taxonids;
406              
407              
408             =head2 ancestor
409              
410             Title : ancestor
411             Usage : my $ancestor_taxon = $db->ancestor($taxon)
412             Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413             database.
414             Returns : Bio::Taxon
415             Args : Bio::Taxon (that was retrieved from this database)
416              
417             =cut
418              
419             sub ancestor {
420 4476     4476 1 4261 my ($self, $taxon) = @_;
421 4476 50       6311 $taxon || return; # for bug 2092, or something similar to it at least: shouldn't need this!
422 4476 50 33     18255 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423 4476 50 33     6102 $self->throw("The supplied Taxon must belong to this database")
424             unless $taxon->db_handle && $taxon->db_handle eq $self;
425 4476   33     6478 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
426            
427 4476   100     13895 my $ancestor_id = $self->{ancestors}->{$id} || return;
428 2120         3110 return $self->get_taxon($ancestor_id);
429             }
430              
431              
432             =head2 each_Descendent
433              
434             Title : each_Descendent
435             Usage : my @taxa = $db->each_Descendent($taxon);
436             Function: Get all the descendents of the supplied Taxon (but not their
437             descendents, ie. not a recursive fetchall).
438             Returns : Array of Bio::Taxon objects
439             Args : Bio::Taxon (that was retrieved from this database)
440              
441             =cut
442              
443             sub each_Descendent {
444 18     18 1 26 my ($self, $taxon) = @_;
445 18 50 33     79 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446 18 50 33     26 $self->throw("The supplied Taxon must belong to this database")
447             unless $taxon->db_handle && $taxon->db_handle eq $self;
448 18   33     25 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
449            
450 18         16 my @children;
451 18         16 while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
  38         110  
452 20   50     30 push @children, ($self->get_taxon($child_id) || next);
453             }
454            
455 18         50 return @children;
456             }
457              
458              
459             1;