File Coverage

Bio/DB/Taxonomy/list.pm

Criterion	Covered	Total	%
statement	107	133	80.4
branch	42	64	65.6
condition	18	38	47.3
subroutine	9	10	90.0
pod	7	7	100.0
total	183	252	72.6

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Taxonomy::list
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Sendu Bala
7							#
8							# Copyright Sendu Bala
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14
15							=head1 NAME
16
17							Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18							that accepts lists of words to build a database
19
20							=head1 SYNOPSIS
21
22							use Bio::DB::Taxonomy;
23
24							my $db = Bio::DB::Taxonomy->new( -source => 'list' );
25
26							my @ranks = ('superkingdom', 'class', 'genus', 'species');
27							my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28							$db->add_lineage(-names => \@names, -ranks => \@ranks);
29
30							@names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31							$db->add_lineage(-names => \@names, -ranks => \@ranks);
32
33							=head1 DESCRIPTION
34
35							This is an implementation which uses supplied lists of words to create a
36							database from which you can extract Bio::Taxon objects.
37
38							=head1 TODO
39
40							It is possible this module could do something like store the data it builds
41							up to disc. Would that be useful?
42							At any rate, this is why the module is called 'list' and not 'in_memory' or
43							similar.
44
45							=head1 FEEDBACK
46
47							=head2 Mailing Lists
48
49							User feedback is an integral part of the evolution of this and other
50							Bioperl modules. Send your comments and suggestions preferably to
51							the Bioperl mailing list. Your participation is much appreciated.
52
53							bioperl-l@bioperl.org - General discussion
54							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55
56							=head2 Support
57
58							Please direct usage questions or support issues to the mailing list:
59
60							I
61
62							rather than to the module maintainer directly. Many experienced and
63							reponsive experts will be able look at the problem and quickly
64							address it. Please include a thorough description of the problem
65							with code and data examples if at all possible.
66
67							=head2 Reporting Bugs
68
69							Report bugs to the Bioperl bug tracking system to help us keep track
70							of the bugs and their resolution. Bug reports can be submitted via
71							the web:
72
73							https://github.com/bioperl/bioperl-live/issues
74
75							=head1 AUTHOR - Sendu Bala
76
77							Email bix@sendu.me.uk
78
79							=head1 APPENDIX
80
81							The rest of the documentation details each of the object methods.
82							Internal methods are usually preceded with a _
83
84							=cut
85
86							# Let the code begin...
87
88
89							package Bio::DB::Taxonomy::list;
90
91	34			34		123	use strict;
	34					40
	34					836
92	34			34		709	use Bio::Taxon;
	34					35
	34					703
93
94	34			34		97	use base qw(Bio::DB::Taxonomy);
	34					34
	34					34051
95
96							our $prefix = 'list';
97
98
99							=head2 new
100
101							Title : new
102							Usage : my $obj = Bio::DB::Taxonomy::list->new();
103							Function: Builds a new Bio::DB::Taxonomy::list object
104							Returns : an instance of Bio::DB::Taxonomy::list
105							Args : optional, as per the add_lineage() method.
106
107							=cut
108
109							sub new {
110	252			252	1	524	my ($class, @args) = @_;
111	252					1197	my $self = $class->SUPER::new(@args);
112	252					587	my %args = @args;
113	252					355	delete $args{'-source'};
114
115	252	100				1107	$self->add_lineage(%args) if %args;
116
117	252					1085	return $self;
118							}
119
120
121							=head2 add_lineage
122
123							Title : add_lineage
124							Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125							my @ranks = ('superkingdom', 'class', 'genus', 'species');
126							$db->add_lineage( -names => \@names, -ranks => \@ranks );
127							Function: Add a lineage to the database, where the lineage is described by
128							a list of scientific names in the order root->leaf. The rank of each
129							name can optionally be described by supplying an additional list
130							of rank names in the same order (eg. superkingdom->species).
131							Returns : 1 for success
132							Args : -names => [] : array ref of scientific names, REQUIRED
133							-ranks => [] : array ref of rank names, same order as above, OPTIONAL
134
135							=cut
136
137							sub add_lineage {
138	331			331	1	543	my ($self, @args) = @_;
139	331					1171	my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140	331	50	33			1691	$self->throw("-names must be supplied and its value must be an array reference")
141							unless $names && ref($names) eq 'ARRAY';
142
143	331					549	my $names_idx = scalar @$names - 1;
144
145	331	100				610	if ($ranks) {
146	39	50				47	$self->throw("-ranks must be an array reference")
147							unless ref($ranks) eq 'ARRAY';
148	39	50				47	$self->throw("The -names and -ranks lists must be of equal length")
149							unless $names_idx == scalar @$ranks - 1;
150							}
151
152							# This is non-trivial because names are not guaranteed unique in a taxonomy,
153							# and neither are name&rank combinations. Furthermore, different name&rank
154							# combinations can actually refer to the same taxon, eg. when one time
155							# 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
156							#
157							# name&rank&ancestor could well be unique (or good enough 99.9999% of the
158							# time), but we have the added complication that lineages could sometimes be
159							# supplied with differing numbers of taxa. Ideally we want to realise that
160							# the first of these two lineages shares all its nodes with the second:
161							# ('Mammalia', 'Homo', 'Homo sapiens')
162							# ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
163							#
164							# Clearly with limited information we can't do a perfect job, but we can try
165							# and do a reasonable one. So, let's just do the trivial implementation now
166							# and see how bad it is! (assumes ranks are unique except for 'no rank')
167
168	331					459	my $ancestors = $self->{ancestors};
169	331					629	my $node_data = $self->{node_data};
170	331					380	my $name_to_id = $self->{name_to_id};
171	331					366	my $children = $self->{children};
172
173	331					418	my $my_ancestor_id = '';
174	331					292	my @node_ids;
175	331					771	for my $i (0 .. $names_idx) {
176	3010					2545	my $name = $names->[$i];
177	3010					2122	my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
178
179							# This is a new node with a new id if we haven't seen this name before.
180							# It's also always a new node if this is the first lineage going into
181							# the db.
182							#
183							# We need to handle, however, situations in the future where we try to
184							# merge in a new lineage but we have non-unique names in the lineage
185							# and possible missing classes in some lineages, e.g.
186							# '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187							# merged with
188							# '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189							# but still need the 'tree' to be correct
190
191							# Look for a node that is consistent with this lineage
192	3010					1849	my $node_id;
193	3010					1955	SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
	3010					6367
194
195							# Taxa are the same if it they have the same ancestor or none
196	557		100			797	my $this_ancestor_id = $ancestors->{$same_id} \|\| '';
197	557	100				611	if ($my_ancestor_id eq $this_ancestor_id) {
198	282					162	$node_id = $same_id;
199	282					235	last SAME_NAMED;
200							}
201
202							# Compare children
203	275	100				300	next if $i >= $names_idx; # this taxon has no child
204	223					149	my $my_child_name = $names->[$i + 1];
205							#while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206	223					113	for my $this_child_id (keys %{$children->{$same_id}}) {
	223					347
207	234	100				363	if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208	5	50				6	if ($my_ancestor_id) {
209	5					5	my @s_ancestors;
210	5					7	while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211	5	50				7	if ($my_ancestor_id eq $this_ancestor_id) {
212	0					0	$my_ancestor_id = $ancestors->{$same_id};
213	0					0	push @node_ids, @s_ancestors, $my_ancestor_id;
214	0					0	$node_id = $same_id;
215	0					0	last SAME_NAMED;
216							}
217	5					14	unshift @s_ancestors, $this_ancestor_id;
218							}
219							} else {
220							# This new lineage (@$names) doesn't start at the
221							# same root as the existing lineages. Assuming
222							# '$name' corresponds to node $same_id");
223	0					0	$node_id = $same_id;
224	0					0	last SAME_NAMED;
225							}
226							}
227							}
228							}
229
230	3010	100				4267	if (not defined $node_id) {
231							# This is a new node. Add it to the database, using the prefix 'list'
232							# for its ID to distinguish it from the IDs from other taxonomies.
233	2728					2397	my $next_num = ++$self->{node_ids};
234	2728					2371	$node_id = $prefix.$next_num;
235	2728					1670	push @{$self->{name_to_id}->{$name}}, $node_id;
	2728					4855
236	2728					4634	$self->{node_data}->{$node_id}->[0] = $name;
237							}
238
239	3010	100	100			4782	if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240							# Save rank if node in database has no rank but the current node has one
241	114					99	$self->{node_data}->{$node_id}->[1] = $rank;
242							}
243
244	3010	100				3477	if ($my_ancestor_id) {
245	2679	50	66			4510	if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246	0					0	$self->throw("The lineage '".join(', ', @$names)."' and a ".
247							"previously stored lineage share a node name but have ".
248							"different ancestries for that node. Can't cope!");
249							}
250	2679					3126	$self->{ancestors}->{$node_id} = $my_ancestor_id;
251							}
252
253	3010					2096	$my_ancestor_id = $node_id;
254	3010					3573	push @node_ids, $node_id;
255							}
256
257							# Go through the lineage in reverse so we can remember the children
258	331					976	for (my $i = $names_idx - 1; $i >= 0; $i--) {
259	2679					5875	$self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
260							}
261	331					1365	return 1;
262							}
263
264
265							=head2 Bio::DB::Taxonomy Interface implementation
266
267							=head2 get_num_taxa
268
269							Title : get_num_taxa
270							Usage : my $num = $db->get_num_taxa();
271							Function: Get the number of taxa stored in the database.
272							Returns : A number
273							Args : None
274
275							=cut
276
277							sub get_num_taxa {
278	0			0	1	0	my ($self) = @_;
279	0		0			0	return $self->{node_ids} \|\| 0;
280							}
281
282
283							=head2 get_taxon
284
285							Title : get_taxon
286							Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287							Function: Get a Bio::Taxon object from the database.
288							Returns : Bio::Taxon object
289							Args : A single value which is the ID of the taxon to retrieve
290							OR named args, as follows:
291							-taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292							'list' pre-fixed ids unique to the list database).
293							OR
294							-name => String (to query by a taxonomy name). A given taxon name
295							can match different taxonomy objects. When that is the
296							case, a warning is displayed and the first matching taxon
297							is reported. See get_taxonids() to get all matching taxon
298							IDs.
299							OR
300							-names => Array ref of lineage names, like in add_lineage(). To
301							overcome the limitations of -name, you can use -names to
302							provide the full lineage of the taxon you want and get a
303							unique, unambiguous taxon object.
304
305							=cut
306
307							sub get_taxon {
308	6877			6877	1	10434	my ($self, @args) = @_;
309
310	6877					5107	my $taxonid;
311	6877	100				10567	if (scalar @args == 1) {
312							# Argument is a taxon ID
313	2145					2142	$taxonid = $args[0];
314							} else {
315							# Got named arguments
316	4732					3570	my ($name, $names);
317	4732					12648	($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318	4732	100				9658	if ($name) {
319	2					3	$names = [$name];
320							}
321	4732	100				7016	if ($names) {
322	2					3	$name = $names->[-1];
323
324	2					6	my @taxonids = $self->get_taxonids($name);
325	2					2	$taxonid = $taxonids[0];
326
327							# Use provided lineage to find correct ID amongst several matching IDs
328	2	50	33			7	if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329	0					0	for my $query_taxonid (@taxonids) {
330	0					0	my $matched = 1;
331	0					0	my $db_ancestor = $self->get_taxon($query_taxonid);
332	0					0	for (my $i = $#$names-1; $i >= 0; $i--) {
333	0					0	my $query_ancestor_name = $names->[$i];
334	0					0	$db_ancestor = $db_ancestor->ancestor;
335	0					0	my $db_ancestor_name = '';
336	0	0				0	if ($db_ancestor) {
337	0					0	$db_ancestor_name = $db_ancestor->node_name;
338							}
339	0	0				0	if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340	0					0	$matched = 0;
341	0					0	last; # done testing this taxonid
342							}
343							}
344	0	0				0	if ($matched == 1) {
345	0					0	@taxonids = [$query_taxonid];
346	0					0	$taxonid = $query_taxonid;
347	0					0	last; # done testing all taxonids
348							}
349							}
350							}
351
352							# Warn if several taxon IDs matched
353	2	50				7	if (scalar @taxonids > 1) {
354	0	0				0	$self->warn("There were multiple ids (@taxonids) matching '$name',".
355							" using '$taxonid'") if scalar @taxonids > 1;
356							}
357
358							}
359							}
360
361							# Now that we have the taxon ID, retrieve the corresponding Taxon object
362	6877					5522	my $taxon;
363	6877					8662	my $node = $self->{node_data}->{$taxonid};
364	6877	100				9796	if ($node) {
365	6873					7541	my ($sci_name, $rank) = @$node;
366	6873					18886	$taxon = Bio::Taxon->new(
367							-name => $sci_name,
368							-object_id => $taxonid, # not an ncbi taxid, simply an object id
369							);
370
371	6873	100				10748	if ($rank) {
372	53					93	$taxon->rank($rank);
373							}
374
375							# we can't use -dbh or the db_handle() method ourselves or we'll go
376							# infinite on the merge attempt
377	6873					7678	$taxon->{'db_handle'} = $self;
378
379	6873					13874	$self->_handle_internal_id($taxon, 1);
380							}
381
382	6877					14629	return $taxon;
383							}
384
385							*get_Taxonomy_Node = \&get_taxon;
386
387
388							=head2 get_taxonids
389
390							Title : get_taxonids
391							Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392							Function: Searches for a taxonid (generated by the list module) based on a
393							query string. Note that multiple taxonids can match to the same
394							supplied name.
395							Returns : array of integer ids in list context, one of these in scalar context
396							Args : string representing taxon's name
397
398							=cut
399
400							sub get_taxonids {
401	257			257	1	1286	my ($self, $name) = @_;
402	257	100				609	return wantarray() ? @{$self->{name_to_id}->{$name} \|\| []} : $self->{name_to_id}->{$name}->[0];
	257	50				1017
403							}
404
405							*get_taxonid = \&get_taxonids;
406
407
408							=head2 ancestor
409
410							Title : ancestor
411							Usage : my $ancestor_taxon = $db->ancestor($taxon)
412							Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413							database.
414							Returns : Bio::Taxon
415							Args : Bio::Taxon (that was retrieved from this database)
416
417							=cut
418
419							sub ancestor {
420	4476			4476	1	4261	my ($self, $taxon) = @_;
421	4476	50				6311	$taxon \|\| return; # for bug 2092, or something similar to it at least: shouldn't need this!
422	4476	50	33			18255	$self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423	4476	50	33			6102	$self->throw("The supplied Taxon must belong to this database")
424							unless $taxon->db_handle && $taxon->db_handle eq $self;
425	4476		33			6478	my $id = $taxon->id \|\| $self->throw("The supplied Taxon is missing its id!");
426
427	4476		100			13895	my $ancestor_id = $self->{ancestors}->{$id} \|\| return;
428	2120					3110	return $self->get_taxon($ancestor_id);
429							}
430
431
432							=head2 each_Descendent
433
434							Title : each_Descendent
435							Usage : my @taxa = $db->each_Descendent($taxon);
436							Function: Get all the descendents of the supplied Taxon (but not their
437							descendents, ie. not a recursive fetchall).
438							Returns : Array of Bio::Taxon objects
439							Args : Bio::Taxon (that was retrieved from this database)
440
441							=cut
442
443							sub each_Descendent {
444	18			18	1	26	my ($self, $taxon) = @_;
445	18	50	33			79	$self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446	18	50	33			26	$self->throw("The supplied Taxon must belong to this database")
447							unless $taxon->db_handle && $taxon->db_handle eq $self;
448	18		33			25	my $id = $taxon->id \|\| $self->throw("The supplied Taxon is missing its id!");
449
450	18					16	my @children;
451	18					16	while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
	38					110
452	20		50			30	push @children, ($self->get_taxon($child_id) \|\| next);
453							}
454
455	18					50	return @children;
456							}
457
458
459							1;