File Coverage

blib/lib/Bio/LITE/Taxonomy/NCBI/Gi2taxid.pm

Criterion	Covered	Total	%
statement	92	100	92.0
branch	24	38	63.1
condition	8	16	50.0
subroutine	13	13	100.0
pod	3	3	100.0
total	140	170	82.3

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::LITE::Taxonomy::NCBI::Gi2taxid;
2
3							=head1 NAME
4
5							Bio::LITE::Taxonomy::NCBI::Gi2taxid - Mappings of NCBI GI's to Taxids fast and with very low memory footprint.
6
7							=head1 SYNOPSIS
8
9							Creation of a new Taxid to GI dictionary (binary mapping file):
10
11							use Bio::LITE::Taxonomy::NCBI::Gi2taxid qw/new_dict/;
12
13							new_dict (in => "gi_taxid_prot.dmp",
14							out => "gi_taxid_prot.bin");
15
16							Usage of the dictionary:
17
18							use Bio::LITE::Taxonomy::NCBI::Gi2taxid;
19
20							my $dict = Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(dict=>"gi_taxid_prot.bin");
21							my $taxid = $dict->get_taxid(12553);
22
23							=head1 DESCRIPTION
24
25							The NCBI site offers a file to map gene and protein sequences (GIs) with their corresponding taxon of origin (Taxids). If you want to use this information inside a Perl script you will find that (given the high amount of sequences available) it is fairly inefficient to store this information in, for example, a regular hash. Only for creating such a hash you will need more than 10 GBs of system memory.
26
27							This is a very simple module that has been designed to efficiently map NCBI GIs to Taxids with speed as the primary goal. It is designed to retrieve taxids from GIs very fast and with low memory usage. It is even faster than using a SQL database to retrieve the mappings or using a local DBHash.
28
29							To achieve this, it uses a binary index that can be created with the function C. This index has to be created one time for each mapping file.
30
31							The original mapping files can be downloaded from the NCBI site at the following address: L.
32
33							=head1 FUNCTIONS
34
35							=head2 C
36
37							This function creates a new binary dictionary from the NCBI mapping file. The file should be uncompressed before being passed to the script. The function accepts the following parameters:
38
39							NOTE From version 0.05, the lib uses a more compacted memory file. This means that binary files created with earlier versions will not work with this one and vice-versa. You need to create the new binary db with this version.
40
41							=over 4
42
43							=item in
44
45							This is the uncompressed mapping file from the NCBI. The function accepts a filename or a filehandle
46
47							=item out
48
49							Optional. Where the binary dictionary is going to be printed. The function accepts a filename or a filehandle (that should be opened with writing permissions). If absent STDOUT will be assumed.
50
51							=item chunk_size
52
53							Optional. While bin conversion, the lib stores chunks of data in memory to speed up the conversion. This number specifies the size of the chunks. By default 30Mb is used. The whole chunk is stored in a Perl scalar so be careful not to overflow the scalar capacity.
54
55							=back
56
57							=head1 CONSTRUCTOR
58
59							=head2 C
60
61							Once the binary dictionary is created it can be used as an object using this constructor. It accepts the following parameters
62
63							=over 4
64
65							=item dict
66
67							This is the binary dictionary obtained with the C function. The name of the file or a filehandle is accepted.
68
69							=item save_mem
70
71							Optional. Use this option to avoid to load the binary dictionary into memory. This will save almost 1GB of system memory but looking up for Taxids will be ~20% slower. This option is I by default.
72
73							=back
74
75							=head1 METHODS
76
77							=head2 C
78
79							This method receives a GI and returns the corresponding Taxid.
80
81							=head1 SEE ALSO
82
83							L
84							L
85
86							=head1 AUTHOR
87
88							Miguel Pignatelli
89
90							Any comments should be addressed to emepyc@gmail.com
91
92							=head1 LICENSE
93
94							Copyright 2013 Miguel Pignatelli, all rights reserved.
95
96							This library is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
97
98
99							=cut
100
101	3			3		46309	use strict;
	3					4
	3					98
102	3			3		11	use warnings;
	3					5
	3					82
103	3			3		12	use Carp qw/croak/;
	3					6
	3					160
104	3			3		1450	use File::Tail;
	3					64693
	3					190
105	3			3		1431	use POSIX;
	3					14832
	3					12
106
107	3			3		6967	use Exporter;
	3					6
	3					104
108	3			3		12	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	3					5
	3					1215
109							@ISA = qw(Exporter);
110
111							@EXPORT = (); # Only qualified exports are allowed
112							@EXPORT_OK = qw(new_dict);
113							$VERSION = '0.12';
114
115							sub new
116							{
117	3			3	1	1065	my ($class, %args) = @_;
118	3					5	my %opts;
119
120	3	50				8	$args{'dict'} or croak "Need the dictionary file";
121	3	50				41	croak "\nERROR\n$args{dict}: File not found\n" unless -e $args{dict};
122
123	3		50			13	my $save_mem = $args{'save_mem'} \|\| 0;
124	3					2	my $dictfh;
125	3	50	33			23	if ((UNIVERSAL::isa($args{dict}, 'GLOB')) or (ref \$args{dict} eq 'GLOB')) {
126	0					0	$dictfh = $args{dict}; # TO DO: Test flags, the filehandle must be readable
127							} else {
128	3	100				214	croak "\nERROR\nThe file containing the gi <-> taxid correspondences must be converted to binary format. See the documentation of this module for more details." unless (-B $args{dict});
129	2	50				44	open $dictfh, "<:raw", $args{dict} or croak "$!: $args{dict}";
130							}
131
132	2					5	$opts{fh} = $dictfh;
133	2					3	$opts{save_mem} = $save_mem;
134	2					4	my $self = bless \%opts;
135
136	2					4	$self -> _build_dict();
137	2					6	return $self;
138							}
139
140							sub _build_dict
141							{
142	2			2		3	my ($self) = @_;
143	2					2	my $data;
144	2	50				12	$self->{save_mem} && return;
145	2					18	my $n = sysread( $self->{fh}, $data, -s $self->{fh} );
146	2	50				4	croak "Can't read input file" unless ($n);
147	2					5	$self->{dict} = \$data;
148							}
149
150							sub get_taxid
151							{
152	4			4	1	644	my ($self, $gi) = @_;
153	4					8	return $self->_direct_lookup ($gi);
154							}
155
156							sub _direct_lookup {
157	4			4		4	my ($self,$gi) = @_;
158	4	50				9	if ($self->{save_mem}){
159	0					0	my $taxidBytes;
160	0					0	sysseek ($self->{fh},$gi*3,0);
161	0					0	sysread($self->{fh},$taxidBytes,3);
162	0					0	my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $taxidBytes;
163	0					0	return $taxid3 \| $taxid2 << 8 \| $taxid1 << 16 \| 0 << 24 ;
164							} else {
165	3			3		15	no warnings qw/substr/;
	3					4
	3					1525
166	4					3	my $v = substr(${$self->{dict}}, $gi*3, 3);
	4					15
167	4	50				13	return 0 unless ($v);
168	4					17	my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $v;
169	4					15	my $taxid = $taxid3 \| $taxid2 << 8 \| $taxid1 << 16 \| 0 << 24 ;
170	4					13	return $taxid;
171							}
172							}
173
174							sub new_dict {
175	3			3	1	1827	my (%args) = @_;
176
177							# TO DO -> Multiple inputs should be allowed
178	3	100				154	defined $args{in} or croak "No input dictionary file provided";
179
180	2					2	my $outfh;
181	2	50	66			19	if (! defined $args{out}) {
		100
182	0					0	$outfh = \*STDOUT
183							} elsif ((UNIVERSAL::isa($args{out}, 'GLOB')) or (ref \$args{out} eq 'GLOB')) {
184	1					3	$outfh = $args{out}; ## TO DO: Test flags, the filehandle must be writeable
185							} else {
186	1	50				53	open $outfh, ">", $args{out} or croak $!;
187							}
188
189	2					3	my $infh;
190	2	50	33			18	if ((UNIVERSAL::isa($args{in}, 'GLOB')) or (ref \$args{in} eq 'GLOB')) {
191	0					0	$infh = $args{in}; ## TO DO: Test flags, the filehandle must be readable
192							} else {
193	2	50				52	open $infh, "<", $args{in} or croak $!;
194							}
195
196	2	50				15	my $ftail = File::Tail->new(name=>$args{in},tail=>1) or croak "$!: $args{in}";
197	2					907	my $last_line = $ftail->read();
198	2	50				50	croak "$args{in} is empty" unless (defined $last_line);
199	2					9	my ($last_val) = split /\t/, $last_line;
200
201	2		50			6	my $chunk_size = $args{chunk_size} \|\| 30000000; # 30Mb -- 10 million records per chunk
202	2					17	$chunk_size = floor($chunk_size / 3);
203
204	2					6	my $bin_chunk = "\0" x (3 * ($chunk_size));
205	2					3	my $n_chunk = 0;
206	2					3	my $chunk_pos = $chunk_size * $n_chunk;
207	2					2	my $chunk_limit = $chunk_size * ($n_chunk + 1);
208	2					21	while (<$infh>) {
209	24					29	chomp;
210	24					55	my ($key,$val) = split /\t/;
211	24					87	my ($taxid1, $taxid2, $taxid3, $taxid4) = unpack("CCCC", pack("N", $val));
212	24	100				53	if ($key >= $chunk_limit) {
213	8					8	print {$outfh} $bin_chunk;
	8					27
214	8					15	$bin_chunk = "\0" x (3 * ($chunk_size));
215	8					23	$n_chunk++;
216	8					8	$chunk_pos = $chunk_size * $n_chunk;
217	8					8	$chunk_limit = $chunk_size * ($n_chunk + 1);
218							}
219	24					53	substr($bin_chunk, ($key-$chunk_pos)*3, 1, pack("C", $taxid2));
220	24					48	substr($bin_chunk, ($key-$chunk_pos)*3+1, 1, pack("C", $taxid3));
221	24					96	substr($bin_chunk, ($key-$chunk_pos)*3+2, 1, pack("C", $taxid4));
222							}
223	2					2	print {$outfh} $bin_chunk;
	2					3
224
225	2					14	close ($infh);
226	2	100	66			20	if (defined $args{out} && ref \$args{out} eq "SCALAR") {
227	1					27	close($outfh)}
228							}
229
230							1;