File Coverage

blib/lib/Bio/LITE/Taxonomy/NCBI/Gi2taxid.pm
Criterion Covered Total %
statement 77 85 90.5
branch 22 36 61.1
condition 7 14 50.0
subroutine 12 12 100.0
pod 3 3 100.0
total 121 150 80.6


line stmt bran cond sub pod time code
1             package Bio::LITE::Taxonomy::NCBI::Gi2taxid;
2              
3             =head1 NAME
4              
5             Bio::LITE::Taxonomy::NCBI::Gi2taxid - Mappings of NCBI GI's to Taxids fast and with very low memory footprint.
6              
7             =head1 SYNOPSIS
8              
9             Creation of a new Taxid to GI dictionary (binary mapping file):
10              
11             use Bio::LITE::Taxonomy::NCBI::Gi2taxid qw/new_dict/;
12              
13             new_dict (in => "gi_taxid_prot.dmp",
14             out => "gi_taxid_prot.bin");
15              
16             Usage of the dictionary:
17              
18             use Bio::LITE::Taxonomy::NCBI::Gi2taxid;
19              
20             my $dict = Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(dict=>"gi_taxid_prot.bin");
21             my $taxid = $dict->get_taxid(12553);
22              
23             =head1 DESCRIPTION
24              
25             The NCBI site offers a file to map gene and protein sequences (GIs) with their corresponding taxon of origin (Taxids). If you want to use this information inside a Perl script you will find that (given the high amount of sequences available) it is fairly inefficient to store this information in, for example, a regular hash. Only for creating such a hash you will need more than 10 GBs of system memory.
26              
27             This is a very simple module that has been designed to efficiently map NCBI GIs to Taxids with speed as the primary goal. It is designed to retrieve taxids from GIs very fast and with low memory usage. It is even faster than using a SQL database to retrieve the mappings or using a local DBHash.
28              
29             To achieve this, it uses a binary index that can be created with the function C. This index has to be created one time for each mapping file.
30              
31             The original mapping files can be downloaded from the NCBI site at the following address: L.
32              
33             =head1 FUNCTIONS
34              
35             =head2 C
36              
37             This function creates a new binary dictionary from the NCBI mapping file. The file should be uncompressed before being passed to the script. The function accepts the following parameters:
38              
39             *WARNING* version 0.05 uses a more compacted memory file. This means that binary files created with earlier versions will not work with this one and vice-versa. You need to create the new binary db with this version.
40              
41             =over 4
42              
43             =item in
44              
45             This is the uncompressed mapping file from the NCBI. The function accepts a filename or a filehandle
46              
47             =item out
48              
49             Optional. Where the binary dictionary is going to be printed. The function accepts a filename or a filehandle (that should be opened with writing permissions). If absent STDOUT will be assumed.
50              
51             =back
52              
53             =head1 CONSTRUCTOR
54              
55             =head2 C
56              
57             Once the binary dictionary is created it can be used as an object using this constructor. It accepts the following parameters
58              
59             =over 4
60              
61             =item dict
62              
63             This is the binary dictionary obtained with the C function. The name of the file or a filehandle is accepted.
64              
65             =item save_mem
66              
67             Optional. Use this option to avoid to load the binary dictionary into memory. This will save almost 1GB of system memory but looking up for Taxids will be ~20% slower. This option of I by default.
68              
69             =back
70              
71             =head1 METHODS
72              
73             =head2 C
74              
75             This method receives a GI and returns the corresponding Taxid.
76              
77             =head1 SEE ALSO
78              
79             L
80             L
81              
82             =head1 AUTHOR
83              
84             Miguel Pignatelli
85              
86             Any comments should be addressed to emepyc@gmail.com
87              
88             =head1 LICENSE
89              
90             Copyright 2013 Miguel Pignatelli, all rights reserved.
91              
92             This library is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
93              
94              
95             =cut
96              
97 3     3   83454 use strict;
  3         8  
  3         115  
98 3     3   19 use warnings;
  3         7  
  3         99  
99 3     3   17 use Carp qw/croak/;
  3         15  
  3         242  
100 3     3   3236 use File::Tail;
  3         121165  
  3         318  
101              
102 3     3   44 use Exporter;
  3         6  
  3         113  
103 3     3   16 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  3         7  
  3         1704  
104             @ISA = qw(Exporter);
105              
106             @EXPORT = (); # Only qualified exports are allowed
107             @EXPORT_OK = qw(new_dict);
108             $VERSION = '0.11';
109              
110             sub new
111             {
112 3     3 1 1545 my ($class, %args) = @_;
113 3         4 my %opts;
114              
115 3 50       9 $args{'dict'} or croak "Need the dictionary file";
116 3 50       50 croak "\nERROR\n$args{dict}: File not found\n" unless -e $args{dict};
117              
118 3   50     13 my $save_mem = $args{'save_mem'} || 0;
119 3         5 my $dictfh;
120 3 50 33     42 if ((UNIVERSAL::isa($args{dict}, 'GLOB')) or (ref \$args{dict} eq 'GLOB')) {
121 0         0 $dictfh = $args{dict}; # TO DO: Test flags, the filehandle must be readable
122             } else {
123 3 100       263 croak "\nERROR\nThe file containing the gi <-> taxid correspondences must be converted to binary format. See the documentation of this module for more details." unless (-B $args{dict});
124 2 50       153 open $dictfh, "<:raw", $args{dict} or croak "$!: $args{dict}";
125             }
126              
127 2         5 $opts{fh} = $dictfh;
128 2         3 $opts{save_mem} = $save_mem;
129 2         3 my $self = bless \%opts;
130              
131 2         8 $self -> _build_dict();
132 2         8 return $self;
133             }
134              
135             sub _build_dict
136             {
137 2     2   3 my ($self) = @_;
138 2         2 my $data;
139 2 50       11 $self->{save_mem} && return;
140 2         24 my $n = sysread( $self->{fh}, $data, -s $self->{fh} );
141 2 50       6 croak "Can't read input file" unless ($n);
142 2         5 $self->{dict} = \$data;
143             }
144              
145             sub get_taxid
146             {
147 4     4 1 920 my ($self, $gi) = @_;
148 4         10 return $self->_direct_lookup ($gi);
149             }
150              
151             sub _direct_lookup {
152 4     4   6 my ($self,$gi) = @_;
153 4 50       10 if ($self->{save_mem}){
154 0         0 my $taxidBytes;
155 0         0 sysseek ($self->{fh},$gi*3,0);
156 0         0 sysread($self->{fh},$taxidBytes,3);
157 0         0 my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $taxidBytes;
158 0         0 return $taxid3 | $taxid2 << 8 | $taxid1 << 16 | 0 << 24 ;
159             } else {
160 3     3   21 no warnings qw/substr/;
  3         7  
  3         1926  
161 4         4 my $v = substr(${$self->{dict}}, $gi*3, 3);
  4         16  
162 4 50       16 return 0 unless ($v);
163 4         20 my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $v;
164 4         17 my $taxid = $taxid3 | $taxid2 << 8 | $taxid1 << 16 | 0 << 24 ;
165 4         18 return $taxid;
166             }
167             }
168              
169             sub new_dict {
170 3     3 1 4630 my (%args) = @_;
171              
172             # TO DO -> Multiple inputs should be allowed
173 3 100       173 defined $args{in} or croak "No input dictionary file provided";
174              
175 2         4 my $outfh;
176 2 50 66     25 if (! defined $args{out}) {
    100          
177 0         0 $outfh = \*STDOUT
178             } elsif ((UNIVERSAL::isa($args{out}, 'GLOB')) or (ref \$args{out} eq 'GLOB')) {
179 1         4 $outfh = $args{out}; ## TO DO: Test flags, the filehandle must be writeable
180             } else {
181 1 50       94 open $outfh, ">", $args{out} or croak $!;
182             }
183              
184 2         3 my $infh;
185 2 50 33     33 if ((UNIVERSAL::isa($args{in}, 'GLOB')) or (ref \$args{in} eq 'GLOB')) {
186 0         0 $infh = $args{in}; ## TO DO: Test flags, the filehandle must be readable
187             } else {
188 2 50       112 open $infh, "<", $args{in} or croak $!;
189             }
190              
191 2 50       18 my $ftail = File::Tail->new(name=>$args{in},tail=>1) or croak "$!: $args{in}";
192 2         1021 my $last_line = $ftail->read();
193 2 50       53 croak "$args{in} is empty" unless (defined $last_line);
194 2         10 my ($last_val) = split /\t/, $last_line;
195              
196 2         40 while (<$infh>) {
197 10         20 chomp;
198 10         36 my ($key,$val) = split /\t/;
199 10         54 my ($taxid1, $taxid2, $taxid3, $taxid4) = unpack("CCCC", pack("N", $val));
200 10         43 sysseek($outfh, $key*3, 0);
201 10         157 syswrite($outfh, pack("C", $taxid2), 1);
202 10         88 sysseek($outfh, $key*3 + 1, 0);
203 10         94 syswrite($outfh, pack("C", $taxid3), 1);
204 10         38 sysseek($outfh, $key*3 + 2, 0);
205 10         128 syswrite($outfh, pack("C", $taxid4), 1);
206             }
207 2         22 close ($infh);
208 2 100 66     19 if (defined $args{out} && ref \$args{out} eq "SCALAR") {
209 1         13 close($outfh)}
210             }
211              
212             1;