File Coverage

Bio/DB/SeqVersion/gi.pm
Criterion Covered Total %
statement 16 58 27.5
branch 0 16 0.0
condition 0 3 0.0
subroutine 5 11 45.4
pod 5 5 100.0
total 26 93 27.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::SeqVersion::gi
3             #
4             # Please direct questions and support issues to <bioperl-l@bioperl.org>
5             #
6             # Cared for by Brian Osborne
7             #
8             # Copyright Brian Osborne 2006
9             #
10             # You may distribute this module under the same terms as Perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::DB::SeqVersion::gi - interface to NCBI Sequence Revision History page
17              
18             =head1 SYNOPSIS
19              
20             Do not use this module directly, use Bio::DB::SeqVersion.
21              
22             use Bio::DB::SeqVersion;
23              
24             my $query = Bio::DB::SeqVersion->new(-type => 'gi');
25              
26             # all GIs, which will include the GI used to query
27             my @all_gis = $query->get_all(2);
28              
29             # the most recent GI, which may or may not be the GI used to query
30             my $live_gi = $query->get_recent(2);
31              
32             # get all the visible data on the Sequence Revision page
33             my $array_ref = $query->get_history(11111111);
34              
35             These methods can also take accession numbers as arguments, just like
36             the Sequence Revision page itself.
37              
38             =head1 DESCRIPTION
39              
40             All sequence entries at GenBank are identified by a pair of
41             identifiers, an accession and a numeric identifier, and this number is
42             frequently called a GI number (GenInfo Identifier). The accession
43             is stable, but each new version of the sequence entry for the accession
44             receives a new GI number (see the NCBI documentation
45             for more information on GenBank identifiers). One accession
46             can have one or more GI numbers and the highest of these is the most recent,
47             or "live", GI.
48              
49             Information on an accession and its associated GI numbers is available at
50             the Sequence Revision History page at NCBI (the C<girevhist> report
51             of a sequence record). This information is
52             not available in file format. This module queries the Web page and retrieves GI
53             numbers and related data given an accession (e.g. NP_111111, A11111, P12345) or
54             a GI number (e.g. 2, 11111111) as query.
55              
56             =head1 FEEDBACK
57              
58             =head2 Mailing Lists
59              
60             User feedback is an integral part of the evolution of this and other
61             Bioperl modules. Send your comments and suggestions preferably to one
62             of the Bioperl mailing lists. Your participation is much appreciated.
63              
64             bioperl-l@bioperl.org - General discussion
65             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66              
67             =head2 Support
68              
69             Please direct usage questions or support issues to the mailing list:
70              
71             I<bioperl-l@bioperl.org>
72              
73             rather than to the module maintainer directly. Many experienced and
74             responsive experts will be able to look at the problem and quickly
75             address it. Please include a thorough description of the problem
76             with code and data examples if at all possible.
77              
78             =head2 Reporting Bugs
79              
80             Report bugs to the Bioperl bug tracking system to help us keep track
81             of the bugs and their resolution. Bug reports can be submitted via the
82             web:
83              
84             https://github.com/bioperl/bioperl-live/issues
85              
86             =head1 AUTHOR - Brian Osborne
87              
88             Email E<lt>osborne at optonline dot netE<gt>
89              
90             =head1 CONTRIBUTORS
91              
92             Torsten Seemann - torsten.seemann AT infotech.monash.edu.au
93              
94             =head1 APPENDIX
95              
96             The rest of the documentation details each of the object
97             methods. Internal methods are usually preceded with a _
98              
99             =cut
100              
101             # Let the code begin...
102              
103             package Bio::DB::SeqVersion::gi;
104 1     1   2 use strict;
  1         1  
  1         22  
105 1     1   415 use Encode;
  1         6000  
  1         54  
106 1     1   5 use HTML::TableExtract;
  1         1  
  1         8  
107 1     1   41 use base qw(Bio::DB::SeqVersion);
  1         1  
  1         459  
108              
109             # Private class variables
110              
111             # TODO: format=text may be an unstable setting (the response is actually minimal XHTML, not plain text)
112             my $URL = 'https://www.ncbi.nlm.nih.gov/nuccore/%s?report=girevhist&format=text';
113              
114             =head2 new
115              
116             Title : new
117             Usage : $gb = Bio::DB::SeqVersion::gi->new
118             Function: Creates a new query object
119             Returns : New query object
120              
121             =cut
122              
# Constructor: build the object through the parent class, run this
# object's _initialize step, and hand the object back.
#
# Args    : any arguments, passed through unchanged to the parent new()
# Returns : the new query object
sub new {
    my $class = shift;

    my $self = $class->SUPER::new(@_);
    $self->_initialize;

    return $self;
}
129              
130             =head2 get_all
131              
132             Title : get_all
133             Usage : my @gis = $q->get_all(2)
134             Function: Get all GI numbers given a GI number
135             Returns : An array of GI numbers in get_history() row order, so the most recent GI is the 0 element
136             Args : A single GI number (string)
137              
138             =cut
139              
# Return the GI number (column 0) of every row in the revision history
# of $id, in the same order as the rows returned by get_history().
#
# Args    : $id - a single identifier (string)
# Returns : list of GI numbers
#
# The history stored by the previous get_history() call is reused when it
# was fetched for the same identifier; otherwise a new query is issued.
sub get_all {
    my ( $self, $id ) = @_;

    # The former unparenthesised "cond ? $ref = A : $ref = B" parsed as
    # "(cond ? ($ref = A) : $ref) = B", so get_history() was always called
    # and the stored result was never used. Select the value with the
    # ternary and assign exactly once instead.
    my $last_id   = $self->{_last_id};
    my $is_cached = defined $id && defined $last_id && $id eq $last_id;

    # An undefined $id is never a cache hit, so it still reaches
    # get_history(), which is where the original sent it as well.
    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    my @gis = map { $_->[0] } @{ $history || [] };
    return @gis;
}
151              
152             =head2 get_recent
153              
154             Title : get_recent
155             Usage : my $newest_gi = $q->get_recent(2)
156             Function: Get most recent GI given a single GI
157             Returns : String
158             Args : A single GI number (string)
159              
160             =cut
161              
# Return the GI number held in the first row of the revision history of
# $id (row 0, column 0 of the structure returned by get_history()).
#
# Args    : $id - a single identifier (string)
# Returns : the GI number from the first history row
#
# The history stored by the previous get_history() call is reused when it
# was fetched for the same identifier; otherwise a new query is issued.
sub get_recent {
    my ( $self, $id ) = @_;

    # The former unparenthesised "cond ? $ref = A : $ref = B" parsed as
    # "(cond ? ($ref = A) : $ref) = B", which always called get_history()
    # and discarded the stored result. Select with the ternary, assign once.
    my $last_id   = $self->{_last_id};
    my $is_cached = defined $id && defined $last_id && $id eq $last_id;

    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    return $history->[0]->[0];
}
170              
171             =head2 get_status
172              
173             Title : get_status
174             Usage : my $status = $q->get_status(2)
175             Function: Get the current status reported for a single GI
176             Returns : String ('unknown' when no status is found)
177             Args : A single GI number (string)
178              
179             =cut
180              
# Return the status string recorded for $id. A new history query is made
# first unless $id is the identifier of the previous get_history() call.
#
# Args    : $id - a single identifier (string), required
# Returns : the stored status string
# Throws  : "Must pass an ID" when $id is undefined
sub get_status {
    my $self = shift;
    my ($id) = @_;

    defined $id or $self->throw("Must pass an ID");

    # get_history() refreshes _last_status as a side effect
    $self->get_history($id) unless $id eq $self->{_last_id};

    return $self->{_last_status};
}
189              
190             =head2 get_history
191              
192             Title : get_history
193             Usage : my $ref = $query_obj->get_history($id)
194             Function: Queries the NCBI Revision page, gets the data from the HTML table
195             Returns : Reference to an array of arrays where element 0 refers to the most
196             recent version and the last element refers to the oldest version.
197             In the second dimension the elements are:
198              
199             0 GI number
200             1 Version
201             2 Update Date
202              
203             For example, to get the GI number of the first version:
204              
205             $ref->[$#{$ref}]->[0]
206              
207             To get the Update Date of the latest version:
208              
209             $ref->[0]->[2]
210              
211             Args : One identifier (string)
212             Note : Status of the GI was returned here previously as the last element in
213             the row of elements above; however the status is currently only
214             returned for the GI requested (e.g. a single value). One can get
215             the status for this using the get_status() method above
216              
217             =cut
218              
# Fetch the revision page for $id, parse it, remember the outcome on the
# object, and return the parsed rows.
#
# Args    : $id - a single identifier (string)
# Returns : the row structure produced by _process_data()
sub get_history {
    my $self = shift;
    my $id   = shift;

    my $page = $self->_get_request($id);
    my ( $rows, $status ) = $self->_process_data($page);

    # Remember the latest query so that later calls made with the same
    # identifier can consult the stored values.
    @{$self}{qw(_last_result _last_id _last_status)} = ( $rows, $id, $status );

    return $rows;
}
231              
232             =head2 _get_request
233              
234             Title : _get_request
235             Usage : my $html = $self->_get_request($id)
236             Function: GET using NCBI Revision page URL, uses Root::HTTPget
237             Returns : HTML
238             Args : One identifier (string)
239              
240             =cut
241              
# Build the query URL for $id from the $URL template, issue the GET via
# $self->get, and return the body of a successful response.
#
# Args    : $id - one identifier; must be defined and must not be a reference
# Returns : the content of the response
# Throws  : when $id is missing or a reference, or when the response is
#           not successful
sub _get_request {
    my ( $self, $id ) = @_;

    if ( !defined $id or ref $id ) {
        $self->throw("Must specify a single id to query");
    }

    my $url      = sprintf $URL, $id;
    my $response = $self->get($url);

    unless ( $response->is_success ) {
        my $message = "Can't query $url: "
            . $response->status_line . "\n"
            . "ID likely does not exist";
        $self->throw($message);
    }

    return $response->content;
}
256              
257             =head2 _process_data
258              
259             Title : _process_data
260             Usage : $self->_process_data($html)
261             Function: extract data from HTML
262             Args : HTML from Revision History page
263             Returns : list of two items: reference to an array of arrays, and the status string
264              
265             =cut
266              
267             sub _process_data {
268 0     0     my ( $self, $html ) = @_;
269              
270             # Only one status is returned (not one per revision), so it is set once:
                # the first capture group is the run of non-whitespace characters that
                # follows "Current status:" and precedes the closing </div>.
                # NOTE(review): in this listing the pattern is split across two lines;
                # its leading part may have been lost in rendering -- confirm against
                # the module source before relying on it.
271 0           my $status;
272 0 0         if ($html =~ /
Current status:\s+(\S+)<\/div>/) {
273 0           $status = $1;
274             } else {
                # No match: warn and fall back to the literal 'unknown'.
275 0           $self->warn("No current status found, setting to 'unknown'");
276 0           $status = 'unknown';
277             }
278              
                # Find the table by its column headers (extractor configured with
                # depth => 0); the page text is passed through decode_utf8 before parsing.
279 0           my $te = HTML::TableExtract->new(
280             headers => ['Gi', 'Version', 'Update Date'] ,
281             depth => 0);
282 0           $te->parse(decode_utf8($html));
283 0           my $table = $te->first_table_found;
284 0 0         $self->throw("No table found") unless defined $table;
                # Wrap the table's rows in an array reference and return it together
                # with the status as a two-element list.
285 0           my $t = [$table->rows];
286 0           ($t, $status);
287             }
288              
289             1;
290              
291             __END__