File Coverage

blib/lib/Bio/DB/SwissProt.pm
Criterion Covered Total %
statement 78 109 71.5
branch 20 44 45.4
condition 10 24 41.6
subroutine 12 14 85.7
pod 10 10 100.0
total 130 201 64.6


line stmt bran cond sub pod time code
1             #
2             #
3             # BioPerl module for Bio::DB::SwissProt
4             #
5             # Please direct questions and support issues to
6             #
7             # Cared for by Jason Stajich
8             #
9             # Copyright Jason Stajich
10             #
11             # You may distribute this module under the same terms as perl itself
12              
13             # POD documentation - main docs before the code
14             # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
15              
16             =head1 NAME
17              
18             Bio::DB::SwissProt - Database object interface to SwissProt retrieval
19              
20             =head1 SYNOPSIS
21              
22             use Bio::DB::SwissProt;
23              
24             $sp = Bio::DB::SwissProt->new();
25              
26             $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
27             # <4-letter-identifier>_
28             # or ...
29             $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
30             # [OPQ]xxxxx
31              
32              
33             # In fact in this implementation
34             # these methods call the same webscript so you can use
35             # then interchangeably
36              
37             # choose a different server to query
38             $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
39             '-hostlocation' => 'us');
40              
41             $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
42              
43             =head1 DESCRIPTION
44              
45             SwissProt is a curated database of proteins managed by the Swiss
46             Bioinformatics Institute. Additional tools for
47             parsing and manipulating swissprot files can be found at
48             ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
49              
50             Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
51             SwissProt database via an Expasy retrieval.
52              
53             In order to make changes transparent we have host type (currently only
54             expasy) and location (default to Switzerland) separated out. This
55             allows the user to pick the closest Expasy mirror for running their
56             queries.
57              
58              
59             =head1 FEEDBACK
60              
61             =head2 Mailing Lists
62              
63             User feedback is an integral part of the evolution of this and other
64             Bioperl modules. Send your comments and suggestions preferably to one
65             of the Bioperl mailing lists. Your participation is much appreciated.
66              
67              
68             bioperl-l@bioperl.org - General discussion
69             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70              
71             =head2 Support
72              
73             Please direct usage questions or support issues to the mailing list:
74              
75             I
76              
77             rather than to the module maintainer directly. Many experienced and
78             reponsive experts will be able look at the problem and quickly
79             address it. Please include a thorough description of the problem
80             with code and data examples if at all possible.
81              
82             =head2 Reporting Bugs
83              
84             Report bugs to the Bioperl bug tracking system to help us keep track
85             the bugs and their resolution. Bug reports can be submitted via the
86             web:
87              
88             https://github.com/bioperl/bioperl-live/issues
89              
90             =head1 AUTHOR - Jason Stajich
91              
92             Email Jason Stajich Ejason@bioperl.org E
93              
94             Thanks go to Alexandre Gattiker Egattiker@isb-sib.chE of Swiss
95             Institute of Bioinformatics for helping point us in the direction of
96             the correct expasy scripts and for swissknife references.
97              
98             Also thanks to Heikki Lehvaslaiho Eheikki-at-bioperl-dot-orgE
99             for help with adding EBI swall server.
100              
101             =head1 APPENDIX
102              
103             The rest of the documentation details each of the object
104             methods. Internal methods are usually preceded with a _
105              
106             =cut
107              
108             # Let the code begin...
109              
110             package Bio::DB::SwissProt;
111             $Bio::DB::SwissProt::VERSION = '1.7.4';
112 1     1   214040 use strict;
  1         6  
  1         51  
113              
114 1     1   7 use HTTP::Request::Common;
  1         2  
  1         107  
115             our $MODVERSION = '0.8.1';
116              
117 1     1   7 use base qw(Bio::DB::WebDBSeqI);
  1         2  
  1         500  
118              
119             # global vars
120             our $DEFAULTSERVERTYPE = 'ebi';
121             our $DEFAULTFORMAT = 'swissprot';
122             # our $DEFAULTIDTRACKER = 'http://www.expasy.ch';
123              
124             # you can add your own here theoretically.
125             our %HOSTS = (
126             'expasy' => {
127             'default' => 'us',
128             'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
129             'hosts' =>
130             {
131             'switzerland' => 'ch.expasy.org',
132             'canada' => 'ca.expasy.org',
133             'china' => 'cn.expasy.org',
134             'taiwan' => 'tw.expasy.org',
135             'australia' => 'au.expasy.org',
136             'korea' => 'kr.expasy.org',
137             'us' => 'us.expasy.org',
138             },
139             # ick, CGI variables
140             'jointype' => ' ',
141             'idvar' => 'list',
142             'basevars' => [ ],
143             },
144             'ebi' => {
145             'default' => 'uk',
146             'baseurl' => 'http://%s/Tools/dbfetch/dbfetch',
147             'hosts' => {
148             'uk' => 'www.ebi.ac.uk',
149             },
150             'jointype' => ',',
151             'idvar' => 'id',
152             'basevars' => [ 'db' => 'UniProtKB',
153             'style' => 'raw' ],
154             }
155             );
156              
157             our %ID_MAPPING_DATABASES = map {$_ => 1} qw(
158             ACC+ID ACC ID UPARC NF50 NF90 NF100 EMBL_ID EMBL PIR UNIGENE_ID P_ENTREZGENEID
159             P_GI P_IPI P_REFSEQ_AC PDB_ID DISPROT_ID HSSP_ID DIP_ID MEROPS_ID PEROXIBASE_ID
160             PPTASEDB_ID REBASE_ID TCDB_ID 2DBASE_ECOLI_ID AARHUS_GHENT_2DPAGE_ID
161             ANU_2DPAGE_ID DOSAC_COBS_2DPAGE_ID ECO2DBASE_ID WORLD_2DPAGE_ID ENSEMBL_ID
162             ENSEMBL_PRO_ID ENSEMBL_TRS_ID P_ENTREZGENEID GENOMEREVIEWS_ID KEGG_ID TIGR_ID
163             UCSC_ID VECTORBASE_ID AGD_ID ARACHNOSERVER_ID BURULIST_ID CGD CYGD_ID
164             DICTYBASE_ID ECHOBASE_ID ECOGENE_ID EUHCVDB_ID FLYBASE_ID GENECARDS_ID
165             GENEDB_SPOMBE_ID GENEFARM_ID H_INVDB_ID HGNC_ID HPA_ID LEGIOLIST_ID LEPROMA_ID
166             LISTILIST_ID MAIZEGDB_ID MIM_ID MGI_ID MYPULIST_ID NMPDR ORPHANET_ID PHARMGKB_ID
167             PHOTOLIST_ID PSEUDOCAP_ID RGD_ID SAGALIST_ID SGD_ID SUBTILIST_ID TAIR_ID
168             TUBERCULIST_ID WORMBASE_ID WORMPEP_ID XENBASE_ID ZFIN_ID EGGNOG_ID OMA_ID
169             ORTHODB_ID BIOCYC_ID REACTOME_ID CLEANEX_ID GERMONLINE_ID DRUGBANK_ID
170             NEXTBIO_ID);
171              
172             # new modules should be a little more lightweight and
173             # should use Bio::Root::Root
174             sub new {
175 3     3 1 1107909 my ($class, @args) = @_;
176 3         34 my $self = $class->SUPER::new(@args);
177              
178 3         16259 my ($format, $hostlocation,$servertype) =
179             $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)],
180             @args);
181              
182 3 50 33     111 if( $format && $format !~ /(swiss)|(fasta)/i ) {
183 0         0 $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
184 0         0 $format = $self->default_format;
185             }
186 3 50       21 $servertype = $DEFAULTSERVERTYPE unless $servertype;
187 3         13 $servertype = lc $servertype;
188 3         21 $self->servertype($servertype);
189 3 50       7 if ( $hostlocation ) {
190 0         0 $self->hostlocation(lc $hostlocation);
191             }
192              
193 3         8 $self->request_format($format); # let's always override the format, as it must be swiss or fasta
194 3         37 return $self;
195             }
196              
197             =head2 Routines from Bio::DB::RandomAccessI
198              
199             =cut
200              
201             =head2 get_Seq_by_id
202              
203             Title : get_Seq_by_id
204             Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
205             Function: Gets a Bio::Seq object by its name
206             Returns : a Bio::Seq object
207             Args : the id (as a string) of a sequence
208             Throws : "id does not exist" exception
209              
210             =cut
211              
212             =head2 get_Seq_by_acc
213              
214             Title : get_Seq_by_acc
215             Usage : $seq = $db->get_Seq_by_acc('X77802');
216             Function: Gets a Bio::Seq object by accession number
217             Returns : A Bio::Seq object
218             Args : accession number (as a string)
219             Throws : "acc does not exist" exception
220              
221             =cut
222              
223             =head2 get_Stream_by_id
224              
225             Title : get_Stream_by_id
226             Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
227             Function: Gets a series of Seq objects by unique identifiers
228             Returns : a Bio::SeqIO stream object
229             Args : $ref : a reference to an array of unique identifiers for
230             the desired sequence entries
231              
232             =cut
233              
234             =head2 get_Stream_by_acc
235              
236             Title : get_Stream_by_acc
237             Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
238             Function: Gets a series of Seq objects by accession numbers
239             Returns : a Bio::SeqIO stream object
240             Args : $ref : a reference to an array of accession numbers for
241             the desired sequence entries
242             Note : For GenBank, this just calls the same code for get_Stream_by_id()
243              
244             =cut
245              
246             =head2 get_Stream_by_batch
247              
248             Title : get_Stream_by_batch
249             Usage : $seq = $db->get_Stream_by_batch($ref);
250             Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
251             at a time. This is implemented the same way as get_Stream_by_id,
252             but is provided here in keeping with access methods of NCBI
253             modules.
254             Example :
255             Returns : a Bio::SeqIO stream object
256             Args : $ref : either an array reference, a filename, or a filehandle
257             from which to get the list of unique ids/accession numbers.
258              
259             NOTE: deprecated API. Use get_Stream_by_id() instead.
260              
261             =cut
262              
263             *get_Stream_by_batch = sub {
264 0     0   0 my $self = shift;
265 0         0 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
266 0         0 $self->get_Stream_by_id(@_)
267             };
268              
269             =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
270              
271             =cut
272              
273             =head2 get_request
274              
275             Title : get_request
276             Usage : my $url = $self->get_request
277             Function: returns a HTTP::Request object
278             Returns :
279             Args : %qualifiers = a hash of qualifiers (ids, format, etc)
280              
281             =cut
282              
283             sub get_request {
284 5     5 1 83 my ($self, @qualifiers) = @_;
285 5         25 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
286             @qualifiers);
287              
288 5 50       166 if( !defined $uids ) {
289 0         0 $self->throw("Must specify a value for uids to query");
290             }
291 5         15 my ($f,undef) = $self->request_format($format);
292              
293             my %vars = (
294 5         10 @{$HOSTS{$self->servertype}->{'basevars'}},
  5         13  
295             ( 'format' => $f )
296             );
297              
298 5         38 my $url = $self->location_url;
299              
300 5         11 my $uid;
301 5   50     12 my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' ';
302 5   50     24 my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id';
303              
304 5 100       33 if( ref($uids) =~ /ARRAY/i ) {
305             # HTTP::Request automagically converts the ' ' to %20
306 3         12 $uid = join($jointype, @$uids);
307             } else {
308 2         4 $uid = $uids;
309             }
310 5         27 $vars{$idvar} = $uid;
311              
312 5         67 return POST $url, \%vars;
313             }
314              
315             =head2 postprocess_data
316              
317             Title : postprocess_data
318             Usage : $self->postprocess_data ( 'type' => 'string',
319             'location' => \$datastr);
320             Function: process downloaded data before loading into a Bio::SeqIO
321             Returns : void
322             Args : hash with two keys - 'type' can be 'string' or 'file'
323             - 'location' either file location or string
324             reference containing data
325              
326             =cut
327              
328             # don't need to do anything
329              
330             sub postprocess_data {
331 1     1 1 1795440 my ($self, %args) = @_;
332 1         5 return;
333             }
334              
335             =head2 default_format
336              
337             Title : default_format
338             Usage : my $format = $self->default_format
339             Function: Returns default sequence format for this module
340             Returns : string
341             Args : none
342              
343             =cut
344              
345             sub default_format {
346 3     3 1 474 return $DEFAULTFORMAT;
347             }
348              
349             =head2 Bio::DB::SwissProt specific routines
350              
351             =cut
352              
353             =head2 servertype
354              
355             Title : servertype
356             Usage : my $servertype = $self->servertype
357             $self->servertype($servertype);
358             Function: Get/Set server type
359             Returns : string
360             Args : server type string [optional]
361              
362             =cut
363              
364             sub servertype {
365 60     60 1 102 my ($self, $servertype) = @_;
366 60 100 66     137 if( defined $servertype && $servertype ne '') {
367             $self->throw("You gave an invalid server type ($servertype)".
368             " - available types are ".
369 3 50       24 keys %HOSTS) unless( $HOSTS{$servertype} );
370 3         18 $self->{'_servertype'} = $servertype;
371 3         13 $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};
372              
373             # make sure format is reset properly in that different
374             # servers have different syntaxes
375 3         11 my ($existingformat,$seqioformat) = $self->request_format;
376 3         9 $self->request_format($existingformat);
377             }
378 60   100     371 return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
379             }
380              
381              
382             =head2 hostlocation
383              
384             Title : hostlocation
385             Usage : my $location = $self->hostlocation()
386             $self->hostlocation($location)
387             Function: Set/Get Hostlocation
388             Returns : string representing hostlocation
389             Args : string specifying hostlocation [optional]
390              
391             =cut
392              
393             sub hostlocation {
394 5     5 1 14 my ($self, $location ) = @_;
395 5         11 my $servertype = $self->servertype;
396 5 50       14 $self->throw("Must have a valid servertype defined not $servertype")
397             unless defined $servertype;
398 5         8 my %hosts = %{$HOSTS{$servertype}->{'hosts'}};
  5         59  
399 5 50 33     22 if( defined $location && $location ne '' ) {
400 0         0 $location = lc $location;
401 0 0       0 if( ! $hosts{$location} ) {
402 0         0 $self->throw("Must specify a known host, not $location,".
403             " possible values (".
404             join(",", sort keys %hosts ). ")");
405             }
406 0         0 $self->{'_hostlocation'} = $location;
407             }
408 5         19 return $self->{'_hostlocation'};
409             }
410              
411             =head2 location_url
412              
413             Title : location
414             Usage : my $url = $self->location_url()
415             Function: Get host url
416             Returns : string representing url
417             Args : none
418              
419             =cut
420              
421             sub location_url {
422 5     5 1 17 my ($self) = @_;
423 5         12 my $servertype = $self->servertype();
424 5         15 my $location = $self->hostlocation();
425              
426 5 50 33     46 if( ! defined $location || !defined $servertype ) {
427 0         0 $self->throw("must have a valid hostlocation and servertype set before calling location_url");
428             }
429             return sprintf($HOSTS{$servertype}->{'baseurl'},
430 5         37 $HOSTS{$servertype}->{'hosts'}->{$location});
431             }
432              
433             =head2 request_format
434              
435             Title : request_format
436             Usage : my ($req_format, $ioformat) = $self->request_format;
437             $self->request_format("genbank");
438             $self->request_format("fasta");
439             Function: Get/Set sequence format retrieval. The get-form will normally
440             not be used outside of this and derived modules.
441             Returns : Array of two strings, the first representing the format for
442             retrieval, and the second specifying the corresponding SeqIO
443             format.
444             Args : $format = sequence format
445              
446             =cut
447              
448             sub request_format {
449 31     31 1 3392563 my ($self, $value) = @_;
450 31 100       83 if( defined $value ) {
451 16 50       70 if( $self->servertype =~ /expasy/ ) {
    50          
452 0 0 0     0 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
    0          
453 0         0 $self->{'_format'} = [ 'sprot', 'swiss'];
454             } elsif( $value =~ /^fa/ ) {
455 0         0 $self->{'_format'} = [ 'fasta', 'fasta'];
456             } else {
457 0         0 $self->warn("Unrecognized format $value requested");
458 0         0 $self->{'_format'} = [ 'fasta', 'fasta'];
459             }
460             } elsif( $self->servertype =~ /ebi/ ) {
461 16 50 33     83 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
    0          
462 16         65 $self->{'_format'} = [ 'swissprot', 'swiss' ];
463             } elsif( $value =~ /^fa/ ) {
464 0         0 $self->{'_format'} = [ 'fasta', 'fasta'];
465             } else {
466 0         0 $self->warn("Unrecognized format $value requested");
467 0         0 $self->{'_format'} = [ 'swissprot', 'swiss'];
468             }
469             }
470             }
471 31         61 return @{$self->{'_format'}};
  31         107  
472             }
473              
474             =head2 idtracker
475              
476             Title : idtracker
477             Usage : my ($newid) = $self->idtracker($oldid);
478             Function: Retrieve new ID using old ID.
479             Returns : single ID if one is found
480             Args : ID to look for
481              
482             =cut
483              
484             sub idtracker {
485 0     0 1 0 my ($self, $id) = @_;
486 0         0 $self->deprecated(
487             -message => 'The SwissProt IDTracker service is no longer available, '.
488             'use id_mapper() instead',
489             -warn_version => '1.6', # warn if $VERSION is >= this version
490             -throw_version => '1.7' # throw if $VERSION is >= this version
491             );
492             }
493              
494             =head2 id_mapper
495              
496             Title : id_tracker
497             Usage : my $map = $self->id_mapper( -from => '',
498             -to => '',
499             -ids => \@ids);
500             Function: Retrieve new ID using old ID.
501             Returns : hash reference of successfully mapped IDs
502             Args : -from : database mapping from
503             -to : database mapped to
504             -ids : a single ID or array ref of IDs to map
505             Note : For a list of valid database IDs, see:
506             http://www.uniprot.org/faq/28#id_mapping_examples
507              
508             =cut
509              
510             sub id_mapper {
511 1     1 1 8 my $self = shift;
512 1         12 my ($from, $to, $ids) = $self->_rearrange([qw(FROM TO IDS)], @_);
513 1         36 for ($from, $to) {
514 2 50       10 $self->throw("$_ is not a recognized database") if !exists $ID_MAPPING_DATABASES{$_};
515             }
516 1 50       5 my @ids = ref $ids ? @$ids : $ids;
517 1         8 my $params = {
518             from => $from,
519             to => $to,
520             format => 'tab',
521             query => join(' ',@ids)
522             };
523 1         4 my $ua = $self->ua;
524 1         7 push @{ $ua->requests_redirectable }, 'POST';
  1         6  
525 1         34 my $response = $ua->post("http://www.uniprot.org/mapping/", $params);
526 1         453126 while (my $wait = $response->header('Retry-After')) {
527 0         0 $self->debug("Waiting...\n");
528 0         0 $self->_sleep;
529 0         0 $response = $ua->get($response->base);
530             }
531              
532 1         46 my %map;
533 1 50       6 if ($response->is_success) {
534 0         0 for my $line (split("\n", $response->content)) {
535 0         0 my ($id_from, $id_to) = split(/\s+/, $line, 2);
536 0 0       0 next if $id_from eq 'From';
537 0         0 push @{$map{$id_from}}, $id_to;
  0         0  
538             }
539             } else {
540 1         14 $self->throw("Error: ".$response->status_line."\n");
541             }
542 0           \%map;
543             }
544              
545             1;
546              
547             __END__