File Coverage

Bio/Tools/Analysis/Protein/Scansite.pm
Criterion Covered Total %
statement 37 99 37.3
branch 4 28 14.2
condition 1 12 8.3
subroutine 10 15 66.6
pod 3 3 100.0
total 55 157 35.0


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Richard Adams
7             #
8             # Copyright Richard Adams
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Analysis::Protein::Scansite;
21              
22             my $seq; # a Bio::PrimarySeqI object
23              
24             my $tool = Bio::Tools::Analysis::Protein::Scansite->new
25             ( -seq => $seq->primary_seq );
26              
27             # run Scansite prediction on a sequence
28             $tool->run();
29              
30             # alternatively you can say
31             $tool->seq($seq->primary_seq)->run;
32              
33             die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
34              
35             print $tool->result; # print raw prediction to STDOUT
36              
37             foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
38              
39             # do something to SeqFeature
40             # e.g. print as GFF
41             print $feat->gff_string, "\n";
42             # or store within the sequence - if it is a Bio::RichSeqI
43             $seq->add_SeqFeature($feat);
44              
45             }
46              
47             =head1 DESCRIPTION
48              
49             This class is a wrapper around the Scansite 2.0 server which produces
50             predictions for serine, threonine and tyrosine phosphorylation sites
51             in eukaryotic proteins. At present this is a basic wrapper for the
52             "Scan protein by input sequence" functionality, which takes a sequence
53             and searches for motifs, with the option to select the search
54             stringency. At present, searches for specific phosphorylation
55             sites are not supported; all predicted sites are returned.
56              
57             =head2 Return formats
58              
59             The Scansite results can be obtained in several formats:
60              
61             =over 3
62              
63             =item 1.
64              
65             By calling
66              
67             my $res = $tool->result('');
68              
69             $res holds a string of the predicted sites in tabular format.
70              
71             =item 2.
72              
73             By calling
74              
75             my $data_ref = $tool->result('value')
76              
77             $data_ref is a reference to an array of hashes. Each element in the
78             array represents a predicted phosphorylation site. The hash keys are
79             the names of the data fields,i.e.,
80              
81             'motif' => 'Casn_Kin1' # name of kinase
82             'percentile' => 0.155 # see Scansite docs
83             'position' => 9 # position in protein
84             'protein' => 'A1' # protein id
85             'score' => 0.3696 # see Scansite docs
86             'sequence' => 'ASYFDTASYFSADAT' # sequence surrounding site
87             'site' => 'S9' # phosphorylated residue
88             'zscore' => '-3.110' # see Scansite docs
89              
90             =item 3.
91              
92             By calling
93              
94             my @fts = $tool->Result('Bio::SeqFeatureI');
95              
96             which returns an array of L compliant objects with
97             primary tag value 'Site' and tag names of 'motif', 'score',
98             'sequence', 'zscore' as above.
99              
100             =back
101              
102             See L.
103              
104             This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
105             write wrappers on various services. This class uses a web resource and
106             therefore inherits from L.
107              
108             =head1 SEE ALSO
109              
110             L,
111             L
112              
113             =head1 FEEDBACK
114              
115             =head2 Mailing Lists
116              
117             User feedback is an integral part of the evolution of this and other
118             Bioperl modules. Send your comments and suggestions preferably to one
119             of the Bioperl mailing lists. Your participation is much appreciated.
120              
121             bioperl-l@bioperl.org - General discussion
122             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
123              
124             =head2 Support
125              
126             Please direct usage questions or support issues to the mailing list:
127              
128             I
129              
130             rather than to the module maintainer directly. Many experienced and
131             reponsive experts will be able look at the problem and quickly
132             address it. Please include a thorough description of the problem
133             with code and data examples if at all possible.
134              
135             =head2 Reporting Bugs
136              
137             Report bugs to the Bioperl bug tracking system to help us keep track
138             the bugs and their resolution. Bug reports can be submitted via the
139             web:
140              
141             https://github.com/bioperl/bioperl-live/issues
142              
143             =head1 AUTHORS
144              
145             Richard Adams, Richard.Adams@ed.ac.uk,
146              
147             =head1 APPENDIX
148              
149             The rest of the documentation details each of the object
150             methods. Internal methods are usually preceded with a _
151              
152             =cut
153              
154              
155             # Let the code begin...
156              
157              
158             package Bio::Tools::Analysis::Protein::Scansite;
159 1     1   636 use vars qw($FLOAT @STRINGENCY);
  1         2  
  1         44  
160 1     1   3 use strict;
  1         1  
  1         37  
161 1     1   3 use IO::String;
  1         1  
  1         14  
162 1     1   341 use Bio::SeqIO;
  1         2  
  1         27  
163 1     1   418 use HTTP::Request::Common qw(POST);
  1         1500  
  1         52  
164 1     1   339 use Bio::SeqFeature::Generic;
  1         1  
  1         26  
165              
166 1     1   5 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
  1         1  
  1         304  
167              
168             $FLOAT = '[+-]?\d*\.\d*';
169             @STRINGENCY = qw(High Medium Low);
170             my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
171              
172              
173             my $ANALYSIS_SPEC =
174             {
175             'name' => 'Scansite',
176             'type' => 'Protein',
177             'version' => '2.0',
178             'supplier' => 'Massachusetts Institute of Technology',
179             'description' => 'Prediction of serine, threonine and tyrosine
180             phosphorylation sites in eukaryotic proteins',
181             };
182              
183             my $INPUT_SPEC =
184             [
185             {
186             'mandatory' => 'true',
187             'type' => 'Bio::PrimarySeqI',
188             'name' => 'seq',
189             },
190             {
191             'mandatory' => 'false',
192             'type' => 'text',
193             'name' => 'protein_id',
194             'default' => 'unnamed',
195             },
196             {
197             'mandatory' => 'false',
198             'type' => 'text',
199             'name' => 'stringency',
200             'default' => 'High',
201             },
202             ];
203              
204             my $RESULT_SPEC =
205             {
206             '' => 'bulk', # same as undef
207             'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
208             'raw' => 'Array of {motif=>, percentile=>, position=>,
209             protein=>, score=>, site=>, zscore=>
210             sequence=>
211             }',
212             };
213              
214              
215             =head2 result
216              
217             Name : result
218             Usage : $job->result (...)
219             Returns : a result created by running an analysis
220             Args : none (but an implementation may choose
221             to add arguments for instructions how to process
222             the raw result)
223              
224             The method returns a scalar representing a result of an executed
225             job. If the job was terminated by an error, the result may contain
226             an error message instead of the real data.
227              
228             This implementation returns differently processed data depending on
229             argument:
230              
231             =over 3
232              
233             =item undef
234              
235             Returns the raw ASCII data stream but without HTML tags
236              
237             =item 'Bio::SeqFeatureI'
238              
239             The argument string defined the type of bioperl objects returned in an
240             array. The objects are L.
241              
242             =item 'parsed'
243              
244             Returns a reference to an array of hashes containing the data of one
245             phosphorylation site prediction. Key values are:
246              
247             motif, percentile, position, protein, score, site, zscore, sequence.
248              
249              
250             =back
251              
252              
253             =cut
254              
255             sub result {
256 0     0 1 0 my ($self,$value) = @_;
257 0 0 0     0 if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
258 0         0 $self->throw("Cannot get results, analysis not run!");
259             }
260 0         0 my @fts;
261              
262 0 0       0 if ($value ) {
263 0 0       0 if ($value eq 'Bio::SeqFeatureI') {
    0          
264 0         0 for my $hit (@{$self->{'_parsed'}}) {
  0         0  
265             push @fts, Bio::SeqFeature::Generic->new(
266             -start => $hit->{'position'},
267             -end => $hit->{'position'},
268             -primary_tag => 'Site',
269             -source => 'Scansite',
270             -tag => {
271             score => $hit->{'score'},
272             zscore => $hit->{'zscore'},
273             motif => $hit->{'motif'},
274             site => $hit->{'site'},
275 0         0 sequence => $hit->{'sequence'},
276             },
277             );
278             }
279 0         0 return @fts;
280             }
281             elsif ($value eq 'meta') {
282 0         0 $self->throw("No meta sequences available in this analysis!");
283             }
284             ## else get here
285 0         0 return $self->{'_parsed'};
286             }
287              
288 0         0 return $self->{'_result'};
289             }
290              
291             =head2 stringency
292              
293             Usage : $job->stringency(...)
294             Returns : The significance stringency of a prediction
295             Args : None (retrieves value) or 'High', 'Medium' or 'Low'.
296             Purpose : Get/setter of the stringency to be sumitted for analysis.
297              
298             =cut
299              
300             sub stringency {
301 2     2 1 4 my ($self,$value) = @_;
302 2 100       4 if( $value) {
303 1 50       3 if (! grep{$_=~ /$value/i}@STRINGENCY ) {
  3         16  
304 0         0 $self->throw("I need a stringency of [".
305             join " ", @STRINGENCY .
306             "], not [$value]");
307             }
308 1         2 $self->{'_stringency'} = $value;
309 1         7 return $self;
310             }
311 1   33     6 return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
312             }
313              
314             =head2 protein_id
315              
316             Usage : $job->protein_id(...)
317             Returns : The sequence id of the protein or 'unnamed' if not set.
318             Args : None
319             Purpose : Getter of the seq_id. Returns the display_id of the sequence
320             object.
321              
322             =cut
323              
324             sub protein_id {
325 1     1 1 2 my $self = shift;
326             return defined ($self->seq())? $self->seq->display_id()
327 1 50       3 : $self->input_spec->[1]{'default'};
328             }
329              
330             sub _init
331             {
332 1     1   2 my $self = shift;
333 1         4 $self->url($URL);
334 1         3 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
335 1         2 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
336 1         1 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
337 1         2 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
338 1         1 return $self;
339             }
340              
341             sub _run {
342 0     0     my $self = shift;
343              
344             # format the sequence into fasta
345 0           $self->delay(1);
346             # delay repeated calls by default by 3 sec, set delay() to change
347 0           $self->sleep;
348              
349 0           $self->status('TERMINATED_BY_ERROR');
350              
351 0           my $request = POST $self->url,
352             Content => [sequence => $self->seq->seq(),
353             protein_id => $self->protein_id(),
354             motif_option => 'all',
355             motifs => '',
356             motif_groups => '',
357             stringency => $self->stringency(),
358             #domain_flag => '',
359             submit => "Submit Request",
360             ];
361             ## raw html report,
362 0           my $content = $self->request($request);
363 0           my $text = $content->content;
364              
365             ##access result data from tag in html
366 0           my @parsed_Results = ();
367 0           my @unwantedParams = qw(db source class);
368 0           my @results = split /sitestats\.phtml\?/, $text;
369 0           shift @results;
370              
371             ##this module generates 'parsed' output directly from html,
372             ## avoids having toparse twice.
373              
374 0           for my $hit (@results) {
375             ## get results string
376 0           my ($res) = $hit =~ /^(.+?)"/;
377              
378             #get key value pairs
379 0           my %params = $res =~/(\w+)=([^&]+)/g;
380              
381             ##remove unwanted data from hash
382 0           map{delete $params{$_}} @unwantedParams;
  0            
383 0           push @parsed_Results, \%params;
384             }
385            
386             ## now generate text output in table format
387 0           my $out_Str = '';
388 0           $out_Str .= $self->_make_header(\@parsed_Results);
389 0           $out_Str .= $self->_add_data(\@parsed_Results);
390            
391              
392 0           $self->{'_result'} = $out_Str;
393 0           $self->{'_parsed'} = \@parsed_Results;
394            
395             ## is successsful if there are results or if there are no results and
396             ## this beacuse there are no matches, not because of parsing errors etc.
397 0 0 0       $self->status('COMPLETED') if $text ne '' &&
      0        
398             (scalar @results > 0 ||
399             (scalar @results == 0 && $text =~/No sites found/));
400 0 0         if ($text =~ /server\s+error/i) {
401 0           $self->throw("Internal server error:\n\n $text");
402 0           return;
403             }
404             }
405              
406             sub _process_arguments {
407              
408             # extra checking for sequence length
409             # mitoprot specific argument testing
410 0     0     my ($self, $args) = @_;
411             #use base checking for existence of mandatory fields
412 0           $self->SUPER::_process_arguments($args);
413            
414             # specific requirements
415 0 0         $self->throw("Sequence must be > 15 amino acids long!")
416             if $self->seq->length < 15;
417 0 0         $self->throw("Sequence must be protein")
418             unless $self->seq->alphabet() eq 'protein';
419             }
420              
421             sub _make_header {
422 0     0     my ($self, $res) = @_;
423 0           my $header = '';
424 0           for my $k (sort keys %{$res->[0]} ){
  0            
425 0 0         next if $k eq 'sequence';
426 0           $header .= $k;
427 0           $header .= ' 'x(12 -length($k));
428             }
429 0           $header .= "sequence\n\n";
430 0           return $header;
431             }
432              
433             sub _add_data {
434 0     0     my ($self, $res) = @_;
435 0           my $outstr = '';
436 0           for my $hit (@$res) {
437 0           for my $k (sort keys %$hit ){
438 0 0         next if $k eq 'sequence';
439 0           $outstr .= $hit->{$k};
440 0           $outstr .= ' 'x(12 - length($hit->{$k}));
441             }
442 0 0         $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
443             }
444 0           return $outstr;
445              
446              
447             }
448            
449              
450             1;