File Coverage

Bio/Tools/Analysis/Protein/Scansite.pm

Criterion	Covered	Total	%
statement	37	99	37.3
branch	4	28	14.2
condition	1	12	8.3
subroutine	10	15	66.6
pod	3	3	100.0
total	55	157	35.0

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Tools::Analysis::Protein::Scansite
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Richard Adams
7							#
8							# Copyright Richard Adams
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
17
18							=head1 SYNOPSIS
19
20							use Bio::Tools::Analysis::Protein::Scansite;
21
22							my $seq; # a Bio::PrimarySeqI object
23
24							my $tool = Bio::Tools::Analysis::Protein::Scansite->new
25							( -seq => $seq->primary_seq );
26
27							# run Scansite prediction on a sequence
28							$tool->run();
29
30							# alternatively you can say
31							$tool->seq($seq->primary_seq)->run;
32
33							die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
34
35							print $tool->result; # print raw prediction to STDOUT
36
37							foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
38
39							# do something to SeqFeature
40							# e.g. print as GFF
41							print $feat->gff_string, "\n";
42							# or store within the sequence - if it is a Bio::RichSeqI
43							$seq->add_SeqFeature($feat);
44
45							}
46
47							=head1 DESCRIPTION
48
49							This class is a wrapper around the Scansite 2.0 server which produces
50							predictions for serine, threonine and tyrosine phosphorylation sites
51							in eukaryotic proteins. At present this is a basic wrapper for the
52							"Scan protein by input sequence" functionality, which takes a sequence
53							and searches for motifs, with the option to select the search
54							stringency. At present, searches for specific phosphorylation
55							sites are not supported; all predicted sites are returned.
56
57							=head2 Return formats
58
59							The Scansite results can be obtained in several formats:
60
61							=over 3
62
63							=item 1.
64
65							By calling
66
67							my $res = $tool->result('');
68
69							$res holds a string of the predicted sites in tabular format.
70
71							=item 2.
72
73							By calling
74
75							my $data_ref = $tool->result('value')
76
77							$data_ref is a reference to an array of hashes. Each element in the
78							array represents a predicted phosphorylation site. The hash keys are
79							the names of the data fields,i.e.,
80
81							'motif' => 'Casn_Kin1' # name of kinase
82							'percentile' => 0.155 # see Scansite docs
83							'position' => 9 # position in protein
84							'protein' => 'A1' # protein id
85							'score' => 0.3696 # see Scansite docs
86							'sequence' => 'ASYFDTASYFSADAT' # sequence surrounding site
87							'site' => 'S9' # phosphorylated residue
88							'zscore' => '-3.110' # see Scansite docs
89
90							=item 3.
91
92							By calling
93
94							my @fts = $tool->Result('Bio::SeqFeatureI');
95
96							which returns an array of L compliant objects with
97							primary tag value 'Site' and tag names of 'motif', 'score',
98							'sequence', 'zscore' as above.
99
100							=back
101
102							See L.
103
104							This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
105							write wrappers on various services. This class uses a web resource and
106							therefore inherits from L.
107
108							=head1 SEE ALSO
109
110							L,
111							L
112
113							=head1 FEEDBACK
114
115							=head2 Mailing Lists
116
117							User feedback is an integral part of the evolution of this and other
118							Bioperl modules. Send your comments and suggestions preferably to one
119							of the Bioperl mailing lists. Your participation is much appreciated.
120
121							bioperl-l@bioperl.org - General discussion
122							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
123
124							=head2 Support
125
126							Please direct usage questions or support issues to the mailing list:
127
128							I
129
130							rather than to the module maintainer directly. Many experienced and
131							reponsive experts will be able look at the problem and quickly
132							address it. Please include a thorough description of the problem
133							with code and data examples if at all possible.
134
135							=head2 Reporting Bugs
136
137							Report bugs to the Bioperl bug tracking system to help us keep track
138							the bugs and their resolution. Bug reports can be submitted via the
139							web:
140
141							https://github.com/bioperl/bioperl-live/issues
142
143							=head1 AUTHORS
144
145							Richard Adams, Richard.Adams@ed.ac.uk,
146
147							=head1 APPENDIX
148
149							The rest of the documentation details each of the object
150							methods. Internal methods are usually preceded with a _
151
152							=cut
153
154
155							# Let the code begin...
156
157
158							package Bio::Tools::Analysis::Protein::Scansite;
159	1			1		636	use vars qw($FLOAT @STRINGENCY);
	1					2
	1					44
160	1			1		3	use strict;
	1					1
	1					37
161	1			1		3	use IO::String;
	1					1
	1					14
162	1			1		341	use Bio::SeqIO;
	1					2
	1					27
163	1			1		418	use HTTP::Request::Common qw(POST);
	1					1500
	1					52
164	1			1		339	use Bio::SeqFeature::Generic;
	1					1
	1					26
165
166	1			1		5	use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
	1					1
	1					304
167
168							$FLOAT = '[+-]?\d\.\d';
169							@STRINGENCY = qw(High Medium Low);
170							my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
171
172
173							my $ANALYSIS_SPEC =
174							{
175							'name' => 'Scansite',
176							'type' => 'Protein',
177							'version' => '2.0',
178							'supplier' => 'Massachusetts Institute of Technology',
179							'description' => 'Prediction of serine, threonine and tyrosine
180							phosphorylation sites in eukaryotic proteins',
181							};
182
183							my $INPUT_SPEC =
184							[
185							{
186							'mandatory' => 'true',
187							'type' => 'Bio::PrimarySeqI',
188							'name' => 'seq',
189							},
190							{
191							'mandatory' => 'false',
192							'type' => 'text',
193							'name' => 'protein_id',
194							'default' => 'unnamed',
195							},
196							{
197							'mandatory' => 'false',
198							'type' => 'text',
199							'name' => 'stringency',
200							'default' => 'High',
201							},
202							];
203
204							my $RESULT_SPEC =
205							{
206							'' => 'bulk', # same as undef
207							'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
208							'raw' => 'Array of {motif=>, percentile=>, position=>,
209							protein=>, score=>, site=>, zscore=>
210							sequence=>
211							}',
212							};
213
214
215							=head2 result
216
217							Name : result
218							Usage : $job->result (...)
219							Returns : a result created by running an analysis
220							Args : none (but an implementation may choose
221							to add arguments for instructions how to process
222							the raw result)
223
224							The method returns a scalar representing a result of an executed
225							job. If the job was terminated by an error, the result may contain
226							an error message instead of the real data.
227
228							This implementation returns differently processed data depending on
229							argument:
230
231							=over 3
232
233							=item undef
234
235							Returns the raw ASCII data stream but without HTML tags
236
237							=item 'Bio::SeqFeatureI'
238
239							The argument string defined the type of bioperl objects returned in an
240							array. The objects are L.
241
242							=item 'parsed'
243
244							Returns a reference to an array of hashes containing the data of one
245							phosphorylation site prediction. Key values are:
246
247							motif, percentile, position, protein, score, site, zscore, sequence.
248
249
250							=back
251
252
253							=cut
254
255							sub result {
256	0			0	1	0	my ($self,$value) = @_;
257	0	0	0			0	if( !exists($self->{'_result'}) \|\| $self->status ne 'COMPLETED'){
258	0					0	$self->throw("Cannot get results, analysis not run!");
259							}
260	0					0	my @fts;
261
262	0	0				0	if ($value ) {
263	0	0				0	if ($value eq 'Bio::SeqFeatureI') {
		0
264	0					0	for my $hit (@{$self->{'_parsed'}}) {
	0					0
265							push @fts, Bio::SeqFeature::Generic->new(
266							-start => $hit->{'position'},
267							-end => $hit->{'position'},
268							-primary_tag => 'Site',
269							-source => 'Scansite',
270							-tag => {
271							score => $hit->{'score'},
272							zscore => $hit->{'zscore'},
273							motif => $hit->{'motif'},
274							site => $hit->{'site'},
275	0					0	sequence => $hit->{'sequence'},
276							},
277							);
278							}
279	0					0	return @fts;
280							}
281							elsif ($value eq 'meta') {
282	0					0	$self->throw("No meta sequences available in this analysis!");
283							}
284							## else get here
285	0					0	return $self->{'_parsed'};
286							}
287
288	0					0	return $self->{'_result'};
289							}
290
291							=head2 stringency
292
293							Usage : $job->stringency(...)
294							Returns : The significance stringency of a prediction
295							Args : None (retrieves value) or 'High', 'Medium' or 'Low'.
296							Purpose : Get/setter of the stringency to be sumitted for analysis.
297
298							=cut
299
300							sub stringency {
301	2			2	1	4	my ($self,$value) = @_;
302	2	100				4	if( $value) {
303	1	50				3	if (! grep{$_=~ /$value/i}@STRINGENCY ) {
	3					16
304	0					0	$self->throw("I need a stringency of [".
305							join " ", @STRINGENCY .
306							"], not [$value]");
307							}
308	1					2	$self->{'_stringency'} = $value;
309	1					7	return $self;
310							}
311	1		33			6	return $self->{'_stringency'} \|\| $self->input_spec->[2]{'default'} ;
312							}
313
314							=head2 protein_id
315
316							Usage : $job->protein_id(...)
317							Returns : The sequence id of the protein or 'unnamed' if not set.
318							Args : None
319							Purpose : Getter of the seq_id. Returns the display_id of the sequence
320							object.
321
322							=cut
323
324							sub protein_id {
325	1			1	1	2	my $self = shift;
326							return defined ($self->seq())? $self->seq->display_id()
327	1	50				3	: $self->input_spec->[1]{'default'};
328							}
329
330							sub _init
331							{
332	1			1		2	my $self = shift;
333	1					4	$self->url($URL);
334	1					3	$self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
335	1					2	$self->{'_INPUT_SPEC'} = $INPUT_SPEC;
336	1					1	$self->{'_RESULT_SPEC'} = $RESULT_SPEC;
337	1					2	$self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
338	1					1	return $self;
339							}
340
341							sub _run {
342	0			0			my $self = shift;
343
344							# format the sequence into fasta
345	0						$self->delay(1);
346							# delay repeated calls by default by 3 sec, set delay() to change
347	0						$self->sleep;
348
349	0						$self->status('TERMINATED_BY_ERROR');
350
351	0						my $request = POST $self->url,
352							Content => [sequence => $self->seq->seq(),
353							protein_id => $self->protein_id(),
354							motif_option => 'all',
355							motifs => '',
356							motif_groups => '',
357							stringency => $self->stringency(),
358							#domain_flag => '',
359							submit => "Submit Request",
360							];
361							## raw html report,
362	0						my $content = $self->request($request);
363	0						my $text = $content->content;
364
365							##access result data from tag in html
366	0						my @parsed_Results = ();
367	0						my @unwantedParams = qw(db source class);
368	0						my @results = split /sitestats\.phtml\?/, $text;
369	0						shift @results;
370
371							##this module generates 'parsed' output directly from html,
372							## avoids having toparse twice.
373
374	0						for my $hit (@results) {
375							## get results string
376	0						my ($res) = $hit =~ /^(.+?)"/;
377
378							#get key value pairs
379	0						my %params = $res =~/(\w+)=([^&]+)/g;
380
381							##remove unwanted data from hash
382	0						map{delete $params{$_}} @unwantedParams;
	0
383	0						push @parsed_Results, \%params;
384							}
385
386							## now generate text output in table format
387	0						my $out_Str = '';
388	0						$out_Str .= $self->_make_header(\@parsed_Results);
389	0						$out_Str .= $self->_add_data(\@parsed_Results);
390
391
392	0						$self->{'_result'} = $out_Str;
393	0						$self->{'_parsed'} = \@parsed_Results;
394
395							## is successsful if there are results or if there are no results and
396							## this beacuse there are no matches, not because of parsing errors etc.
397	0	0	0				$self->status('COMPLETED') if $text ne '' &&
			0
398							(scalar @results > 0 \|\|
399							(scalar @results == 0 && $text =~/No sites found/));
400	0	0					if ($text =~ /server\s+error/i) {
401	0						$self->throw("Internal server error:\n\n $text");
402	0						return;
403							}
404							}
405
406							sub _process_arguments {
407
408							# extra checking for sequence length
409							# mitoprot specific argument testing
410	0			0			my ($self, $args) = @_;
411							#use base checking for existence of mandatory fields
412	0						$self->SUPER::_process_arguments($args);
413
414							# specific requirements
415	0	0					$self->throw("Sequence must be > 15 amino acids long!")
416							if $self->seq->length < 15;
417	0	0					$self->throw("Sequence must be protein")
418							unless $self->seq->alphabet() eq 'protein';
419							}
420
421							sub _make_header {
422	0			0			my ($self, $res) = @_;
423	0						my $header = '';
424	0						for my $k (sort keys %{$res->[0]} ){
	0
425	0	0					next if $k eq 'sequence';
426	0						$header .= $k;
427	0						$header .= ' 'x(12 -length($k));
428							}
429	0						$header .= "sequence\n\n";
430	0						return $header;
431							}
432
433							sub _add_data {
434	0			0			my ($self, $res) = @_;
435	0						my $outstr = '';
436	0						for my $hit (@$res) {
437	0						for my $k (sort keys %$hit ){
438	0	0					next if $k eq 'sequence';
439	0						$outstr .= $hit->{$k};
440	0						$outstr .= ' 'x(12 - length($hit->{$k}));
441							}
442	0	0					$outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
443							}
444	0						return $outstr;
445
446
447							}
448
449
450							1;