File Coverage

Bio/Tools/Signalp/ExtendedSignalp.pm
Criterion Covered Total %
statement 194 197 98.4
branch 107 116 92.2
condition 17 22 77.2
subroutine 16 16 100.0
pod 5 5 100.0
total 339 356 95.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Emmanuel Quevillon
7             #
8             # Copyright Emmanuel Quevillon
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Signalp::ExtendedSignalp;
21             my $params = [qw(maxC maxY maxS meanS D)];
22             my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
23             -fh => $filehandle
24             -factors => $params
25             );
26              
27             $parser->factors($params);
28             while( my $sp_feat = $parser->next_feature ) {
29             #do something
30             #eg
31             push @sp_feat, $sp_feat;
32             }
33              
34             =head1 DESCRIPTION
35              
36             # Please direct questions and support issues to I
37              
38             Parser module for Signalp.
39              
40             Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
41             originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
42             Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
43             Informatics team (fuguteam@fugu-sg.org)
44              
45             You may distribute this module under the same terms as perl itself
46              
47             Compared to the original SignalP, this method allow the user to filter results
48             out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
49             method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
50              
51             The user must be aware that the filters can only by applied on NN method.
52             Also, to ensure the compatibility with original Signalp parsing module, the user
53             must know that by default, if filters are empty, max Y and mean S filters are
54             automatically used to filter results.
55              
56             If the used gives a list, then the parser will only report protein having 'YES'
57             for each factor.
58              
59             This module supports parsing for full, summary and short output form signalp.
60             Actually, full and summary are equivalent in terms of filtering results.
61              
62             =head1 FEEDBACK
63              
64             =head2 Mailing Lists
65              
66             User feedback is an integral part of the evolution of this and other
67             Bioperl modules. Send your comments and suggestions preferably to
68             the Bioperl mailing list. Your participation is much appreciated.
69              
70             bioperl-l@bioperl.org - General discussion
71             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72              
73             =head2 Support
74              
75             Please direct usage questions or support issues to the mailing list:
76              
77             I
78              
79             rather than to the module maintainer directly. Many experienced and
80             reponsive experts will be able look at the problem and quickly
81             address it. Please include a thorough description of the problem
82             with code and data examples if at all possible.
83              
84             =head2 Reporting Bugs
85              
86             Report bugs to the Bioperl bug tracking system to help us keep track
87             of the bugs and their resolution. Bug reports can be submitted via
88             the web:
89              
90             https://github.com/bioperl/bioperl-live/issues
91              
92             =head1 AUTHOR
93              
94             Based on the Bio::Tools::Signalp module
95             Emmanuel Quevillon
96              
97             =head1 APPENDIX
98              
99             The rest of the documentation details each of the object methods.
100             Internal methods are usually preceded with a _
101              
102             =cut
103              
104             package Bio::Tools::Signalp::ExtendedSignalp;
105              
106 1     1   477 use strict;
  1         1  
  1         22  
107 1     1   3 use Data::Dumper;
  1         1  
  1         34  
108 1     1   325 use Bio::SeqFeature::Generic;
  1         4  
  1         38  
109             # don't need Bio::Root::Root/IO (already in inheritance tree)
110 1     1   10 use base qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
  1         1  
  1         444  
111              
112             #Supported arguments
113             my $FACTS = {
114             'maxC' => 1,
115             'maxS' => 1,
116             'maxY' => 1,
117             'meanS' => 1,
118             'D' => 1,
119             };
120              
121             =head2 new
122              
123             Title : new
124             Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
125             Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
126             Returns : Bio::Tools::Signalp::ExtendedSignalp
127             Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
128              
129              
130             =cut
131              
132             sub new {
133 10     10 1 107 my($class,@args) = @_;
134              
135 10         25 my $self = $class->SUPER::new(@args);
136 10         18 $self->_initialize_io(@args);
137              
138 10         24 my $factors = $self->_rearrange([qw(FACTORS)], @args);
139             #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
140 10 100 100     28 if($factors && scalar(@$factors)){
141 4         4 $factors = $factors;
142             }
143             else{
144 6         7 $factors = [qw(maxY meanS)];
145             }
146 10 50       25 $factors && $self->factors($factors);
147            
148 10         22 return $self;
149             }
150              
151             =head2 next_feature
152              
153             Title : next_feature
154             Usage : my $feat = $signalp->next_feature
155             Function: Get the next result feature from parser data
156             Returns : Bio::SeqFeature::Generic
157             Args : none
158              
159              
160             =cut
161              
162             sub next_feature {
163              
164 44     44 1 611 my ($self) = @_;
165              
166 44 50       62 if(!$self->_parsed()){
167 44         54 $self->_parse();
168             }
169              
170 44   100     39 return shift @{$self->{_features}} || undef;
171              
172             }
173              
174             =head2 _filterok
175              
176             Title : _filterok
177             Usage : my $feat = $signalp->_filterok
178             Function: Check if the factors required by the user are all ok.
179             Returns : 1/0
180             Args : hash reference
181              
182              
183             =cut
184              
185             sub _filterok {
186              
187 88     88   72 my($self, $hash) = @_;
188              
189             #We hope everything will be fine ;)
190 88         62 my $bool = 1;
191              
192             #If the user did not give any filter, we keep eveything
193 88 50       53 return $bool unless keys %{$self->{_factors}};
  88         159  
194              
195             #If only one of the factors parsed is equal to NO based on the user factors cutoff
196             #Then the filter is not ok.
197 88         71 foreach my $fact (keys %{$self->factors()}){
  88         106  
198 146 100 100     461 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
199 96         92 $bool = 0;
200             }
201             }
202              
203 88         222 return $bool;
204              
205             }
206              
207             =head2 factors
208              
209             Title : factors
210             Usage : my $feat = $signalp->factors
211             Function: Get/Set the filters required from the user
212             Returns : hash
213             Args : array reference
214              
215              
216             =cut
217              
218             sub factors {
219              
220 98     98 1 69 my($self, $array) = @_;
221              
222 98 100       120 if($array){
223 10         12 $self->{_factors} = { };
224 10         15 foreach my $f (@$array){
225 17 50       24 if(exists($FACTS->{$f})){
226 17         25 $self->{_factors}->{$f} = 1;
227             }
228             else{
229 0         0 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
230             }
231             }
232             }
233              
234 98         161 return $self->{_factors};
235              
236             }
237              
238             =head2 _parsed
239              
240             Title : _parsed
241             Usage : obj->_parsed()
242             Function: Get/Set if the result is parsed or not
243             Returns : 1/0 scalar
244             Args : On set 1
245              
246              
247             =cut
248              
249             sub _parsed {
250              
251 44     44   39 my($self, $parsed) = @_;
252              
253 44 50       74 if(defined($parsed)){
254 0         0 $self->{_parsed} = $parsed;
255             }
256              
257 44         78 return $self->{_parsed};
258              
259             }
260              
261             =head2 _parse
262              
263             Title : _parse
264             Usage : obj->_parse
265             Function: Parse the SignalP result
266             Returns :
267             Args :
268              
269              
270             =cut
271              
272             sub _parse {
273              
274 44     44   29 my($self) = @_;
275              
276             #Let's read the file...
277 44         81 while (my $line = $self->_readline()) {
278              
279 30         24 chomp $line;
280             #We want to be sure to catch the first non empty line to be ablte to determine
281             #which format we are working with...
282 30 100       161 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
283              
284 10 100       29 if($line =~ /^>(\S+)/){
    50          
285 5         11 $self->_pushback($line);
286 5         6 $self->_parse_summary_format();
287 5         5 last;
288             }
289             elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
290 5         9 $self->_pushback($line);
291 5         10 $self->_parse_short_format();
292 5         6 last;
293             }
294             else{
295 0         0 $self->throw("Unable to determine the format type.");
296             }
297             }
298              
299 44         56 return;
300             }
301              
302             =head2 _parse_summary_format
303              
304             Title : _parse_summary_format
305             Usage : $self->_parse_summary_format
306             Function: Method to parse summary/full format from signalp output
307             It automatically fills filtered features.
308             Returns :
309             Args :
310              
311             =cut
312              
313             sub _parse_summary_format {
314              
315 5     5   6 my($self) = @_;
316              
317 5         4 my $feature = undef;
318 5         5 my $ok = 0;
319              
320 5         7 while(my $line = $self->_readline()){
321              
322 245 100       344 if($line =~ /^SignalP-NN result:/){
323 38         52 $self->_pushback($line);
324 38         47 $feature = $self->_parse_nn_result($feature);
325             }
326 245 100       290 if($line =~ /^SignalP-HMM result:/){
327 27         38 $self->_pushback($line);
328 27         39 $feature = $self->_parse_hmm_result($feature);
329             }
330              
331 245 100 66     622 if($line =~ /^---------/ && $feature){
332 42         53 my $new_feature = $self->create_feature($feature);
333 42 100       55 push @{$self->{_features}}, $new_feature if $new_feature;
  16         19  
334 42         118 $feature = undef;
335             }
336             }
337              
338 5         11 return;
339             }
340              
341              
342             =head2 _parse_nn_result
343              
344             Title : _parse_nn_result
345             Usage : obj->_parse_nn_result
346             Function: Parses the Neuronal Network (NN) part of the result
347             Returns : Hash reference
348             Args :
349              
350              
351             =cut
352              
353             sub _parse_nn_result {
354              
355 38     38   26 my($self, $feature) = @_;
356              
357 38         30 my $ok = 0;
358 38         32 my %facts;
359              
360             #SignalP-NN result:
361             #>MGG_11635.5 length = 100
362             ## Measure Position Value Cutoff signal peptide?
363             # max. C 37 0.087 0.32 NO
364             # max. Y 37 0.042 0.33 NO
365             # max. S 3 0.062 0.87 NO
366             # mean S 1-36 0.024 0.48 NO
367             # D 1-36 0.033 0.43 NO
368              
369 38         43 while(my $line = $self->_readline()){
370              
371 350         260 chomp $line;
372              
373 350 100       439 if($line =~ /^SignalP-NN result:/){
374 38         27 $ok = 1;
375 38         67 next;
376             }
377              
378 312 50       373 $self->throw("Wrong line for parsing NN results.") unless $ok;
379              
380 312 100       1259 if ($line=~/^\>(\S+)\s+length/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
381 38         52 $self->seqname($1);
382 38         37 %facts = ();
383 38         55 next;
384             }
385             elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
386 38         73 $feature->{maxCprob} = $1;
387 38         39 $facts{maxC} = $2;
388 38         56 next;
389             }
390             elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
391 38         53 $feature->{maxYprob} = $1;
392 38         34 $facts{maxY} = $2;
393 38         66 next;
394             }
395             elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
396 38         57 $feature->{maxSprob} = $1;
397 38         38 $facts{maxS} = $2;
398 38         59 next;
399             }
400             elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
401 38         50 $feature->{meanSprob} = $1;
402 38         40 $facts{meanS} = $2;
403 38         63 next;
404             }
405             elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
406 38         51 $feature->{Dprob} = $1;
407 38         51 $facts{D} = $2;
408 38         64 next;
409             }
410             #If we don't have this line it means that all the factors cutoff are equal to 'NO'
411             elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
412             #if($self->_filterok(\%facts)){
413             #$feature->{name} = $self->seqname();
414             #$feature->{start} = 1;
415 8         26 $feature->{end} = $1 + 1; #To be consistent with end given in short format
416             #}
417             #return $feature;
418             }
419             elsif($line =~ /^\s*$/){
420 38         36 last;
421             }
422             }
423              
424 38 100       62 if($self->_filterok(\%facts)){
425 10         16 $feature->{name} = $self->seqname();
426 10         15 $feature->{start} = 1;
427 10         12 $feature->{nnPrediction} = 'signal-peptide';
428             }
429              
430 38         57 return $feature;
431             }
432              
433              
434             =head2 _parse_hmm_result
435              
436             Title : _parse_hmm_result
437             Usage : obj->_parse_hmm_result
438             Function: Parses the Hiden Markov Model (HMM) part of the result
439             Returns : Hash reference
440             Args :
441              
442             =cut
443              
444             sub _parse_hmm_result {
445              
446 27     27   20 my ($self, $feature_hash) = @_;
447              
448 27         26 my $ok = 0;
449              
450             #SignalP-HMM result:
451             #>MGG_11635.5
452             #Prediction: Non-secretory protein
453             #Signal peptide probability: 0.000
454             #Signal anchor probability: 0.000
455             #Max cleavage site probability: 0.000 between pos. -1 and 0
456              
457 27         37 while(my $line = $self->_readline()){
458              
459 213         147 chomp $line;
460 213 100       364 next if $line =~ /^\s*$/o;
461              
462 202 100       231 if($line =~ /^SignalP-HMM result:/){
463 30         23 $ok = 1;
464 30         50 next;
465             }
466              
467 172 50       198 $self->throw("Wrong line for parsing HMM result.") unless $ok;
468              
469 172 100       501 if($line =~ /^>(\S+)/){
    100          
    100          
    100          
    100          
470             #In case we already seen a name with NN results
471 35 100       47 $feature_hash->{name} = $1 unless $self->seqname();
472             }
473             elsif($line =~ /Prediction: (.+)$/){
474 30         80 $feature_hash->{hmmPrediction} = $1;
475             }
476             elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
477 30         71 $feature_hash->{peptideProb} = $1;
478             }
479             elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
480 30         78 $feature_hash->{anchorProb} = $1;
481             }
482             elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
483 24         36 $feature_hash->{cleavageSiteProb} = $1;
484             #Strange case, if we don't have an end value in NN result (no nn method launched)
485             #We try anyway to get an end value, unless this value is lower than 1 which is
486             #the start
487 24 100 66     106 $feature_hash->{end} = $2 if($2 > 1 && !$feature_hash->{end});
488 24 100       40 $feature_hash->{start} = 1 unless $feature_hash->{start};
489 24         25 last;
490             }
491             }
492              
493 27         32 return $feature_hash;
494             }
495              
496             =head2 _parse_short_format
497              
498             Title : _parse_short_format
499             Usage : $self->_parse_short_format
500             Function: Method to parse short format from signalp output
501             It automatically fills filtered features.
502             Returns :
503             Args :
504              
505             =cut
506              
507             sub _parse_short_format {
508              
509 5     5   3 my($self) = @_;
510              
511 5         4 my $ok = 0;
512 5         4 my $method = undef;
513 5         11 $self->{_oformat} = 'short';
514              
515             #Output example
516             # SignalP-NN euk predictions # SignalP-HMM euk predictions
517             # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
518             #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
519             #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
520             #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
521             #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
522              
523 5         8 while(my $line = $self->_readline()){
524            
525 60         54 chomp $line;
526 60 100       458 next if $line =~ /^\s*$|^# name/;
527              
528 55 100       71 if($line =~ /^#/){
529 5 100       16 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
    100          
530             'both' : $line =~ /SignalP-NN/ ?
531             'nn' : 'hmm';
532 5         9 next;
533             }
534              
535             #$self->throw("It looks like the format is not 'short' format.") unless($ok);
536              
537 50         275 my @data = split(/\s+/, $line);
538 50         80 $self->seqname($data[0]);
539              
540 50         45 my $factors = { };
541 50         39 my $feature = { };
542              
543             #NN results gives more fields than HMM
544 50 100 100     129 if($method eq 'both' || $method eq 'nn'){
    50          
545              
546 40         66 $feature->{maxCprob} = $data[1];
547 40         37 $factors->{maxC} = $data[3];
548 40         31 $feature->{maxYprob} = $data[4];
549 40         32 $factors->{maxY} = $data[6];
550 40         27 $feature->{maxSprob} = $data[7];
551 40         30 $factors->{maxS} = $data[9];
552 40         34 $feature->{meanSprob}= $data[10];
553 40         33 $factors->{meanS} = $data[11];
554 40         27 $feature->{Dprob} = $data[12];
555 40         31 $factors->{D} = $data[13];
556             #It looks like the max Y position is reported as the most likely cleavage position
557 40         38 $feature->{end} = $data[5];
558 40         35 $feature->{nnPrediction} = 'signal-peptide';
559              
560 40 100       49 if($method eq 'both'){
561 20 100       48 $feature->{hmmPrediction} = $data[15] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
562 20         19 $feature->{cleavageSiteProb} = $data[16];
563 20         18 $feature->{peptideProb} = $data[19];
564             }
565             }
566             elsif($method eq 'hmm'){
567             #In short output anchor probability is not given
568 10 100       20 $feature->{hmmPrediction} = $data[1] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
569 10         10 $feature->{cleavageSiteProb} = $data[2];
570 10         8 $feature->{peptideProb} = $data[5];
571             #It looks like the max cleavage probability position is given by the Cmax proability
572 10         12 $feature->{end} = $data[3];
573             }
574              
575             #Unfortunately, we cannot parse the filters for hmm method.
576 50 100       55 if($self->_filterok($factors)){
577 20         23 $feature->{name} = $self->seqname();
578 20         22 $feature->{start} = 1;
579 20         21 $feature->{source} = 'Signalp';
580 20         23 $feature->{primary} = 'signal_peptide';
581 20         14 $feature->{program} = 'Signalp';
582 20         19 $feature->{logic_name} = 'signal_peptide';
583              
584 20         22 my $new_feat = $self->create_feature($feature);
585 20 100       38 push @{$self->{_features}}, $new_feat if $new_feat;
  18         92  
586             }
587             }
588              
589 5         6 return;
590             }
591              
592             =head2 create_feature
593              
594             Title : create_feature
595             Usage : obj->create_feature(\%feature)
596             Function: Internal(not to be used directly)
597             Returns :
598             Args :
599              
600              
601             =cut
602              
603             sub create_feature {
604              
605 62     62 1 53 my ($self, $feat) = @_;
606              
607             #If we don't have neither start nor end, we return.
608 62 100 66     155 unless($feat->{name} && $feat->{start} && $feat->{end}){
      33        
609 28         31 return;
610             }
611              
612             # create feature object
613             my $feature = Bio::SeqFeature::Generic->new(
614             -seq_id => $feat->{name},
615             -start => $feat->{start},
616             -end => $feat->{end},
617 34 100       189 -score => defined($feat->{peptideProb}) ? $feat->{peptideProb} : '',
618             -source => 'Signalp',
619             -primary => 'signal_peptide',
620             -logic_name => 'signal_peptide',
621             );
622              
623 34         71 $feature->add_tag_value('peptideProb', $feat->{peptideProb});
624 34         60 $feature->add_tag_value('anchorProb', $feat->{anchorProb});
625 34         70 $feature->add_tag_value('evalue',$feat->{anchorProb});
626 34         50 $feature->add_tag_value('percent_id','NULL');
627 34         51 $feature->add_tag_value("hid",$feat->{primary});
628 34         51 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction});
629 34 100       73 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb}) if($feat->{cleavageSiteProb});
630 34 100       63 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction}) if($feat->{nnPrediction});
631 34 100       59 $feature->add_tag_value('maxCprob', $feat->{maxCprob}) if(defined($feat->{maxCprob}));
632 34 100       68 $feature->add_tag_value('maxSprob', $feat->{maxSprob}) if(defined($feat->{maxSprob}));
633 34 100       54 $feature->add_tag_value('maxYprob', $feat->{maxYprob}) if(defined($feat->{maxYprob}));
634 34 100       61 $feature->add_tag_value('meanSprob', $feat->{meanSprob}) if(defined($feat->{meanSprob}));
635 34 100       61 $feature->add_tag_value('Dprob', $feat->{Dprob}) if(defined($feat->{Dprob}));
636              
637 34         36 return $feature;
638              
639             }
640              
641             =head2 seqname
642              
643             Title : seqname
644             Usage : obj->seqname($name)
645             Function: Internal(not to be used directly)
646             Returns :
647             Args :
648              
649              
650             =cut
651              
652             sub seqname{
653 153     153 1 157 my ($self,$seqname)=@_;
654              
655 153 100       195 if (defined($seqname)){
656 88         99 $self->{'seqname'} = $seqname;
657             }
658              
659 153         200 return $self->{'seqname'};
660              
661             }
662              
663              
664             1;
665              
666