File Coverage

Bio/Tools/Signalp/ExtendedSignalp.pm
Criterion Covered Total %
statement 194 197 98.4
branch 107 116 92.2
condition 18 22 81.8
subroutine 16 16 100.0
pod 5 5 100.0
total 340 356 95.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Emmanuel Quevillon
7             #
8             # Copyright Emmanuel Quevillon
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Signalp::ExtendedSignalp;
21             my $params = [qw(maxC maxY maxS meanS D)];
22             my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
23             -fh => $filehandle
24             -factors => $params
25             );
26              
27             $parser->factors($params);
28             while( my $sp_feat = $parser->next_feature ) {
29             #do something
30             #eg
31             push @sp_feat, $sp_feat;
32             }
33              
34             =head1 DESCRIPTION
35              
36             # Please direct questions and support issues to I
37              
38             Parser module for Signalp.
39              
40             Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
41             originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
42             Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
43             Informatics team (fuguteam@fugu-sg.org)
44              
45             You may distribute this module under the same terms as perl itself
46              
47             Compared to the original SignalP, this method allow the user to filter results
48             out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
49             method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
50              
51             The user must be aware that the filters can only by applied on NN method.
52             Also, to ensure the compatibility with original Signalp parsing module, the user
53             must know that by default, if filters are empty, max Y and mean S filters are
54             automatically used to filter results.
55              
56             If the used gives a list, then the parser will only report protein having 'YES'
57             for each factor.
58              
59             This module supports parsing for full, summary and short output form signalp.
60             Actually, full and summary are equivalent in terms of filtering results.
61              
62             =head1 FEEDBACK
63              
64             =head2 Mailing Lists
65              
66             User feedback is an integral part of the evolution of this and other
67             Bioperl modules. Send your comments and suggestions preferably to
68             the Bioperl mailing list. Your participation is much appreciated.
69              
70             bioperl-l@bioperl.org - General discussion
71             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72              
73             =head2 Support
74              
75             Please direct usage questions or support issues to the mailing list:
76              
77             I
78              
79             rather than to the module maintainer directly. Many experienced and
80             reponsive experts will be able look at the problem and quickly
81             address it. Please include a thorough description of the problem
82             with code and data examples if at all possible.
83              
84             =head2 Reporting Bugs
85              
86             Report bugs to the Bioperl bug tracking system to help us keep track
87             of the bugs and their resolution. Bug reports can be submitted via
88             the web:
89              
90             https://github.com/bioperl/bioperl-live/issues
91              
92             =head1 AUTHOR
93              
94             Based on the Bio::Tools::Signalp module
95             Emmanuel Quevillon
96              
97             =head1 APPENDIX
98              
99             The rest of the documentation details each of the object methods.
100             Internal methods are usually preceded with a _
101              
102             =cut
103              
104             package Bio::Tools::Signalp::ExtendedSignalp;
105              
106 1     1   458 use strict;
  1         1  
  1         23  
107 1     1   4 use Data::Dumper;
  1         2  
  1         52  
108 1     1   323 use Bio::SeqFeature::Generic;
  1         3  
  1         28  
109             # don't need Bio::Root::Root/IO (already in inheritance tree)
110 1     1   6 use base qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
  1         1  
  1         237  
111              
112             #Supported arguments
113             my $FACTS = {
114             'maxC' => 1,
115             'maxS' => 1,
116             'maxY' => 1,
117             'meanS' => 1,
118             'D' => 1,
119             };
120              
121             =head2 new
122              
123             Title : new
124             Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
125             Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
126             Returns : Bio::Tools::Signalp::ExtendedSignalp
127             Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
128              
129              
130             =cut
131              
132             sub new {
133 10     10 1 122 my($class,@args) = @_;
134              
135 10         40 my $self = $class->SUPER::new(@args);
136 10         26 $self->_initialize_io(@args);
137              
138 10         29 my $factors = $self->_rearrange([qw(FACTORS)], @args);
139             #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
140 10 100 100     34 if($factors && scalar(@$factors)){
141 4         7 $factors = $factors;
142             }
143             else{
144 6         14 $factors = [qw(maxY meanS)];
145             }
146 10 50       29 $factors && $self->factors($factors);
147            
148 10         40 return $self;
149             }
150              
151             =head2 next_feature
152              
153             Title : next_feature
154             Usage : my $feat = $signalp->next_feature
155             Function: Get the next result feature from parser data
156             Returns : Bio::SeqFeature::Generic
157             Args : none
158              
159              
160             =cut
161              
162             sub next_feature {
163              
164 44     44 1 2767 my ($self) = @_;
165              
166 44 50       75 if(!$self->_parsed()){
167 44         69 $self->_parse();
168             }
169              
170 44   100     52 return shift @{$self->{_features}} || undef;
171              
172             }
173              
174             =head2 _filterok
175              
176             Title : _filterok
177             Usage : my $feat = $signalp->_filterok
178             Function: Check if the factors required by the user are all ok.
179             Returns : 1/0
180             Args : hash reference
181              
182              
183             =cut
184              
185             sub _filterok {
186              
187 88     88   119 my($self, $hash) = @_;
188              
189             #We hope everything will be fine ;)
190 88         95 my $bool = 1;
191              
192             #If the user did not give any filter, we keep eveything
193 88 50       92 return $bool unless keys %{$self->{_factors}};
  88         187  
194              
195             #If only one of the factors parsed is equal to NO based on the user factors cutoff
196             #Then the filter is not ok.
197 88         112 foreach my $fact (keys %{$self->factors()}){
  88         114  
198 146 100 100     474 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
199 96         138 $bool = 0;
200             }
201             }
202              
203 88         272 return $bool;
204              
205             }
206              
207             =head2 factors
208              
209             Title : factors
210             Usage : my $feat = $signalp->factors
211             Function: Get/Set the filters required from the user
212             Returns : hash
213             Args : array reference
214              
215              
216             =cut
217              
218             sub factors {
219              
220 98     98 1 141 my($self, $array) = @_;
221              
222 98 100       136 if($array){
223 10         19 $self->{_factors} = { };
224 10         18 foreach my $f (@$array){
225 17 50       32 if(exists($FACTS->{$f})){
226 17         33 $self->{_factors}->{$f} = 1;
227             }
228             else{
229 0         0 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
230             }
231             }
232             }
233              
234 98         282 return $self->{_factors};
235              
236             }
237              
238             =head2 _parsed
239              
240             Title : _parsed
241             Usage : obj->_parsed()
242             Function: Get/Set if the result is parsed or not
243             Returns : 1/0 scalar
244             Args : On set 1
245              
246              
247             =cut
248              
249             sub _parsed {
250              
251 44     44   65 my($self, $parsed) = @_;
252              
253 44 50       85 if(defined($parsed)){
254 0         0 $self->{_parsed} = $parsed;
255             }
256              
257 44         86 return $self->{_parsed};
258              
259             }
260              
261             =head2 _parse
262              
263             Title : _parse
264             Usage : obj->_parse
265             Function: Parse the SignalP result
266             Returns :
267             Args :
268              
269              
270             =cut
271              
272             sub _parse {
273              
274 44     44   61 my($self) = @_;
275              
276             #Let's read the file...
277 44         102 while (my $line = $self->_readline()) {
278              
279 30         40 chomp $line;
280             #We want to be sure to catch the first non empty line to be ablte to determine
281             #which format we are working with...
282 30 100       172 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
283              
284 10 100       44 if($line =~ /^>(\S+)/){
    50          
285 5         15 $self->_pushback($line);
286 5         12 $self->_parse_summary_format();
287 5         8 last;
288             }
289             elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
290 5         15 $self->_pushback($line);
291 5         12 $self->_parse_short_format();
292 5         8 last;
293             }
294             else{
295 0         0 $self->throw("Unable to determine the format type.");
296             }
297             }
298              
299 44         59 return;
300             }
301              
302             =head2 _parse_summary_format
303              
304             Title : _parse_summary_format
305             Usage : $self->_parse_summary_format
306             Function: Method to parse summary/full format from signalp output
307             It automatically fills filtered features.
308             Returns :
309             Args :
310              
311             =cut
312              
313             sub _parse_summary_format {
314              
315 5     5   6 my($self) = @_;
316              
317 5         7 my $feature = undef;
318 5         7 my $ok = 0;
319              
320 5         8 while(my $line = $self->_readline()){
321              
322 245 100       381 if($line =~ /^SignalP-NN result:/){
323 38         75 $self->_pushback($line);
324 38         55 $feature = $self->_parse_nn_result($feature);
325             }
326 245 100       363 if($line =~ /^SignalP-HMM result:/){
327 27         55 $self->_pushback($line);
328 27         47 $feature = $self->_parse_hmm_result($feature);
329             }
330              
331 245 100 66     590 if($line =~ /^---------/ && $feature){
332 42         73 my $new_feature = $self->create_feature($feature);
333 42 100       69 push @{$self->{_features}}, $new_feature if $new_feature;
  16         29  
334 42         137 $feature = undef;
335             }
336             }
337              
338 5         14 return;
339             }
340              
341              
342             =head2 _parse_nn_result
343              
344             Title : _parse_nn_result
345             Usage : obj->_parse_nn_result
346             Function: Parses the Neuronal Network (NN) part of the result
347             Returns : Hash reference
348             Args :
349              
350              
351             =cut
352              
353             sub _parse_nn_result {
354              
355 38     38   52 my($self, $feature) = @_;
356              
357 38         41 my $ok = 0;
358 38         41 my %facts;
359              
360             #SignalP-NN result:
361             #>MGG_11635.5 length = 100
362             ## Measure Position Value Cutoff signal peptide?
363             # max. C 37 0.087 0.32 NO
364             # max. Y 37 0.042 0.33 NO
365             # max. S 3 0.062 0.87 NO
366             # mean S 1-36 0.024 0.48 NO
367             # D 1-36 0.033 0.43 NO
368              
369 38         51 while(my $line = $self->_readline()){
370              
371 350         414 chomp $line;
372              
373 350 100       497 if($line =~ /^SignalP-NN result:/){
374 38         40 $ok = 1;
375 38         75 next;
376             }
377              
378 312 50       393 $self->throw("Wrong line for parsing NN results.") unless $ok;
379              
380 312 100       1281 if ($line=~/^\>(\S+)\s+length/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
381 38         82 $self->seqname($1);
382 38         41 %facts = ();
383 38         68 next;
384             }
385             elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
386 38         84 $feature->{maxCprob} = $1;
387 38         57 $facts{maxC} = $2;
388 38         66 next;
389             }
390             elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
391 38         67 $feature->{maxYprob} = $1;
392 38         62 $facts{maxY} = $2;
393 38         79 next;
394             }
395             elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
396 38         73 $feature->{maxSprob} = $1;
397 38         45 $facts{maxS} = $2;
398 38         71 next;
399             }
400             elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
401 38         66 $feature->{meanSprob} = $1;
402 38         54 $facts{meanS} = $2;
403 38         67 next;
404             }
405             elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
406 38         68 $feature->{Dprob} = $1;
407 38         55 $facts{D} = $2;
408 38         88 next;
409             }
410             #If we don't have this line it means that all the factors cutoff are equal to 'NO'
411             elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
412             #if($self->_filterok(\%facts)){
413             #$feature->{name} = $self->seqname();
414             #$feature->{start} = 1;
415 8         29 $feature->{end} = $1 + 1; #To be consistent with end given in short format
416             #}
417             #return $feature;
418             }
419             elsif($line =~ /^\s*$/){
420 38         48 last;
421             }
422             }
423              
424 38 100       68 if($self->_filterok(\%facts)){
425 10         16 $feature->{name} = $self->seqname();
426 10         20 $feature->{start} = 1;
427 10         14 $feature->{nnPrediction} = 'signal-peptide';
428             }
429              
430 38         73 return $feature;
431             }
432              
433              
434             =head2 _parse_hmm_result
435              
436             Title : _parse_hmm_result
437             Usage : obj->_parse_hmm_result
438             Function: Parses the Hiden Markov Model (HMM) part of the result
439             Returns : Hash reference
440             Args :
441              
442             =cut
443              
444             sub _parse_hmm_result {
445              
446 27     27   37 my ($self, $feature_hash) = @_;
447              
448 27         31 my $ok = 0;
449              
450             #SignalP-HMM result:
451             #>MGG_11635.5
452             #Prediction: Non-secretory protein
453             #Signal peptide probability: 0.000
454             #Signal anchor probability: 0.000
455             #Max cleavage site probability: 0.000 between pos. -1 and 0
456              
457 27         42 while(my $line = $self->_readline()){
458              
459 213         253 chomp $line;
460 213 100       461 next if $line =~ /^\s*$/o;
461              
462 202 100       312 if($line =~ /^SignalP-HMM result:/){
463 30         32 $ok = 1;
464 30         60 next;
465             }
466              
467 172 50       205 $self->throw("Wrong line for parsing HMM result.") unless $ok;
468              
469 172 100       553 if($line =~ /^>(\S+)/){
    100          
    100          
    100          
    100          
470             #In case we already seen a name with NN results
471 35 100       63 $feature_hash->{name} = $1 unless $self->seqname();
472             }
473             elsif($line =~ /Prediction: (.+)$/){
474 30         84 $feature_hash->{hmmPrediction} = $1;
475             }
476             elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
477 30         87 $feature_hash->{peptideProb} = $1;
478             }
479             elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
480 30         98 $feature_hash->{anchorProb} = $1;
481             }
482             elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
483 24         48 $feature_hash->{cleavageSiteProb} = $1;
484             #Strange case, if we don't have an end value in NN result (no nn method launched)
485             #We try anyway to get an end value, unless this value is lower than 1 which is
486             #the start
487 24 100 66     103 $feature_hash->{end} = $2 if($2 > 1 && !$feature_hash->{end});
488 24 100       56 $feature_hash->{start} = 1 unless $feature_hash->{start};
489 24         30 last;
490             }
491             }
492              
493 27         49 return $feature_hash;
494             }
495              
496             =head2 _parse_short_format
497              
498             Title : _parse_short_format
499             Usage : $self->_parse_short_format
500             Function: Method to parse short format from signalp output
501             It automatically fills filtered features.
502             Returns :
503             Args :
504              
505             =cut
506              
507             sub _parse_short_format {
508              
509 5     5   8 my($self) = @_;
510              
511 5         8 my $ok = 0;
512 5         6 my $method = undef;
513 5         15 $self->{_oformat} = 'short';
514              
515             #Output example
516             # SignalP-NN euk predictions # SignalP-HMM euk predictions
517             # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
518             #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
519             #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
520             #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
521             #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
522              
523 5         11 while(my $line = $self->_readline()){
524            
525 60         80 chomp $line;
526 60 100       520 next if $line =~ /^\s*$|^# name/;
527              
528 55 100       102 if($line =~ /^#/){
529 5 100       22 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
    100          
530             'both' : $line =~ /SignalP-NN/ ?
531             'nn' : 'hmm';
532 5         15 next;
533             }
534              
535             #$self->throw("It looks like the format is not 'short' format.") unless($ok);
536              
537 50         333 my @data = split(/\s+/, $line);
538 50         126 $self->seqname($data[0]);
539              
540 50         69 my $factors = { };
541 50         67 my $feature = { };
542              
543             #NN results gives more fields than HMM
544 50 100 100     142 if($method eq 'both' || $method eq 'nn'){
    50          
545              
546 40         73 $feature->{maxCprob} = $data[1];
547 40         55 $factors->{maxC} = $data[3];
548 40         48 $feature->{maxYprob} = $data[4];
549 40         55 $factors->{maxY} = $data[6];
550 40         49 $feature->{maxSprob} = $data[7];
551 40         50 $factors->{maxS} = $data[9];
552 40         45 $feature->{meanSprob}= $data[10];
553 40         51 $factors->{meanS} = $data[11];
554 40         45 $feature->{Dprob} = $data[12];
555 40         45 $factors->{D} = $data[13];
556             #It looks like the max Y position is reported as the most likely cleavage position
557 40         50 $feature->{end} = $data[5];
558 40         42 $feature->{nnPrediction} = 'signal-peptide';
559              
560 40 100       57 if($method eq 'both'){
561 20 100       45 $feature->{hmmPrediction} = $data[15] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
562 20         25 $feature->{cleavageSiteProb} = $data[16];
563 20         23 $feature->{peptideProb} = $data[19];
564             }
565             }
566             elsif($method eq 'hmm'){
567             #In short output anchor probability is not given
568 10 100       34 $feature->{hmmPrediction} = $data[1] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
569 10         20 $feature->{cleavageSiteProb} = $data[2];
570 10         16 $feature->{peptideProb} = $data[5];
571             #It looks like the max cleavage probability position is given by the Cmax proability
572 10         21 $feature->{end} = $data[3];
573             }
574              
575             #Unfortunately, we cannot parse the filters for hmm method.
576 50 100       84 if($self->_filterok($factors)){
577 20         39 $feature->{name} = $self->seqname();
578 20         33 $feature->{start} = 1;
579 20         28 $feature->{source} = 'Signalp';
580 20         43 $feature->{primary} = 'signal_peptide';
581 20         31 $feature->{program} = 'Signalp';
582 20         31 $feature->{logic_name} = 'signal_peptide';
583              
584 20         40 my $new_feat = $self->create_feature($feature);
585 20 100       57 push @{$self->{_features}}, $new_feat if $new_feat;
  18         121  
586             }
587             }
588              
589 5         10 return;
590             }
591              
592             =head2 create_feature
593              
594             Title : create_feature
595             Usage : obj->create_feature(\%feature)
596             Function: Internal(not to be used directly)
597             Returns :
598             Args :
599              
600              
601             =cut
602              
603             sub create_feature {
604              
605 62     62 1 76 my ($self, $feat) = @_;
606              
607             #If we don't have neither start nor end, we return.
608 62 100 66     224 unless($feat->{name} && $feat->{start} && $feat->{end}){
      66        
609 28         62 return;
610             }
611              
612             # create feature object
613             my $feature = Bio::SeqFeature::Generic->new(
614             -seq_id => $feat->{name},
615             -start => $feat->{start},
616             -end => $feat->{end},
617 34 100       205 -score => defined($feat->{peptideProb}) ? $feat->{peptideProb} : '',
618             -source => 'Signalp',
619             -primary => 'signal_peptide',
620             -logic_name => 'signal_peptide',
621             );
622              
623 34         108 $feature->add_tag_value('peptideProb', $feat->{peptideProb});
624 34         108 $feature->add_tag_value('anchorProb', $feat->{anchorProb});
625 34         116 $feature->add_tag_value('evalue',$feat->{anchorProb});
626 34         76 $feature->add_tag_value('percent_id','NULL');
627 34         87 $feature->add_tag_value("hid",$feat->{primary});
628 34         86 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction});
629 34 100       99 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb}) if($feat->{cleavageSiteProb});
630 34 100       83 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction}) if($feat->{nnPrediction});
631 34 100       81 $feature->add_tag_value('maxCprob', $feat->{maxCprob}) if(defined($feat->{maxCprob}));
632 34 100       92 $feature->add_tag_value('maxSprob', $feat->{maxSprob}) if(defined($feat->{maxSprob}));
633 34 100       83 $feature->add_tag_value('maxYprob', $feat->{maxYprob}) if(defined($feat->{maxYprob}));
634 34 100       77 $feature->add_tag_value('meanSprob', $feat->{meanSprob}) if(defined($feat->{meanSprob}));
635 34 100       72 $feature->add_tag_value('Dprob', $feat->{Dprob}) if(defined($feat->{Dprob}));
636              
637 34         54 return $feature;
638              
639             }
640              
641             =head2 seqname
642              
643             Title : seqname
644             Usage : obj->seqname($name)
645             Function: Internal(not to be used directly)
646             Returns :
647             Args :
648              
649              
650             =cut
651              
652             sub seqname{
653 153     153 1 262 my ($self,$seqname)=@_;
654              
655 153 100       238 if (defined($seqname)){
656 88         126 $self->{'seqname'} = $seqname;
657             }
658              
659 153         276 return $self->{'seqname'};
660              
661             }
662              
663              
664             1;
665              
666