File Coverage

Bio/Tools/Fgenesh.pm
Criterion Covered Total %
statement 136 168 80.9
branch 50 80 62.5
condition 10 24 41.6
subroutine 13 16 81.2
pod 3 3 100.0
total 212 291 72.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Fgenesh
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Christopher Dwan (chris@dwan.org)
7             #
8             # Copied, lock stock & barrel from Genscan.pm
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Fgenesh - parse results of one Fgenesh run
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Fgenesh;
21              
22             $fgenesh = Bio::Tools::Fgenesh->new(-file => 'result.fgenesh');
23             # filehandle:
24             $fgenesh = Bio::Tools::Fgenesh->new( -fh => \*INPUT );
25              
26             # parse the results
27             # note: this class is-a Bio::Tools::AnalysisResult which implements
28             # Bio::SeqAnalysisParserI, i.e., $fgensh->next_feature() is the same
29             while($gene = $fgenesh->next_prediction()) {
30             # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
31             # off Bio::SeqFeature::Gene::Transcript.
32             #
33             # $gene->exons() returns an array of
34             # Bio::Tools::Prediction::Exon objects
35             # all exons:
36             @exon_arr = $gene->exons();
37              
38             # initial exons only
39             @init_exons = $gene->exons('Initial');
40             # internal exons only
41             @intrl_exons = $gene->exons('Internal');
42             # terminal exons only
43             @term_exons = $gene->exons('Terminal');
44             # singleton exons:
45             ($single_exon) = $gene->exons();
46             }
47              
48             # essential if you gave a filename at initialization (otherwise the file
49             # will stay open)
50             $fgenesh->close();
51              
52             =head1 DESCRIPTION
53              
54             The Fgenesh module provides a parser for Fgenesh (version 2) gene structure
55             prediction output. It parses one gene prediction into a
56             Bio::SeqFeature::Gene::Transcript- derived object.
57              
58             This module also implements the L interface, and thus
59             can be used wherever such an object fits.
60              
61             =head1 FEEDBACK
62              
63             =head2 Mailing Lists
64              
65             User feedback is an integral part of the evolution of this and other
66             Bioperl modules. Send your comments and suggestions preferably to one
67             of the Bioperl mailing lists. Your participation is much appreciated.
68              
69             bioperl-l@bioperl.org - General discussion
70             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71              
72             =head2 Support
73              
74             Please direct usage questions or support issues to the mailing list:
75              
76             I
77              
78             rather than to the module maintainer directly. Many experienced and
79             reponsive experts will be able look at the problem and quickly
80             address it. Please include a thorough description of the problem
81             with code and data examples if at all possible.
82              
83             =head2 Reporting Bugs
84              
85             Report bugs to the Bioperl bug tracking system to help us keep track
86             the bugs and their resolution. Bug reports can be submitted via the
87             web:
88              
89             https://github.com/bioperl/bioperl-live/issues
90              
91             =head1 AUTHOR - Chris Dwan
92              
93             Email chris-at-dwan.org
94              
95             =head1 APPENDIX
96              
97             The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
98              
99             =cut
100              
101              
102             # Let the code begin...
103              
104              
105             package Bio::Tools::Fgenesh;
106 1     1   579 use strict;
  1         1  
  1         23  
107 1     1   3 use Symbol;
  1         1  
  1         45  
108              
109 1     1   258 use Bio::Root::Root;
  1         2  
  1         34  
110 1     1   279 use Bio::Tools::Prediction::Gene;
  1         2  
  1         34  
111 1     1   304 use Bio::Tools::Prediction::Exon;
  1         2  
  1         27  
112              
113 1     1   4 use base qw(Bio::Tools::AnalysisResult);
  1         1  
  1         277  
114              
115             my %ExonTags = ('CDSf' => 'Initial',
116             'CDSi' => 'Internal',
117             'CDSl' => 'Terminal',
118             'CDSo' => 'Singleton');
119            
120             sub _initialize_state {
121 1     1   3 my ($self,@args) = @_;
122            
123             # first call the inherited method!
124 1         4 $self->SUPER::_initialize_state(@args);
125              
126             # our private state variables
127 1         2 $self->{'_preds_parsed'} = 0;
128 1         3 $self->{'_has_cds'} = 0;
129             # array of pre-parsed predictions
130 1         2 $self->{'_preds'} = [];
131             # seq stack
132 1         3 $self->{'_seqstack'} = [];
133             }
134              
135             =head2 analysis_method
136              
137             Usage : $genscan->analysis_method();
138             Purpose : Inherited method. Overridden to ensure that the name matches
139             /genscan/i.
140             Returns : String
141             Argument : n/a
142              
143             =cut
144              
145             #-------------
146             sub analysis_method {
147             #-------------
148 0     0 1 0 my ($self, $method) = @_;
149 0 0 0     0 if($method && ($method !~ /fgenesh/i)) {
150 0         0 $self->throw("method $method not supported in " . ref($self));
151             }
152 0         0 return $self->SUPER::analysis_method($method);
153             }
154              
155             =head2 next_feature
156              
157             Title : next_feature
158             Usage : while($gene = $fgenesh->next_feature()) {
159             # do something
160             }
161             Function: Returns the next gene structure prediction of the Fgenesh result
162             file. Call this method repeatedly until FALSE is returned.
163              
164             The returned object is actually a SeqFeatureI implementing object.
165             This method is required for classes implementing the
166             SeqAnalysisParserI interface, and is merely an alias for
167             next_prediction() at present.
168              
169             Example :
170             Returns : A Bio::Tools::Prediction::Gene object.
171             Args :
172              
173             =cut
174              
175             sub next_feature {
176 0     0 1 0 my ($self,@args) = @_;
177             # even though next_prediction doesn't expect any args (and this method
178             # does neither), we pass on args in order to be prepared if this changes
179             # ever
180 0         0 return $self->next_prediction(@args);
181             }
182              
183             =head2 next_prediction
184              
185             Title : next_prediction
186             Usage : while($gene = $fgenesh->next_prediction()) { ... }
187             Function: Returns the next gene structure prediction of the Genscan result
188             file. Call this method repeatedly until FALSE is returned.
189             Example :
190             Returns : A Bio::Tools::Prediction::Gene object.
191             Args :
192              
193             =cut
194              
195             sub next_prediction {
196 5     5 1 1236 my ($self) = @_;
197 5         6 my $gene;
198              
199             # if the prediction section hasn't been parsed yet, we do this now
200 5 100       9 $self->_parse_predictions() unless $self->_predictions_parsed();
201              
202             # get next gene structure
203 5         11 $gene = $self->_prediction();
204              
205 5 100       12 if($gene) {
206             # fill in predicted protein, and if available the predicted CDS
207             #
208              
209             # use the seq stack if there's a seq on it
210 4         4 my $seqobj = pop(@{$self->{'_seqstack'}});
  4         5  
211 4         3 my ($id, $seq);
212 4 50       8 unless ($seqobj) {
213 4         7 ($id, $seq) = $self->_read_fasta_seq();
214 4         4 my $alphabet;
215 4 50 33     28 if (($id =~ /mrna/) || ($id =~ /cds/)) { $alphabet = 'dna'; }
  0         0  
216 4         6 else { $alphabet = 'protein'; }
217 4         15 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
218             '-display_id' => $id,
219             '-alphabet' => $alphabet);
220             }
221 4 50       7 if ($seqobj) {
222              
223             # check that prediction number matches the prediction number
224             # indicated in the sequence id (there may be incomplete gene
225             # predictions that contain only signals with no associated protein
226             # prediction.
227              
228 4         10 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
229 4         9 my $prednr = $1;
230 4 50       61 if ($id !~ /_predicted_(\w+)_$prednr/) {
231             # this is not our sequence, so push back for next prediction
232 0         0 push(@{$self->{'_seqstack'}}, $seqobj);
  0         0  
233             } else {
234 4 50 0     21 if ($1 eq "protein") {
    0          
235 4         11 $gene->predicted_protein($seqobj);
236             } elsif (($1 eq "mrna") || ($1 eq "cds")) {
237 0         0 $self->_has_cds(1);
238 0         0 $gene->predicted_cds($seqobj);
239            
240             # Have to go back in and get the protein...
241 0         0 ($id, $seq) = $self->_read_fasta_seq();
242 0 0       0 if ($id =~ /_cds_/) {
243 0         0 ($id, $seq) = $self->_read_fasta_seq();
244             }
245            
246 0         0 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
247             '-display_id' => $id,
248             '-alphabet' => "protein");
249 0         0 $gene->predicted_protein($seqobj);
250             }
251             }
252             }
253             }
254              
255 5         11 return $gene;
256             }
257              
258             =head2 _parse_predictions
259              
260             Title : _parse_predictions()
261             Usage : $obj->_parse_predictions()
262             Function: Parses the prediction section. Automatically called by
263             next_prediction() if not yet done.
264             Example :
265             Returns :
266              
267             =cut
268              
269             sub _parse_predictions {
270 1     1   1 my ($self) = @_;
271 1         2 my $gene;
272             my $seqname;
273              
274 1         9 while(defined($_ = $self->_readline())) {
275              
276 41 100       119 if(/^\s*(\d+)\s+([+\-])/) {
277 27         24 my $line = $_;
278              
279             # exon or signal
280 27         44 my $prednr = $1;
281 27 100       47 my $strand = ($2 eq '+') ? 1 : -1;
282              
283 27 100       35 if(! defined($gene)) {
284 4         20 $gene = Bio::Tools::Prediction::Gene->new(
285             '-primary' => "GenePrediction$prednr",
286             '-source' => 'Fgenesh');
287             }
288             # split into fields
289 27         35 chomp();
290 27         147 my @flds = split(/\s+/, ' ' . $line);
291             ## NB - the above adds leading whitespace before the gene
292             ## number in case there was none (as quick patch to code
293             ## below which expects it but it is not present after 999
294             ## predictions!) This allows >999 predictions to be parsed.
295              
296             # create the feature object depending on the type of signal
297 27         23 my $predobj;
298 27         51 my $is_exon = grep {$line =~ $_} keys(%ExonTags);
  108         603  
299 27         30 my ($start, $end);
300 27 100       33 if($is_exon) {
301 19         52 $predobj = Bio::Tools::Prediction::Exon->new();
302 19         32 $predobj->score($flds[8]);
303 19         17 $start = $flds[5];
304 19         16 $end = $flds[7];
305             } else {
306             # PolyA site, or TSS
307 8         22 $predobj = Bio::SeqFeature::Generic->new();
308 8         16 $predobj->score($flds[5]);
309 8         9 $start = $flds[4];
310 8         9 $end = $flds[4];
311             }
312             # set common fields
313 27         35 $predobj->source_tag('Fgenesh');
314 27         38 $predobj->strand($strand);
315              
316             # Following tactical commenting-out made by
317             # malcolm.cook@stowers-institute.org since coordinate reversal is
318             # apparently vestigial copy/paste detritus from Genscan.pm origins of
319             # this module and this is NOT needed for fgenesh (at least in v
320             # 2.1.4).
321              
322             # if($predobj->strand() == 1) {
323 27         43 $predobj->start($start);
324 27         43 $predobj->end($end);
325             # } else {
326             # $predobj->end($start);
327             # $predobj->start($end);
328             # }
329              
330             # print STDERR "start $start end $end\n";
331             # add to gene structure (should be done only when start and end
332             # are set, in order to allow for proper expansion of the range)
333 27 100       44 if($is_exon) {
    100          
    50          
334             # first, set fields unique to exons
335 19         55 $predobj->primary_tag($ExonTags{$flds[4]} . 'Exon');
336 19         38 $predobj->is_coding(1);
337 19         11 my $cod_offset;
338 19 100       30 if($predobj->strand() == 1) {
339 4         7 $cod_offset = ($flds[9] - $predobj->start()) % 3;
340             # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
341             # to offsets 2 and 1, resp. Offset 3 is the same as 0.
342 4 100       9 $cod_offset += 3 if($cod_offset < 1);
343             } else {
344             # On the reverse strand the Genscan frame also refers to
345             # the first base of the first complete codon, but viewed
346             # from forward, which is the third base viewed from
347             # reverse.
348 15         20 $cod_offset = ($flds[11] - $predobj->end()) % 3;
349             # Possible values are -2, -1, 0, 1, 2. Due to the reverse
350             # situation, {2,-1} and {1,-2} correspond to offsets
351             # 1 and 2, resp. Offset 3 is the same as 0.
352 15 50       49 $cod_offset -= 3 if($cod_offset >= 0);
353 15         14 $cod_offset = -$cod_offset;
354             }
355             # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
356             # is the frame of the first base relative to the exon, or the
357             # number of bases the first codon is missing).
358 19         38 $predobj->frame(3 - $cod_offset);
359             # print STDERR " frame is " . $predobj->frame() . "\n";
360             # then add to gene structure object
361 19         64 $gene->add_exon($predobj, $ExonTags{$flds[1]});
362             } elsif($flds[3] eq 'PolA') {
363 4         8 $predobj->primary_tag("PolyAsite");
364 4         11 $gene->poly_A_site($predobj);
365             } elsif($flds[3] eq 'TSS') {
366 4         9 $predobj->primary_tag("Promoter"); # (hey! a TSS is NOT a promoter... what's going on here?...
367 4         13 $gene->add_promoter($predobj);
368             #I'd like to do this (for now):
369             #$predobj->primary_tag("TSS"); #this is not the right model, but, it IS a feature at least.
370             #but the followg errs out
371             #$gene->add_SeqFeature($predobj); #err: MSG: start is undefined when bounds at Bio::SeqFeature::Generic::add_SeqFeature 671 check since gene has no start yet
372             }
373             else {
374 0         0 $self->throw("unrecognized prediction line: " . $line);
375             }
376 27         105 next;
377             }
378              
379 14 100 100     49 if(/^\s*$/ && defined($gene)) {
380             # current gene is completed
381 4         10 $gene->seq_id($seqname);
382 4         11 $self->_add_prediction($gene);
383 4         3 $gene = undef;
384 4         7 next;
385             }
386              
387 10 50       15 if(/^(FGENESH)\s+([\d\.]+)/) {
388 0         0 $self->analysis_method($1);
389 0         0 $self->analysis_method_version($2);
390 0 0       0 if (/\s(\S+)\sgenomic DNA/) {
391 0         0 $self->analysis_subject($1);
392             }
393 0         0 next;
394             }
395              
396 10 100       20 if(/^\s*Seq name:\s+(\S+)/) {
397 1         3 $seqname = $1;
398 1         3 next;
399             }
400            
401 9 100       21 /^Predicted protein/ && do {
402             # section of predicted sequences
403 1         14 $self->_pushback($_);
404 1         1 last;
405             };
406             }
407              
408 1         5 $self->_predictions_parsed(1);
409             }
410              
411             =head2 _prediction
412              
413             Title : _prediction()
414             Usage : $gene = $obj->_prediction()
415             Function: internal
416             Example :
417             Returns :
418              
419             =cut
420              
421             sub _prediction {
422 5     5   2 my ($self) = @_;
423              
424 5 100 66     12 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
  5         16  
425 4         3 return shift(@{$self->{'_preds'}});
  4         7  
426             }
427              
428             =head2 _add_prediction
429              
430             Title : _add_prediction()
431             Usage : $obj->_add_prediction($gene)
432             Function: internal
433             Example :
434             Returns :
435              
436             =cut
437              
438             sub _add_prediction {
439 4     4   2 my ($self, $gene) = @_;
440              
441 4 50       10 if(! exists($self->{'_preds'})) {
442 0         0 $self->{'_preds'} = [];
443             }
444 4         2 push(@{$self->{'_preds'}}, $gene);
  4         8  
445             }
446              
447             =head2 _predictions_parsed
448              
449             Title : _predictions_parsed
450             Usage : $obj->_predictions_parsed
451             Function: internal
452             Example :
453             Returns : TRUE or FALSE
454              
455             =cut
456              
457             sub _predictions_parsed {
458 6     6   5 my ($self, $val) = @_;
459              
460 6 100       15 $self->{'_preds_parsed'} = $val if $val;
461 6 50       11 if(! exists($self->{'_preds_parsed'})) {
462 0         0 $self->{'_preds_parsed'} = 0;
463             }
464 6         15 return $self->{'_preds_parsed'};
465             }
466              
467             =head2 _has_cds
468              
469             Title : _has_cds()
470             Usage : $obj->_has_cds()
471             Function: Whether or not the result contains the predicted CDSs, too.
472             Example :
473             Returns : TRUE or FALSE
474              
475             =cut
476              
477             sub _has_cds {
478 0     0   0 my ($self, $val) = @_;
479              
480 0 0       0 $self->{'_has_cds'} = $val if $val;
481 0 0       0 if(! exists($self->{'_has_cds'})) {
482 0         0 $self->{'_has_cds'} = 0;
483             }
484 0         0 return $self->{'_has_cds'};
485             }
486              
487             =head2 _read_fasta_seq
488              
489             Title : _read_fasta_seq()
490             Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
491             Function: Simple but specialised FASTA format sequence reader. Uses
492             $self->_readline() to retrieve input, and is able to strip off
493             the traling description lines.
494             Example :
495             Returns : An array of two elements: fasta_id & sequence
496              
497             =cut
498              
499             sub _read_fasta_seq {
500 4     4   5 my ($self) = @_;
501 4         2 my ($id, $seq);
502             #local $/ = ">";
503            
504 4         15 my $entry = $self->_readline();
505             # print " ^^ $entry\n";
506 4 50       8 return unless ($entry);
507 4 100       14 $entry = $self->_readline() if ($entry =~ /^Predicted protein/);
508             # print " ^^ $entry\n";
509              
510             # Pick up the header / id.
511 4 50       11 if ($entry =~ /^>FGENESH:/) {
512 4 50       13 if ($entry =~ /^>FGENESH:\s+(\d+)/) {
    0          
    0          
513             # print STDERR " this is a predicted gene\n";
514 4         10 $id = "_predicted_protein_" . $1;
515             } elsif ($entry =~ /^>FGENESH:\[mRNA\]\s*(\d+)/) {
516             # print STDERR " this is an mRNA\n";
517 0         0 $id = "_predicted_mrna_" . $1;
518             } elsif ($entry =~ /^>FGENESH:\[exon\]\s+Gene:\s*(\d+)/) {
519 0         0 $id = "_predicted_cds_" . $1;
520             }
521 4         2 $seq = "";
522 4         8 $entry = $self->_readline();
523             }
524              
525 4         4 my $done = 0;
526 4         8 while (!$done) {
527             # print "*** $entry\n";
528 19 50 33     31 if (($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)) {
529             # print STDERR " -- informed about an exon header...\n";
530 0         0 $entry = $self->_readline();
531             } else {
532 19         38 $seq .= $entry;
533             # print STDERR " Added $entry\n";
534             }
535              
536 19 100       124 last unless $entry = $self->_readline();
537 18 100 33     54 if (($entry =~ /^>/) &&
      66        
538             (!(($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)))) {
539 3         6 $self->_pushback($entry); last;
  3         4  
540             }
541             }
542              
543             # id and sequence
544 4         25 $seq =~ s/\s//g; # Remove whitespace
545 4         10 return ($id, $seq);
546             }
547              
548             1;