File Coverage

Bio/Tools/Genscan.pm
Criterion Covered Total %
statement 151 158 95.5
branch 57 68 83.8
condition 8 12 66.6
subroutine 16 16 100.0
pod 3 3 100.0
total 235 257 91.4


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Genscan
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Hilmar Lapp
7             #
8             # Copyright Hilmar Lapp
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Genscan - Results of one Genscan run
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Genscan;
21              
22             $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
23             # filehandle:
24             $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
25              
26             # parse the results
27             # note: this class is-a Bio::Tools::AnalysisResult which implements
28             # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
29             while($gene = $genscan->next_prediction()) {
30             # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
31             # off Bio::SeqFeature::Gene::Transcript.
32             #
33             # $gene->exons() returns an array of
34             # Bio::Tools::Prediction::Exon objects
35             # all exons:
36             @exon_arr = $gene->exons();
37              
38             # initial exons only
39             @init_exons = $gene->exons('Initial');
40             # internal exons only
41             @intrl_exons = $gene->exons('Internal');
42             # terminal exons only
43             @term_exons = $gene->exons('Terminal');
44             # singleton exons:
45             ($single_exon) = $gene->exons();
46             }
47              
48             # essential if you gave a filename at initialization (otherwise the file
49             # will stay open)
50             $genscan->close();
51              
52             =head1 DESCRIPTION
53              
54             The Genscan module provides a parser for Genscan gene structure prediction
55             output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
56             derived object.
57              
58             This module also implements the Bio::SeqAnalysisParserI interface, and thus
59             can be used wherever such an object fits. See L.
60              
61             =head1 FEEDBACK
62              
63             =head2 Mailing Lists
64              
65             User feedback is an integral part of the evolution of this and other
66             Bioperl modules. Send your comments and suggestions preferably to one
67             of the Bioperl mailing lists. Your participation is much appreciated.
68              
69             bioperl-l@bioperl.org - General discussion
70             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71              
72             =head2 Support
73              
74             Please direct usage questions or support issues to the mailing list:
75              
76             I
77              
78             rather than to the module maintainer directly. Many experienced and
79             reponsive experts will be able look at the problem and quickly
80             address it. Please include a thorough description of the problem
81             with code and data examples if at all possible.
82              
83             =head2 Reporting Bugs
84              
85             Report bugs to the Bioperl bug tracking system to help us keep track
86             the bugs and their resolution. Bug reports can be submitted via the
87             web:
88              
89             https://github.com/bioperl/bioperl-live/issues
90              
91             =head1 AUTHOR - Hilmar Lapp
92              
93             Email hlapp@gmx.net
94              
95             =head1 APPENDIX
96              
97             The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
98              
99             =cut
100              
101              
102             # Let the code begin...
103              
104              
105             package Bio::Tools::Genscan;
106 2     2   881 use strict;
  2         4  
  2         53  
107 2     2   7 use Symbol;
  2         4  
  2         117  
108              
109 2     2   9 use Bio::Root::Root;
  2         3  
  2         39  
110 2     2   253 use Bio::Tools::Prediction::Gene;
  2         4  
  2         53  
111 2     2   272 use Bio::Tools::Prediction::Exon;
  2         3  
  2         46  
112              
113 2     2   9 use base qw(Bio::Tools::AnalysisResult);
  2         4  
  2         1456  
114              
115             my %ExonTags = ('Init' => 'Initial',
116             'Intr' => 'Internal',
117             'Term' => 'Terminal',
118             'Sngl' => '');
119            
120             sub _initialize_state {
121 3     3   10 my ($self,@args) = @_;
122            
123             # first call the inherited method!
124 3         28 $self->SUPER::_initialize_state(@args);
125              
126             # our private state variables
127 3         6 $self->{'_preds_parsed'} = 0;
128 3         7 $self->{'_has_cds'} = 0;
129             # array of pre-parsed predictions
130 3         6 $self->{'_preds'} = [];
131             # seq stack
132 3         8 $self->{'_seqstack'} = [];
133             }
134              
135             =head2 analysis_method
136              
137             Usage : $genscan->analysis_method();
138             Purpose : Inherited method. Overridden to ensure that the name matches
139             /genscan/i.
140             Returns : String
141             Argument : n/a
142              
143             =cut
144              
145             #-------------
146             sub analysis_method {
147             #-------------
148 3     3 1 11 my ($self, $method) = @_;
149 3 50 33     31 if($method && ($method !~ /genscan/i)) {
150 0         0 $self->throw("method $method not supported in " . ref($self));
151             }
152 3         20 return $self->SUPER::analysis_method($method);
153             }
154              
155             =head2 next_feature
156              
157             Title : next_feature
158             Usage : while($gene = $genscan->next_feature()) {
159             # do something
160             }
161             Function: Returns the next gene structure prediction of the Genscan result
162             file. Call this method repeatedly until FALSE is returned.
163              
164             The returned object is actually a SeqFeatureI implementing object.
165             This method is required for classes implementing the
166             SeqAnalysisParserI interface, and is merely an alias for
167             next_prediction() at present.
168              
169             Example :
170             Returns : A Bio::Tools::Prediction::Gene object.
171             Args :
172              
173             =cut
174              
175             sub next_feature {
176 4     4 1 799 my ($self,@args) = @_;
177             # even though next_prediction doesn't expect any args (and this method
178             # does neither), we pass on args in order to be prepared if this changes
179             # ever
180 4         12 return $self->next_prediction(@args);
181             }
182              
183             =head2 next_prediction
184              
185             Title : next_prediction
186             Usage : while($gene = $genscan->next_prediction()) {
187             # do something
188             }
189             Function: Returns the next gene structure prediction of the Genscan result
190             file. Call this method repeatedly until FALSE is returned.
191              
192             Example :
193             Returns : A Bio::Tools::Prediction::Gene object.
194             Args :
195              
196             =cut
197              
198             sub next_prediction {
199 9     9 1 514 my ($self) = @_;
200 9         10 my $gene;
201              
202             # if the prediction section hasn't been parsed yet, we do this now
203 9 100       20 $self->_parse_predictions() unless $self->_predictions_parsed();
204              
205             # get next gene structure
206 9         18 $gene = $self->_prediction();
207              
208 9 100       16 if($gene) {
209             # fill in predicted protein, and if available the predicted CDS
210             #
211 7         12 my ($id, $seq);
212             # use the seq stack if there's a seq on it
213 7         8 my $seqobj = pop(@{$self->{'_seqstack'}});
  7         11  
214 7 50       16 if(! $seqobj) {
215             # otherwise read from input stream
216 7         15 ($id, $seq) = $self->_read_fasta_seq();
217             # there may be no sequence at all, or none any more
218 7 100 66     24 if($id && $seq) {
219 6         28 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
220             '-display_id' => $id,
221             '-alphabet' => "protein");
222             }
223             }
224 7 100       11 if($seqobj) {
225             # check that prediction number matches the prediction number
226             # indicated in the sequence id (there may be incomplete gene
227             # predictions that contain only signals with no associated protein
228             # and CDS, like promoters, poly-A sites etc)
229 6         15 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
230 6         13 my $prednr = $1;
231 6 50       12 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
232             # this is not our sequence, so push back for next prediction
233 0         0 push(@{$self->{'_seqstack'}}, $seqobj);
  0         0  
234             } else {
235 6         23 $gene->predicted_protein($seqobj);
236             # CDS prediction, too?
237 6 50       15 if($self->_has_cds()) {
238 6         14 ($id, $seq) = $self->_read_fasta_seq();
239 6         21 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
240             '-display_id' => $id,
241             '-alphabet' => "dna");
242 6         17 $gene->predicted_cds($seqobj);
243             }
244             }
245             }
246             }
247              
248 9         23 return $gene;
249             }
250              
251             =head2 _parse_predictions
252              
253             Title : _parse_predictions()
254             Usage : $obj->_parse_predictions()
255             Function: Parses the prediction section. Automatically called by
256             next_prediction() if not yet done.
257             Example :
258             Returns :
259              
260             =cut
261              
262             sub _parse_predictions {
263 3     3   6 my ($self) = @_;
264 3         6 my $gene;
265             my $seqname;
266              
267 3         24 while(defined($_ = $self->_readline())) {
268 144 100       751 if(/^\s*(\d+)\.(\d+)/) {
269             # exon or signal
270 85         171 my $prednr = $1;
271 85         103 my $signalnr = $2; # not used presently
272 85 100       110 if(! defined($gene)) {
273 7         38 $gene = Bio::Tools::Prediction::Gene->new(
274             '-primary' => "GenePrediction$prednr",
275             '-source' => 'Genscan');
276             }
277             # split into fields
278 85         143 chomp();
279 85         261 my @flds = split(' ', $_);
280             # create the feature object depending on the type of signal
281 85         80 my $predobj;
282 85         180 my $is_exon = grep {$_ eq $flds[1];} (keys(%ExonTags));
  340         474  
283 85 100       136 if($is_exon) {
284 74         177 $predobj = Bio::Tools::Prediction::Exon->new();
285             } else {
286             # PolyA site, or Promoter
287 11         29 $predobj = Bio::SeqFeature::Generic->new();
288             }
289             # set common fields
290 85         170 $predobj->source_tag('Genscan');
291 85         182 $predobj->score($flds[$#flds]);
292 85 100       201 $predobj->strand((($flds[2] eq '+') ? 1 : -1));
293 85         153 my ($start, $end) = @flds[(3,4)];
294 85 100       127 if($predobj->strand() == 1) {
295 10         21 $predobj->start($start);
296 10         17 $predobj->end($end);
297             } else {
298 75         156 $predobj->end($start);
299 75         114 $predobj->start($end);
300             }
301             # add to gene structure (should be done only when start and end
302             # are set, in order to allow for proper expansion of the range)
303 85 100       142 if($is_exon) {
    100          
    50          
304             # first, set fields unique to exons
305 74         154 $predobj->start_signal_score($flds[8]);
306 74         164 $predobj->end_signal_score($flds[9]);
307 74         183 $predobj->coding_signal_score($flds[10]);
308 74         152 $predobj->significance($flds[11]);
309 74         227 $predobj->primary_tag($ExonTags{$flds[1]} . 'Exon');
310 74         135 $predobj->is_coding(1);
311             # Figure out the frame of this exon. This is NOT the frame
312             # given by Genscan, which is the absolute frame of the base
313             # starting the first predicted complete codon. By comparing
314             # to the absolute frame of the first base we can compute the
315             # offset of the first complete codon to the first base of the
316             # exon, which determines the frame of the exon.
317 74         68 my $cod_offset;
318 74 100       107 if($predobj->strand() == 1) {
319 6         14 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
320             # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
321             # to offsets 2 and 1, resp. Offset 3 is the same as 0.
322 6 50       15 $cod_offset += 3 if($cod_offset < 1);
323             } else {
324             # On the reverse strand the Genscan frame also refers to
325             # the first base of the first complete codon, but viewed
326             # from forward, which is the third base viewed from
327             # reverse.
328 68         113 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
329             # Possible values are -2, -1, 0, 1, 2. Due to the reverse
330             # situation, {2,-1} and {1,-2} correspond to offsets
331             # 1 and 2, resp. Offset 3 is the same as 0.
332 68 100       121 $cod_offset -= 3 if($cod_offset >= 0);
333 68         82 $cod_offset = -$cod_offset;
334             }
335             # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
336             # is the frame of the first base relative to the exon, or the
337             # number of bases the first codon is missing).
338 74         169 $predobj->frame(3 - $cod_offset);
339             # then add to gene structure object
340 74         220 $gene->add_exon($predobj, $ExonTags{$flds[1]});
341             } elsif($flds[1] eq 'PlyA') {
342 6         13 $predobj->primary_tag("PolyAsite");
343 6         24 $gene->poly_A_site($predobj);
344             } elsif($flds[1] eq 'Prom') {
345 5         12 $predobj->primary_tag("Promoter");
346 5         18 $gene->add_promoter($predobj);
347             }
348 85         309 next;
349             }
350 59 100 100     200 if(/^\s*$/ && defined($gene)) {
351             # current gene is completed
352 7         26 $gene->seq_id($seqname);
353 7         20 $self->_add_prediction($gene);
354 7         9 $gene = undef;
355 7         12 next;
356             }
357 52 100       87 if(/^(GENSCAN)\s+(\S+)/) {
358 3         11 $self->analysis_method($1);
359 3         20 $self->analysis_method_version($2);
360 3         9 next;
361             }
362 49 100       83 if(/^Sequence\s+(\S+)\s*:/) {
363 3         10 $seqname = $1;
364 3         8 next;
365             }
366            
367 46 100       79 if(/^Parameter matrix:\s+(\S+)/i) {
368 3         22 $self->analysis_subject($1);
369 3         7 next;
370             }
371            
372 43 100       66 if(/^Predicted coding/) {
373 3         10 $self->_has_cds(1);
374 3         4 next;
375             }
376 40 100       85 /^>/ && do {
377             # section of predicted sequences
378 2         37 $self->_pushback($_);
379 2         3 last;
380             };
381             }
382 3         26 $self->_predictions_parsed(1);
383             }
384              
385             =head2 _prediction
386              
387             Title : _prediction()
388             Usage : $gene = $obj->_prediction()
389             Function: internal
390             Example :
391             Returns :
392              
393             =cut
394              
395             sub _prediction {
396 9     9   12 my ($self) = @_;
397              
398 9 100 66     23 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
  9         24  
399 7         6 return shift(@{$self->{'_preds'}});
  7         39  
400             }
401              
402             =head2 _add_prediction
403              
404             Title : _add_prediction()
405             Usage : $obj->_add_prediction($gene)
406             Function: internal
407             Example :
408             Returns :
409              
410             =cut
411              
412             sub _add_prediction {
413 7     7   13 my ($self, $gene) = @_;
414              
415 7 50       15 if(! exists($self->{'_preds'})) {
416 0         0 $self->{'_preds'} = [];
417             }
418 7         9 push(@{$self->{'_preds'}}, $gene);
  7         14  
419             }
420              
421             =head2 _predictions_parsed
422              
423             Title : _predictions_parsed
424             Usage : $obj->_predictions_parsed
425             Function: internal
426             Example :
427             Returns : TRUE or FALSE
428              
429             =cut
430              
431             sub _predictions_parsed {
432 12     12   20 my ($self, $val) = @_;
433              
434 12 100       27 $self->{'_preds_parsed'} = $val if $val;
435 12 50       25 if(! exists($self->{'_preds_parsed'})) {
436 0         0 $self->{'_preds_parsed'} = 0;
437             }
438 12         30 return $self->{'_preds_parsed'};
439             }
440              
441             =head2 _has_cds
442              
443             Title : _has_cds()
444             Usage : $obj->_has_cds()
445             Function: Whether or not the result contains the predicted CDSs, too.
446             Example :
447             Returns : TRUE or FALSE
448              
449             =cut
450              
451             sub _has_cds {
452 9     9   16 my ($self, $val) = @_;
453              
454 9 100       15 $self->{'_has_cds'} = $val if $val;
455 9 50       19 if(! exists($self->{'_has_cds'})) {
456 0         0 $self->{'_has_cds'} = 0;
457             }
458 9         14 return $self->{'_has_cds'};
459             }
460              
461             =head2 _read_fasta_seq
462              
463             Title : _read_fasta_seq()
464             Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
465             Function: Simple but specialised FASTA format sequence reader. Uses
466             $self->_readline() to retrieve input, and is able to strip off
467             the traling description lines.
468             Example :
469             Returns : An array of two elements.
470              
471             =cut
472              
473             sub _read_fasta_seq {
474 13     13   17 my ($self) = @_;
475 13         17 my ($id, $seq);
476 13         43 local $/ = ">";
477            
478 13         31 my $entry = $self->_readline();
479 13 100       27 if($entry) {
480 12         25 $entry =~ s/^>//;
481             # complete the entry if the first line came from a pushback buffer
482 12         36 while($entry !~ />$/) {
483 2 50       5 last unless $_ = $self->_readline();
484 2         10 $entry .= $_;
485             }
486             # delete everything onwards from an intervening empty line (at the
487             # end there might be statistics stuff)
488 12         53 $entry =~ s/\n\n.*$//s;
489             # id and sequence
490 12 50       134 if($entry =~ /^(\S+)\n([^>]+)/) {
491 12         25 $id = $1;
492 12         27 $seq = $2;
493             } else {
494 0         0 $self->throw("Can't parse Genscan predicted sequence entry");
495             }
496 12         155 $seq =~ s/\s//g; # Remove whitespace
497             }
498 13         46 return ($id, $seq);
499             }
500              
501             1;