File Coverage

Bio/Tools/Genemark.pm
Criterion Covered Total %
statement 155 175 88.5
branch 55 70 78.5
condition 9 15 60.0
subroutine 18 20 90.0
pod 4 4 100.0
total 241 284 84.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Genemark
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Mark Fiers
7             #
8             # Copyright Hilmar Lapp, Mark Fiers
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Genemark - Results of one Genemark run
17              
18             =head1 SYNOPSIS
19              
20             $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
21             # filehandle:
22             $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
23              
24             # parse the results
25             # note: this class is-a Bio::Tools::AnalysisResult which implements
26             # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27             while($gene = $Genemark->next_prediction()) {
28             # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29             # off Bio::SeqFeature::Gene::Transcript.
30             #
31             # $gene->exons() returns an array of
32             # Bio::Tools::Prediction::Exon objects
33             # all exons:
34             @exon_arr = $gene->exons();
35              
36             # initial exons only
37             @init_exons = $gene->exons('Initial');
38             # internal exons only
39             @intrl_exons = $gene->exons('Internal');
40             # terminal exons only
41             @term_exons = $gene->exons('Terminal');
42             # singleton exons:
43             ($single_exon) = $gene->exons();
44             }
45              
46             # essential if you gave a filename at initialization (otherwise the file
47             # will stay open)
48             $Genemark->close();
49              
50             =head1 DESCRIPTION
51              
52             The Genemark module provides a parser for Genemark gene structure
53             prediction output. It parses one gene prediction into a
54             Bio::SeqFeature::Gene::Transcript- derived object.
55              
56             This module has been developed around genemark.hmm for eukaryots v2.2a
57             and will probably not work with other versions.
58              
59              
60             This module also implements the Bio::SeqAnalysisParserI interface, and
61             thus can be used wherever such an object fits. See
62             L.
63              
64             =head1 FEEDBACK
65              
66             =head2 Mailing Lists
67              
68             User feedback is an integral part of the evolution of this and other
69             Bioperl modules. Send your comments and suggestions preferably to one
70             of the Bioperl mailing lists. Your participation is much appreciated.
71              
72             bioperl-l@bioperl.org - General discussion
73             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74              
75             =head2 Support
76              
77             Please direct usage questions or support issues to the mailing list:
78              
79             I
80              
81             rather than to the module maintainer directly. Many experienced and
82             reponsive experts will be able look at the problem and quickly
83             address it. Please include a thorough description of the problem
84             with code and data examples if at all possible.
85              
86             =head2 Reporting Bugs
87              
88             Report bugs to the Bioperl bug tracking system to help us keep track
89             the bugs and their resolution. Bug reports can be submitted via the
90             web:
91              
92             https://github.com/bioperl/bioperl-live/issues
93              
94             =head1 AUTHOR - Hilmar Lapp, Mark Fiers
95              
96             Email hlapp@gmx.net
97             m.w.e.j.fiers@plant.wag-ur.nl
98              
99             =head1 APPENDIX
100              
101             The rest of the documentation details each of the object
102             methods. Internal methods are usually preceded with a _
103              
104             =cut
105              
106              
107             # Let the code begin...
108              
109              
110             package Bio::Tools::Genemark;
111 1     1   699 use strict;
  1         1  
  1         22  
112 1     1   2 use Symbol;
  1         1  
  1         40  
113              
114 1     1   3 use Bio::Root::Root;
  1         1  
  1         14  
115 1     1   3 use Bio::Tools::Prediction::Gene;
  1         1  
  1         16  
116 1     1   2 use Bio::Tools::Prediction::Exon;
  1         1  
  1         11  
117 1     1   7 use Bio::Seq;
  1         3  
  1         12  
118 1     1   267 use Bio::Factory::FTLocationFactory;
  1         1  
  1         26  
119              
120 1     1   4 use base qw(Bio::Tools::AnalysisResult);
  1         2  
  1         1286  
121              
122             =head2 new
123              
124             Title : new
125             Usage : my $obj = Bio::Tools::Genemark->new();
126             Function: Builds a new Bio::Tools::Genemark object
127             Returns : an instance of Bio::Tools::Genemark
128             Args : seqname
129              
130              
131             =cut
132              
133             sub new {
134 2     2 1 4 my($class,@args) = @_;
135              
136 2         15 my $self = $class->SUPER::new(@args);
137              
138 2         8 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
139              
140             # hardwire seq_id when creating gene and exon objects
141 2 100       10 $self->_seqname($seqname) if defined($seqname);
142              
143 2         4 return $self;
144             }
145              
146             sub _initialize_state {
147 2     2   6 my ($self,@args) = @_;
148              
149             # first call the inherited method!
150 2         9 $self->SUPER::_initialize_state(@args);
151              
152             # our private state variables
153 2         2 $self->{'_preds_parsed'} = 0;
154 2         5 $self->{'_has_cds'} = 0;
155             # array of pre-parsed predictions
156 2         5 $self->{'_preds'} = [];
157             # seq stack
158 2         5 $self->{'_seqstack'} = [];
159             }
160              
161             =head2 analysis_method
162              
163             Usage : $Genemark->analysis_method();
164             Purpose : Inherited method. Overridden to ensure that the name matches
165             /GeneMark.hmm/i.
166             Returns : String
167             Argument : n/a
168              
169             =cut
170              
171             #-------------
172             sub analysis_method {
173             #-------------
174 47     47 1 35 my ($self, $method) = @_;
175 47 50 66     86 if($method && ($method !~ /Genemark\.hmm/i)) {
176 0         0 $self->throw("method $method not supported in " . ref($self));
177             }
178 47         72 return $self->SUPER::analysis_method($method);
179             }
180              
181             =head2 next_feature
182              
183             Title : next_feature
184             Usage : while($gene = $Genemark->next_feature()) {
185             # do something
186             }
187             Function: Returns the next gene structure prediction of the Genemark result
188             file. Call this method repeatedly until FALSE is returned.
189              
190             The returned object is actually a SeqFeatureI implementing object.
191             This method is required for classes implementing the
192             SeqAnalysisParserI interface, and is merely an alias for
193             next_prediction() at present.
194              
195             Example :
196             Returns : A Bio::Tools::Prediction::Gene object.
197             Args :
198              
199             =cut
200              
201             sub next_feature {
202 0     0 1 0 my ($self,@args) = @_;
203             # even though next_prediction doesn't expect any args (and this method
204             # does neither), we pass on args in order to be prepared if this changes
205             # ever
206 0         0 return $self->next_prediction(@args);
207             }
208              
209             =head2 next_prediction
210              
211             Title : next_prediction
212             Usage : while($gene = $Genemark->next_prediction()) {
213             # do something
214             }
215             Function: Returns the next gene structure prediction of the Genemark result
216             file. Call this method repeatedly until FALSE is returned.
217              
218             Example :
219             Returns : A Bio::Tools::Prediction::Gene object.
220             Args :
221              
222             =cut
223              
224             sub next_prediction {
225 16     16 1 5344 my ($self) = @_;
226 16         17 my $gene;
227              
228             # if the prediction section hasn't been parsed yet, we do this now
229 16 100       31 $self->_parse_predictions() unless $self->_predictions_parsed();
230              
231             # get next gene structure
232 16         26 $gene = $self->_prediction();
233              
234 16         24 return $gene;
235             }
236              
237             =head2 _parse_predictions
238              
239             Title : _parse_predictions()
240             Usage : $obj->_parse_predictions()
241             Function: Parses the prediction section. Automatically called by
242             next_prediction() if not yet done.
243             Example :
244             Returns :
245              
246             =cut
247              
248             sub _parse_predictions {
249 2     2   3 my ($self) = @_;
250 2         12 my %exontags = ('Initial' => 'Initial',
251             'Internal' => 'Internal',
252             'Terminal' => 'Terminal',
253             'Single' => '',
254             '_na_' => '');
255 2         3 my $exontag;
256             my $gene;
257 0         0 my $exontype;
258 2         3 my $current_gene_no = -1;
259              
260             # The prediction report does not contain a sequence identifier
261             # (at least the prokaryotic version doesn't)
262 2         6 my $seqname = $self->_seqname();
263            
264 2         11 while(defined($_ = $self->_readline())) {
265              
266 77 100 100     290 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
267              
268             # this is an exon, Genemark doesn't predict anything else
269             # $prednr corresponds to geneno.
270 45         74 my $prednr = $1;
271              
272             #exon no:
273 45         39 my $signalnr = 0;
274 45 100       68 if ($2) { my $signalnr = $2; } # used in tag: exon_no
  43         45  
275            
276             # split into fields
277 45         50 chomp();
278 45         113 my @flds = split(' ', $_);
279              
280             # create the feature (an exon) object
281 45         92 my $predobj = Bio::Tools::Prediction::Exon->new();
282              
283            
284             # define info depending on it being eu- or prokaryot
285 45         39 my ($start, $end, $orientation, $prediction_source);
286              
287 45 100       76 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
288 2         3 $prediction_source = "Genemark.hmm.pro";
289 2 50       7 $orientation = ($flds[1] eq '+') ? 1 : -1;
290 2         5 ($start, $end) = @flds[(2,3)];
291 2         2 $exontag = "_na_";
292              
293             } else {
294 43         38 $prediction_source = "Genemark.hmm.eu";
295 43 100       67 $orientation = ($flds[2] eq '+') ? 1 : -1;
296 43         61 ($start, $end) = @flds[(4,5)];
297 43         40 $exontag = $flds[3];
298             }
299              
300             # instatiate a location object via
301             # Bio::Factory::FTLocationFactory (to handle
302             # inexact coordinates)
303 45         70 my $location_string = join('..', $start, $end);
304 45         89 my $location_factory = Bio::Factory::FTLocationFactory->new();
305 45         95 my $location_obj = $location_factory->from_string($location_string);
306 45         97 $predobj->location($location_obj);
307              
308             #store the data in the exon object
309 45         73 $predobj->source_tag($prediction_source);
310 45         63 $predobj->strand($orientation);
311              
312 45         120 $predobj->primary_tag($exontags{$exontag} . "Exon");
313              
314 45 50       64 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
315              
316 45         58 $predobj->is_coding(1);
317              
318 45 50 33     195 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
319            
320             # frame calculation as in the genscan module
321             # is to be implemented...
322            
323             #If the $prednr is not equal to the current gene, we
324             #need to make a new gene and close the old one
325 45 100       83 if($prednr != $current_gene_no) {
326             # a new gene, store the old one if it exists
327 15 100       24 if (defined ($gene)) {
328 13         20 $gene->seq_id($seqname);
329 13         11 $gene = undef ;
330             }
331             #and make a new one
332 15         50 $gene = Bio::Tools::Prediction::Gene->new
333             (
334             '-primary' => "GenePrediction$prednr",
335             '-source' => $prediction_source);
336 15         34 $self->_add_prediction($gene);
337 15         14 $current_gene_no = $prednr;
338 15 50 33     67 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
339             }
340            
341             # Add the exon to the gene
342             $gene->add_exon($predobj, ($exontag eq "_na_" ?
343 45 100       109 undef : $exontags{$exontag}));
344              
345             }
346              
347 77 100       149 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
348 2         7 $self->analysis_method($1);
349              
350 2         3 my $gm_version = $2;
351              
352 2         8 $self->analysis_method_version($gm_version);
353 2         6 next;
354             }
355              
356             #Matrix file for eukaryot version
357 75 100       134 if (/^Matrices file:\s+(\S+)?/i) {
358 1         6 $self->analysis_subject($1);
359             # since the line after the matrix file is always the date
360             # (in the output file's I have seen!) extract and store this
361             # here
362 1 50       2 if (defined(my $_date = $self->_readline())) {
363 1         2 chomp ($_date);
364 1         5 $self->analysis_date($_date);
365             }
366             }
367            
368             #Matrix file for prokaryot version
369 75 100       92 if (/^Model file name:\s+(\S+)/) {
370 1         7 $self->analysis_subject($1);
371             # since the line after the matrix file is always the date
372             # (in the output file's I have seen!) extract and store this
373             # here
374 1         5 my $_date = $self->_readline() ;
375 1 50       3 if (defined($_date = $self->_readline())) {
376 1         3 chomp ($_date);
377 1         4 $self->analysis_date($_date);
378             }
379             }
380            
381 75 100       117 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
382 1         2 $seqname = $1;
383             # $self->analysis_subject($seqname);
384 1         2 next;
385             }
386            
387              
388 74 100       176 /^>/ && do {
389 1         21 $self->_pushback($_);
390              
391             # section of predicted aa sequences on recognition
392             # of a fasta start, read all sequences and find the
393             # appropriate gene
394 1         2 while (1) {
395 14         23 my ($aa_id, $seq) = $self->_read_fasta_seq();
396 14 100       19 last unless ($aa_id);
397              
398             #now parse through the predictions to add the pred. protein
399 13         11 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
  13         23  
400 91         127 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
401 91         80 my $geneno = $1;
402 91 100       617 if ($aa_id =~ /\|gene.$geneno\|/) {
403             #print "x SEQ : \n $seq \nXXXX\n";
404 13         35 my $seqobj = Bio::Seq->new('-seq' => $seq,
405             '-display_id' => $aa_id,
406             '-alphabet' => "protein");
407 13         27 $gene->predicted_protein($seqobj);
408 13         24 last FINDPRED;
409             }
410              
411             }
412             }
413              
414 1         2 last;
415             };
416             }
417              
418             # if the analysis query object contains a ref to a Seq of PrimarySeq
419             # object, then extract the predicted sequences and add it to the gene
420             # object.
421 2 50       13 if (defined $self->analysis_query()) {
422 0         0 my $orig_seq = $self->analysis_query();
423 0         0 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
  0         0  
424 0         0 my $predseq = "";
425 0         0 foreach my $exon ($gene->exons()) {
426             #print $exon->start() . " " . $exon->end () . "\n";
427 0         0 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
428             }
429              
430 0         0 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
431             '-display_id' => "transl");
432 0         0 $gene->predicted_cds($seqobj);
433             }
434             }
435              
436              
437 2         6 $self->_predictions_parsed(1);
438             }
439              
440             =head2 _prediction
441              
442             Title : _prediction()
443             Usage : $gene = $obj->_prediction()
444             Function: internal
445             Example :
446             Returns :
447              
448             =cut
449              
450             sub _prediction {
451 16     16   11 my ($self) = @_;
452              
453 16 100 66     44 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
  16         52  
454 15         9 return shift(@{$self->{'_preds'}});
  15         28  
455             }
456              
457             =head2 _add_prediction
458              
459             Title : _add_prediction()
460             Usage : $obj->_add_prediction($gene)
461             Function: internal
462             Example :
463             Returns :
464              
465             =cut
466              
467             sub _add_prediction {
468 15     15   13 my ($self, $gene) = @_;
469              
470 15 50       26 if(! exists($self->{'_preds'})) {
471 0         0 $self->{'_preds'} = [];
472             }
473 15         12 push(@{$self->{'_preds'}}, $gene);
  15         22  
474             }
475              
476             =head2 _predictions_parsed
477              
478             Title : _predictions_parsed
479             Usage : $obj->_predictions_parsed
480             Function: internal
481             Example :
482             Returns : TRUE or FALSE
483              
484             =cut
485              
486             sub _predictions_parsed {
487 18     18   15 my ($self, $val) = @_;
488              
489 18 100       34 $self->{'_preds_parsed'} = $val if $val;
490 18 50       35 if(! exists($self->{'_preds_parsed'})) {
491 0         0 $self->{'_preds_parsed'} = 0;
492             }
493 18         38 return $self->{'_preds_parsed'};
494             }
495              
496             =head2 _has_cds
497              
498             Title : _has_cds()
499             Usage : $obj->_has_cds()
500             Function: Whether or not the result contains the predicted CDSs, too.
501             Example :
502             Returns : TRUE or FALSE
503              
504             =cut
505              
506             sub _has_cds {
507 0     0   0 my ($self, $val) = @_;
508              
509 0 0       0 $self->{'_has_cds'} = $val if $val;
510 0 0       0 if(! exists($self->{'_has_cds'})) {
511 0         0 $self->{'_has_cds'} = 0;
512             }
513 0         0 return $self->{'_has_cds'};
514             }
515              
516             =head2 _read_fasta_seq
517              
518             Title : _read_fasta_seq()
519             Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
520             Function: Simple but specialised FASTA format sequence reader. Uses
521             $self->_readline() to retrieve input, and is able to strip off
522             the traling description lines.
523             Example :
524             Returns : An array of two elements.
525              
526             =cut
527              
528             sub _read_fasta_seq {
529 14     14   12 my ($self) = @_;
530 14         9 my ($id, $seq);
531 14         35 local $/ = ">";
532              
533 14 100       25 return 0 unless (my $entry = $self->_readline());
534              
535 13         15 $entry =~ s/^>//;
536             # complete the entry if the first line came from a pushback buffer
537 13         32 while(! ($entry =~ />$/)) {
538 2 100       4 last unless ($_ = $self->_readline());
539 1         5 $entry .= $_;
540             }
541              
542             # delete everything onwards from an new fasta start (>)
543 13         35 $entry =~ s/\n>.*$//s;
544             # id and sequence
545              
546 13 50       44 if($entry =~ s/^(.+)\n//) {
547 13         21 $id = $1;
548 13         22 $id =~ s/ /_/g;
549 13         12 $seq = $entry;
550 13         56 $seq =~ s/\s//g;
551             #print "\n@@ $id \n@@ $seq \n##\n";
552             } else {
553 0         0 $self->throw("Can't parse Genemark predicted sequence entry");
554             }
555 13         21 $seq =~ s/\s//g; # Remove whitespace
556 13         30 return ($id, $seq);
557             }
558              
559             =head2 _seqname
560              
561             Title : _seqname
562             Usage : $obj->_seqname($seqname)
563             Function: internal
564             Example :
565             Returns : String
566              
567             =cut
568              
569             sub _seqname {
570 3     3   3 my ($self, $val) = @_;
571              
572 3 100       7 $self->{'_seqname'} = $val if $val;
573 3 100       8 if(! exists($self->{'_seqname'})) {
574 1         3 $self->{'_seqname'} = 'unknown';
575             }
576 3         4 return $self->{'_seqname'};
577             }
578              
579             1;
580