File Coverage

Bio/Tools/Genemark.pm
Criterion Covered Total %
statement 155 175 88.5
branch 55 70 78.5
condition 9 15 60.0
subroutine 18 20 90.0
pod 4 4 100.0
total 241 284 84.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Genemark
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Mark Fiers
7             #
8             # Copyright Hilmar Lapp, Mark Fiers
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Genemark - Results of one Genemark run
17              
18             =head1 SYNOPSIS
19              
20             $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
21             # filehandle:
22             $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
23              
24             # parse the results
25             # note: this class is-a Bio::Tools::AnalysisResult which implements
26             # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
27             while($gene = $Genemark->next_prediction()) {
28             # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
29             # off Bio::SeqFeature::Gene::Transcript.
30             #
31             # $gene->exons() returns an array of
32             # Bio::Tools::Prediction::Exon objects
33             # all exons:
34             @exon_arr = $gene->exons();
35              
36             # initial exons only
37             @init_exons = $gene->exons('Initial');
38             # internal exons only
39             @intrl_exons = $gene->exons('Internal');
40             # terminal exons only
41             @term_exons = $gene->exons('Terminal');
42             # singleton exons:
43             ($single_exon) = $gene->exons();
44             }
45              
46             # essential if you gave a filename at initialization (otherwise the file
47             # will stay open)
48             $Genemark->close();
49              
50             =head1 DESCRIPTION
51              
52             The Genemark module provides a parser for Genemark gene structure
53             prediction output. It parses one gene prediction into a
54             Bio::SeqFeature::Gene::Transcript- derived object.
55              
56             This module has been developed around genemark.hmm for eukaryots v2.2a
57             and will probably not work with other versions.
58              
59              
60             This module also implements the Bio::SeqAnalysisParserI interface, and
61             thus can be used wherever such an object fits. See
62             L.
63              
64             =head1 FEEDBACK
65              
66             =head2 Mailing Lists
67              
68             User feedback is an integral part of the evolution of this and other
69             Bioperl modules. Send your comments and suggestions preferably to one
70             of the Bioperl mailing lists. Your participation is much appreciated.
71              
72             bioperl-l@bioperl.org - General discussion
73             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74              
75             =head2 Support
76              
77             Please direct usage questions or support issues to the mailing list:
78              
79             I
80              
81             rather than to the module maintainer directly. Many experienced and
82             reponsive experts will be able look at the problem and quickly
83             address it. Please include a thorough description of the problem
84             with code and data examples if at all possible.
85              
86             =head2 Reporting Bugs
87              
88             Report bugs to the Bioperl bug tracking system to help us keep track
89             the bugs and their resolution. Bug reports can be submitted via the
90             web:
91              
92             https://github.com/bioperl/bioperl-live/issues
93              
94             =head1 AUTHOR - Hilmar Lapp, Mark Fiers
95              
96             Email hlapp@gmx.net
97             m.w.e.j.fiers@plant.wag-ur.nl
98              
99             =head1 APPENDIX
100              
101             The rest of the documentation details each of the object
102             methods. Internal methods are usually preceded with a _
103              
104             =cut
105              
106              
107             # Let the code begin...
108              
109              
110             package Bio::Tools::Genemark;
111 1     1   775 use strict;
  1         2  
  1         25  
112 1     1   4 use Symbol;
  1         1  
  1         48  
113              
114 1     1   4 use Bio::Root::Root;
  1         2  
  1         14  
115 1     1   7 use Bio::Tools::Prediction::Gene;
  1         2  
  1         14  
116 1     1   3 use Bio::Tools::Prediction::Exon;
  1         1  
  1         16  
117 1     1   3 use Bio::Seq;
  1         1  
  1         15  
118 1     1   284 use Bio::Factory::FTLocationFactory;
  1         3  
  1         28  
119              
120 1     1   5 use base qw(Bio::Tools::AnalysisResult);
  1         1  
  1         1375  
121              
122             =head2 new
123              
124             Title : new
125             Usage : my $obj = Bio::Tools::Genemark->new();
126             Function: Builds a new Bio::Tools::Genemark object
127             Returns : an instance of Bio::Tools::Genemark
128             Args : seqname
129              
130              
131             =cut
132              
133             sub new {
134 2     2 1 6 my($class,@args) = @_;
135              
136 2         17 my $self = $class->SUPER::new(@args);
137              
138 2         9 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
139              
140             # hardwire seq_id when creating gene and exon objects
141 2 100       11 $self->_seqname($seqname) if defined($seqname);
142              
143 2         5 return $self;
144             }
145              
146             sub _initialize_state {
147 2     2   6 my ($self,@args) = @_;
148              
149             # first call the inherited method!
150 2         10 $self->SUPER::_initialize_state(@args);
151              
152             # our private state variables
153 2         4 $self->{'_preds_parsed'} = 0;
154 2         4 $self->{'_has_cds'} = 0;
155             # array of pre-parsed predictions
156 2         6 $self->{'_preds'} = [];
157             # seq stack
158 2         5 $self->{'_seqstack'} = [];
159             }
160              
161             =head2 analysis_method
162              
163             Usage : $Genemark->analysis_method();
164             Purpose : Inherited method. Overridden to ensure that the name matches
165             /GeneMark.hmm/i.
166             Returns : String
167             Argument : n/a
168              
169             =cut
170              
171             #-------------
172             sub analysis_method {
173             #-------------
174 47     47 1 49 my ($self, $method) = @_;
175 47 50 66     91 if($method && ($method !~ /Genemark\.hmm/i)) {
176 0         0 $self->throw("method $method not supported in " . ref($self));
177             }
178 47         87 return $self->SUPER::analysis_method($method);
179             }
180              
181             =head2 next_feature
182              
183             Title : next_feature
184             Usage : while($gene = $Genemark->next_feature()) {
185             # do something
186             }
187             Function: Returns the next gene structure prediction of the Genemark result
188             file. Call this method repeatedly until FALSE is returned.
189              
190             The returned object is actually a SeqFeatureI implementing object.
191             This method is required for classes implementing the
192             SeqAnalysisParserI interface, and is merely an alias for
193             next_prediction() at present.
194              
195             Example :
196             Returns : A Bio::Tools::Prediction::Gene object.
197             Args :
198              
199             =cut
200              
201             sub next_feature {
202 0     0 1 0 my ($self,@args) = @_;
203             # even though next_prediction doesn't expect any args (and this method
204             # does neither), we pass on args in order to be prepared if this changes
205             # ever
206 0         0 return $self->next_prediction(@args);
207             }
208              
209             =head2 next_prediction
210              
211             Title : next_prediction
212             Usage : while($gene = $Genemark->next_prediction()) {
213             # do something
214             }
215             Function: Returns the next gene structure prediction of the Genemark result
216             file. Call this method repeatedly until FALSE is returned.
217              
218             Example :
219             Returns : A Bio::Tools::Prediction::Gene object.
220             Args :
221              
222             =cut
223              
224             sub next_prediction {
225 16     16 1 4062 my ($self) = @_;
226 16         14 my $gene;
227              
228             # if the prediction section hasn't been parsed yet, we do this now
229 16 100       30 $self->_parse_predictions() unless $self->_predictions_parsed();
230              
231             # get next gene structure
232 16         23 $gene = $self->_prediction();
233              
234 16         23 return $gene;
235             }
236              
237             =head2 _parse_predictions
238              
239             Title : _parse_predictions()
240             Usage : $obj->_parse_predictions()
241             Function: Parses the prediction section. Automatically called by
242             next_prediction() if not yet done.
243             Example :
244             Returns :
245              
246             =cut
247              
248             sub _parse_predictions {
249 2     2   3 my ($self) = @_;
250 2         13 my %exontags = ('Initial' => 'Initial',
251             'Internal' => 'Internal',
252             'Terminal' => 'Terminal',
253             'Single' => '',
254             '_na_' => '');
255 2         3 my $exontag;
256             my $gene;
257 0         0 my $exontype;
258 2         1 my $current_gene_no = -1;
259              
260             # The prediction report does not contain a sequence identifier
261             # (at least the prokaryotic version doesn't)
262 2         5 my $seqname = $self->_seqname();
263            
264 2         14 while(defined($_ = $self->_readline())) {
265              
266 77 100 100     284 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
267              
268             # this is an exon, Genemark doesn't predict anything else
269             # $prednr corresponds to geneno.
270 45         67 my $prednr = $1;
271              
272             #exon no:
273 45         36 my $signalnr = 0;
274 45 100       74 if ($2) { my $signalnr = $2; } # used in tag: exon_no
  43         39  
275            
276             # split into fields
277 45         48 chomp();
278 45         115 my @flds = split(' ', $_);
279              
280             # create the feature (an exon) object
281 45         109 my $predobj = Bio::Tools::Prediction::Exon->new();
282              
283            
284             # define info depending on it being eu- or prokaryot
285 45         35 my ($start, $end, $orientation, $prediction_source);
286              
287 45 100       80 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
288 2         4 $prediction_source = "Genemark.hmm.pro";
289 2 50       4 $orientation = ($flds[1] eq '+') ? 1 : -1;
290 2         5 ($start, $end) = @flds[(2,3)];
291 2         2 $exontag = "_na_";
292              
293             } else {
294 43         35 $prediction_source = "Genemark.hmm.eu";
295 43 100       60 $orientation = ($flds[2] eq '+') ? 1 : -1;
296 43         66 ($start, $end) = @flds[(4,5)];
297 43         39 $exontag = $flds[3];
298             }
299              
300             # instatiate a location object via
301             # Bio::Factory::FTLocationFactory (to handle
302             # inexact coordinates)
303 45         63 my $location_string = join('..', $start, $end);
304 45         95 my $location_factory = Bio::Factory::FTLocationFactory->new();
305 45         95 my $location_obj = $location_factory->from_string($location_string);
306 45         90 $predobj->location($location_obj);
307              
308             #store the data in the exon object
309 45         84 $predobj->source_tag($prediction_source);
310 45         79 $predobj->strand($orientation);
311              
312 45         123 $predobj->primary_tag($exontags{$exontag} . "Exon");
313              
314 45 50       64 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
315              
316 45         78 $predobj->is_coding(1);
317              
318 45 50 33     213 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
319            
320             # frame calculation as in the genscan module
321             # is to be implemented...
322            
323             #If the $prednr is not equal to the current gene, we
324             #need to make a new gene and close the old one
325 45 100       83 if($prednr != $current_gene_no) {
326             # a new gene, store the old one if it exists
327 15 100       29 if (defined ($gene)) {
328 13         28 $gene->seq_id($seqname);
329 13         11 $gene = undef ;
330             }
331             #and make a new one
332 15         64 $gene = Bio::Tools::Prediction::Gene->new
333             (
334             '-primary' => "GenePrediction$prednr",
335             '-source' => $prediction_source);
336 15         32 $self->_add_prediction($gene);
337 15         15 $current_gene_no = $prednr;
338 15 50 33     62 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
339             }
340            
341             # Add the exon to the gene
342             $gene->add_exon($predobj, ($exontag eq "_na_" ?
343 45 100       123 undef : $exontags{$exontag}));
344              
345             }
346              
347 77 100       131 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
348 2         7 $self->analysis_method($1);
349              
350 2         4 my $gm_version = $2;
351              
352 2         8 $self->analysis_method_version($gm_version);
353 2         4 next;
354             }
355              
356             #Matrix file for eukaryot version
357 75 100       143 if (/^Matrices file:\s+(\S+)?/i) {
358 1         6 $self->analysis_subject($1);
359             # since the line after the matrix file is always the date
360             # (in the output file's I have seen!) extract and store this
361             # here
362 1 50       4 if (defined(my $_date = $self->_readline())) {
363 1         2 chomp ($_date);
364 1         4 $self->analysis_date($_date);
365             }
366             }
367            
368             #Matrix file for prokaryot version
369 75 100       94 if (/^Model file name:\s+(\S+)/) {
370 1         7 $self->analysis_subject($1);
371             # since the line after the matrix file is always the date
372             # (in the output file's I have seen!) extract and store this
373             # here
374 1         4 my $_date = $self->_readline() ;
375 1 50       4 if (defined($_date = $self->_readline())) {
376 1         2 chomp ($_date);
377 1         5 $self->analysis_date($_date);
378             }
379             }
380            
381 75 100       143 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
382 1         3 $seqname = $1;
383             # $self->analysis_subject($seqname);
384 1         2 next;
385             }
386            
387              
388 74 100       179 /^>/ && do {
389 1         17 $self->_pushback($_);
390              
391             # section of predicted aa sequences on recognition
392             # of a fasta start, read all sequences and find the
393             # appropriate gene
394 1         2 while (1) {
395 14         22 my ($aa_id, $seq) = $self->_read_fasta_seq();
396 14 100       24 last unless ($aa_id);
397              
398             #now parse through the predictions to add the pred. protein
399 13         13 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
  13         23  
400 91         124 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
401 91         91 my $geneno = $1;
402 91 100       622 if ($aa_id =~ /\|gene.$geneno\|/) {
403             #print "x SEQ : \n $seq \nXXXX\n";
404 13         45 my $seqobj = Bio::Seq->new('-seq' => $seq,
405             '-display_id' => $aa_id,
406             '-alphabet' => "protein");
407 13         31 $gene->predicted_protein($seqobj);
408 13         27 last FINDPRED;
409             }
410              
411             }
412             }
413              
414 1         2 last;
415             };
416             }
417              
418             # if the analysis query object contains a ref to a Seq of PrimarySeq
419             # object, then extract the predicted sequences and add it to the gene
420             # object.
421 2 50       32 if (defined $self->analysis_query()) {
422 0         0 my $orig_seq = $self->analysis_query();
423 0         0 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
  0         0  
424 0         0 my $predseq = "";
425 0         0 foreach my $exon ($gene->exons()) {
426             #print $exon->start() . " " . $exon->end () . "\n";
427 0         0 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
428             }
429              
430 0         0 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
431             '-display_id' => "transl");
432 0         0 $gene->predicted_cds($seqobj);
433             }
434             }
435              
436              
437 2         8 $self->_predictions_parsed(1);
438             }
439              
440             =head2 _prediction
441              
442             Title : _prediction()
443             Usage : $gene = $obj->_prediction()
444             Function: internal
445             Example :
446             Returns :
447              
448             =cut
449              
450             sub _prediction {
451 16     16   13 my ($self) = @_;
452              
453 16 100 66     31 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
  16         47  
454 15         8 return shift(@{$self->{'_preds'}});
  15         30  
455             }
456              
457             =head2 _add_prediction
458              
459             Title : _add_prediction()
460             Usage : $obj->_add_prediction($gene)
461             Function: internal
462             Example :
463             Returns :
464              
465             =cut
466              
467             sub _add_prediction {
468 15     15   15 my ($self, $gene) = @_;
469              
470 15 50       25 if(! exists($self->{'_preds'})) {
471 0         0 $self->{'_preds'} = [];
472             }
473 15         11 push(@{$self->{'_preds'}}, $gene);
  15         29  
474             }
475              
476             =head2 _predictions_parsed
477              
478             Title : _predictions_parsed
479             Usage : $obj->_predictions_parsed
480             Function: internal
481             Example :
482             Returns : TRUE or FALSE
483              
484             =cut
485              
486             sub _predictions_parsed {
487 18     18   15 my ($self, $val) = @_;
488              
489 18 100       29 $self->{'_preds_parsed'} = $val if $val;
490 18 50       31 if(! exists($self->{'_preds_parsed'})) {
491 0         0 $self->{'_preds_parsed'} = 0;
492             }
493 18         42 return $self->{'_preds_parsed'};
494             }
495              
496             =head2 _has_cds
497              
498             Title : _has_cds()
499             Usage : $obj->_has_cds()
500             Function: Whether or not the result contains the predicted CDSs, too.
501             Example :
502             Returns : TRUE or FALSE
503              
504             =cut
505              
506             sub _has_cds {
507 0     0   0 my ($self, $val) = @_;
508              
509 0 0       0 $self->{'_has_cds'} = $val if $val;
510 0 0       0 if(! exists($self->{'_has_cds'})) {
511 0         0 $self->{'_has_cds'} = 0;
512             }
513 0         0 return $self->{'_has_cds'};
514             }
515              
516             =head2 _read_fasta_seq
517              
518             Title : _read_fasta_seq()
519             Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
520             Function: Simple but specialised FASTA format sequence reader. Uses
521             $self->_readline() to retrieve input, and is able to strip off
522             the traling description lines.
523             Example :
524             Returns : An array of two elements.
525              
526             =cut
527              
528             sub _read_fasta_seq {
529 14     14   16 my ($self) = @_;
530 14         10 my ($id, $seq);
531 14         40 local $/ = ">";
532              
533 14 100       25 return 0 unless (my $entry = $self->_readline());
534              
535 13         16 $entry =~ s/^>//;
536             # complete the entry if the first line came from a pushback buffer
537 13         36 while(! ($entry =~ />$/)) {
538 2 100       6 last unless ($_ = $self->_readline());
539 1         5 $entry .= $_;
540             }
541              
542             # delete everything onwards from an new fasta start (>)
543 13         35 $entry =~ s/\n>.*$//s;
544             # id and sequence
545              
546 13 50       45 if($entry =~ s/^(.+)\n//) {
547 13         51 $id = $1;
548 13         22 $id =~ s/ /_/g;
549 13         10 $seq = $entry;
550 13         65 $seq =~ s/\s//g;
551             #print "\n@@ $id \n@@ $seq \n##\n";
552             } else {
553 0         0 $self->throw("Can't parse Genemark predicted sequence entry");
554             }
555 13         23 $seq =~ s/\s//g; # Remove whitespace
556 13         33 return ($id, $seq);
557             }
558              
559             =head2 _seqname
560              
561             Title : _seqname
562             Usage : $obj->_seqname($seqname)
563             Function: internal
564             Example :
565             Returns : String
566              
567             =cut
568              
569             sub _seqname {
570 3     3   4 my ($self, $val) = @_;
571              
572 3 100       6 $self->{'_seqname'} = $val if $val;
573 3 100       7 if(! exists($self->{'_seqname'})) {
574 1         4 $self->{'_seqname'} = 'unknown';
575             }
576 3         4 return $self->{'_seqname'};
577             }
578              
579             1;
580