File Coverage

Bio/Tools/Geneid.pm
Criterion Covered Total %
statement 94 95 98.9
branch 22 24 91.6
condition 2 3 66.6
subroutine 13 13 100.0
pod 2 2 100.0
total 133 137 97.0


line stmt bran cond sub pod time code
1             #
2             # Please direct questions and support issues to
3             #
4             # Cared for by Keith James
5             #
6             # Copyright Genome Research Ltd.
7             #
8             # You may distribute this module under the same terms as Perl itself
9              
10             # POD documentation - main docs before the code
11              
12              
13             =encoding utf-8
14              
15             =head1 NAME
16              
17             Bio::Tools::Geneid - Results of one geneid run
18              
19             =head1 SYNOPSIS
20              
21             use Bio::Tools::Geneid;
22             my $gid = Bio::Tools::Geneid(-file => "geneid.out");
23              
24             while (my $gene = $gid->next_prediction)
25             {
26             my @transcripts = $gene->transcripts;
27             foreach my $t (@transcripts)
28             {
29             my @exons = $t->exons;
30             foreach my $e (@exons)
31             {
32             printf("Exon %d..%d\n", $e->start, $e->end);
33             }
34             }
35             }
36              
37             =head1 DESCRIPTION
38              
39             This is the parser for the output of geneid by Enrique Blanco and
40             Roderic GuigE<243> (IMIM-UPF). See http://www1.imim.es/software/geneid. It
41             relies on native geneid output format internally and will work with
42             geneid versions 1.0 and 1.1. Currently this module supports only the
43             default mode of operation which is to predict exons and assemble an
44             optimal gene prediction.
45              
46             It takes either a file handle or a file name and returns a
47             Bio::SeqFeature::Gene::GeneStructure object.
48              
49             =head1 FEEDBACK
50              
51             =head2 Mailing Lists
52              
53             User feedback is an integral part of the evolution of this and other
54             Bioperl modules. Send your comments and suggestions preferably to one
55             of the Bioperl mailing lists. Your participation is much appreciated.
56              
57             bioperl-l@bioperl.org - General discussion
58             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
59              
60             =head2 Support
61              
62             Please direct usage questions or support issues to the mailing list:
63              
64             I
65              
66             rather than to the module maintainer directly. Many experienced and
67             reponsive experts will be able look at the problem and quickly
68             address it. Please include a thorough description of the problem
69             with code and data examples if at all possible.
70              
71             =head2 Reporting Bugs
72              
73             Report bugs to the Bioperl bug tracking system to help us keep track
74             the bugs and their resolution. Bug reports can be submitted via the
75             web:
76              
77             https://github.com/bioperl/bioperl-live/issues
78              
79             =head1 AUTHOR - Keith James
80              
81             Email: kdj@sanger.ac.uk
82              
83             =head1 APPENDIX
84              
85             The rest of the documentation details each of the object methods.
86             Internal methods are usually preceded with a _
87              
88             =cut
89              
90             package Bio::Tools::Geneid;
91              
92 1     1   376 use vars qw($SOURCE_TAG);
  1         1  
  1         33  
93 1     1   3 use strict;
  1         1  
  1         14  
94              
95 1     1   253 use Bio::Tools::AnalysisResult;
  1         2  
  1         20  
96 1     1   374 use Bio::SeqFeature::Generic;
  1         1  
  1         797  
97 1     1   255 use Bio::SeqFeature::Gene::Exon;
  1         1  
  1         25  
98 1     1   293 use Bio::SeqFeature::Gene::Transcript;
  1         2  
  1         25  
99 1     1   270 use Bio::SeqFeature::Gene::GeneStructure;
  1         1  
  1         29  
100              
101 1     1   4 use base qw(Bio::Root::Root Bio::Root::IO);
  1         1  
  1         667  
102             $SOURCE_TAG = 'geneid';
103              
104             =head2 new
105              
106             Title : new
107             Usage : $obj->new(-file = "
108             $obj->new(-fh => \*GI);
109             Function: Constructor for geneid wrapper. Takes either a file
110             : or filehandle
111             Returns : L
112              
113             =cut
114              
115             sub new
116             {
117 1     1 1 11 my($class, @args) = @_;
118 1         8 my $self = $class->SUPER::new(@args);
119 1         6 $self->_initialize_io(@args);
120 1         2 return $self;
121             }
122              
123             =head2 next_prediction
124              
125             Title : next_prediction
126             Usage : while($gene = $geneid->next_prediction)
127             {
128             # do something
129             }
130             Function: Returns the gene structure prediction of the geneid result
131             file. Call this method repeatedly until FALSE is returned.
132             Returns : A Bio::SeqFeature::Gene::GeneStructure object
133             Args : None
134              
135             =cut
136              
137             sub next_prediction
138             {
139 4     4 1 14 my ($self) = @_;
140              
141 4         2 my ($gene, $transcript, $current_gene_id);
142 4         3 my $transcript_score = 0;
143              
144 4         5 my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score,
145             $exon_strand, $start_phase, $end_phase, $start_sig_score,
146             $end_sig_score, $coding_pot_score, $homol_score);
147              
148 4         12 while (defined($_ = $self->_readline))
149             {
150 39         63 $self->debug($_);
151              
152 39         62 s/^\s+//;
153 39         98 s/\s+$//;
154              
155             # We have a choice of geneid, gff or XML formats. The native
156             # geneid format has more information than gff. However, we
157             # then need to perform the hack of extracting the sequence ID
158             # from the header of the embedded Fasta file which comes after
159             # the exon data, as it is not stored elsewhere. Ack.
160             # the new version of geneID includes the sequence ID in a slightly
161             # different format and a new "or" statement was added below
162             # also removed "unless defined $self->_target_id;" inorder to continue
163             # generating new sequence IDs.
164            
165 39 100 66     121 if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/)
166             {
167 3         6 my $target_id = $1;
168 3         5 $self->_target_id($target_id);
169 3         5 next;
170             }
171              
172 36 100       114 next unless (/(Single|First|Internal|Terminal)/);
173              
174 10         60 my @fields = split(/\s+/, $_);
175              
176             # Grab gene_id from eol first as there are issues with
177             # inconsistent whitespace in the AA coords field
178 10         13 $gene_id = pop @fields;
179              
180 10         34 ($exon_type, $exon_start, $exon_end, $exon_score,
181             $exon_strand, $start_phase, $end_phase, $start_sig_score,
182             $end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10];
183              
184 10 100       25 if (! defined $current_gene_id)
    100          
185             {
186             # Starting the requested prediction
187 3         3 $current_gene_id = $gene_id;
188 3         4 $transcript_score = $exon_score;
189              
190 3         19 $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source =>
191             $SOURCE_TAG);
192 3         15 $transcript = Bio::SeqFeature::Gene::Transcript->new(-source =>
193             $SOURCE_TAG);
194              
195 3         7 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
196             $exon_strand, $start_phase, $end_phase, $start_sig_score,
197             $end_sig_score, $coding_pot_score, $homol_score);
198             }
199             elsif ($gene_id eq $current_gene_id)
200             {
201             # Still in requested prediction
202 5         11 $transcript_score += $exon_score;
203              
204 5         10 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
205             $exon_strand, $start_phase, $end_phase, $start_sig_score,
206             $end_sig_score, $coding_pot_score, $homol_score);
207             }
208             else
209             {
210             # Found following prediction
211 2         8 $self->_pushback($_);
212 2         4 last;
213             }
214             }
215              
216 4 100       9 if (defined $gene)
217             {
218 3         4 $transcript->seq_id($self->_target_id);
219 3         6 $transcript->score($transcript_score);
220 3         9 $gene->add_transcript($transcript);
221 3         4 $gene->seq_id($self->_target_id);
222              
223 3         7 foreach my $exon ($gene->exons)
224             {
225 8         6 $exon->seq_id($self->_target_id);
226             }
227              
228 3         9 $self->_set_strand($gene);
229             }
230              
231 4         8 return $gene;
232             }
233              
234             =head2 _add_exon
235              
236             Title : _add_exon
237             Usage : $obj->_add_exon($gene, $transcript, ... exon data ...)
238             Function: Adds a new exon to both gene and transcript from the data
239             : supplied as args
240             Example :
241             Returns : Nothing
242              
243             =cut
244              
245             sub _add_exon
246             {
247 8     8   15 my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end,
248             $exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score,
249             $end_sig_score, $coding_pot_score, $homol_score) = @_;
250              
251 8         10 $exon_type =~ s/First/Initial/;
252              
253 8 100       12 my $strand = $exon_strand eq '+' ? 1 : -1;
254              
255 8         31 my $exon = Bio::SeqFeature::Gene::Exon->new(-source => $SOURCE_TAG,
256             -start => $exon_start,
257             -end => $exon_end,
258             -strand => $strand,
259             -score => $exon_score);
260 8         14 $exon->is_coding(1);
261 8         11 $exon->add_tag_value("Type", $exon_type);
262 8         10 $exon->add_tag_value('phase', $start_phase);
263 8         9 $exon->add_tag_value('end_phase', $end_phase);
264 8         11 $exon->add_tag_value('start_signal_score', $start_sig_score);
265 8         15 $exon->add_tag_value('end_signal_score', $end_sig_score);
266 8         9 $exon->add_tag_value('coding_potential_score', $coding_pot_score);
267 8         12 $exon->add_tag_value('homology_score', $homol_score);
268              
269 8 100       14 $transcript->strand($strand) unless $transcript->strand != 0;
270 8         18 $transcript->add_exon($exon, $exon_type);
271             }
272              
273             =head2 _set_strand
274              
275             Title : _set_strand
276             Usage : $obj->_set_strand($gene)
277             Function: Sets the overall gene strand to the same strand as all
278             : the exons if they are all on the same strand, or to strand 0
279             : if the exons are on different strands.
280             Example :
281             Returns : Nothing
282              
283             =cut
284              
285             sub _set_strand
286             {
287 3     3   4 my ($self, $gene) = @_;
288              
289 3         3 my $fwd = 0;
290 3         1 my $rev = 0;
291              
292 3         5 my @exons = $gene->exons;
293 3         4 foreach my $exon (@exons)
294             {
295 8         11 my $strand = $exon->strand;
296              
297 8 100       16 if ($strand == 1)
    50          
298             {
299 2         4 $fwd++;
300             }
301             elsif ($strand == -1)
302             {
303 6         3 $rev++;
304             }
305             }
306              
307 3 100       8 if ($#exons == $fwd)
    50          
308             {
309 1         2 $gene->strand(1);
310             }
311             elsif ($#exons == $rev)
312             {
313 0         0 $gene->strand(-1);
314             }
315             else
316             {
317 2         3 $gene->strand(0);
318             }
319              
320 3         4 return $gene;
321             }
322              
323             =head2 _target_id
324              
325             Title : _target_id
326             Usage : $obj->_target_id
327             Function: get/set for genomic sequence id
328             Example :
329             Returns : A target ID
330              
331             =cut
332              
333             sub _target_id
334             {
335 17     17   17 my ($self,$val) = @_;
336 17 100       21 if ($val)
337             {
338 3         4 $self->{'_target_id'} = $val;
339             }
340              
341 17         30 return $self->{'_target_id'};
342             }
343              
344             1;