| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::ToolBox::parser::gff; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '1.66'; |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Bio::ToolBox::parser::gff - parse GFF3, GTF, and GFF files |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Bio::ToolBox::parser::gff; |
|
12
|
|
|
|
|
|
|
my $filename = 'file.gff3'; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new( |
|
15
|
|
|
|
|
|
|
file => $filename, |
|
16
|
|
|
|
|
|
|
do_gene => 1, |
|
17
|
|
|
|
|
|
|
do_exon => 1, |
|
18
|
|
|
|
|
|
|
) or die "unable to open gff file!\n"; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
while (my $feature = $parser->next_top_feature() ) { |
|
21
|
|
|
|
|
|
|
# each $feature is a SeqFeature object |
|
22
|
|
|
|
|
|
|
my @children = $feature->get_SeqFeatures(); |
|
23
|
|
|
|
|
|
|
} |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This module parses a GFF file into SeqFeature objects. It natively |
|
28
|
|
|
|
|
|
|
handles GFF3, GTF, and general GFF files. |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
For both GFF3 and GTF files, fully nested gene models, typically |
|
31
|
|
|
|
|
|
|
gene =E transcript =E (exon, CDS, etc), may be built using the appropriate |
|
32
|
|
|
|
|
|
|
attribute tags. For GFF3 files, these include ID and Parent tags; for GTF |
|
33
|
|
|
|
|
|
|
these include C and C tags. |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
For GFF3 files, any feature without a C tag is assumed to be a |
|
36
|
|
|
|
|
|
|
parent. Children features referencing a parent feature that has not been |
|
37
|
|
|
|
|
|
|
loaded are considered orphans. Orphans are attempted to be re-associated |
|
38
|
|
|
|
|
|
|
with missing parents after the file is completely parsed. Any orphans left |
|
39
|
|
|
|
|
|
|
may be collected. Files with orphans are considered poorly formatted or |
|
40
|
|
|
|
|
|
|
incomplete and should be fixed. Multiple parentage, for example exons |
|
41
|
|
|
|
|
|
|
shared between different transcripts of the same gene, are fully supported. |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Embedded Fasta sequences are ignored, as are most comment and pragma lines. |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
The SeqFeature objects that are returned are L |
|
46
|
|
|
|
|
|
|
objects. Refer to that documentation for more information. |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 METHODS |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 Initialize and modify the parser. |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
These are class methods to initialize the parser with an annotation file |
|
53
|
|
|
|
|
|
|
and modify the parsing behavior. Most parameters can be set either upon |
|
54
|
|
|
|
|
|
|
initialization or as class methods on the object. Unpredictable behavior |
|
55
|
|
|
|
|
|
|
may occur if you implement these in the midst of parsing a file. |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Do not open subsequent files with the same object. Always create a new |
|
58
|
|
|
|
|
|
|
object to parse a new file |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=over 4 |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=item new |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new($filename); |
|
65
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new( |
|
66
|
|
|
|
|
|
|
file => 'file.gtf.gz', |
|
67
|
|
|
|
|
|
|
do_gene => 1, |
|
68
|
|
|
|
|
|
|
do_utr => 1, |
|
69
|
|
|
|
|
|
|
); |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Initialize a new gff parser object. Pass a single value (a GFF file name) |
|
72
|
|
|
|
|
|
|
to open a file. Alternatively, pass an array of key value pairs to control |
|
73
|
|
|
|
|
|
|
how the file is parsed. Options include the following. |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=over 4 |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item file |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Provide a GFF file name to be parsed. It should have a gff, gtf, or gff3 file |
|
80
|
|
|
|
|
|
|
extension. The file may be gzip compressed. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=item version |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Specify the version. Normally this is not needed, as version can be determined |
|
85
|
|
|
|
|
|
|
either from the file extension (in the case of gtf and gff3) or from the |
|
86
|
|
|
|
|
|
|
C<##gff-version> pragma at the top of the file. Acceptable values include 1, 2, |
|
87
|
|
|
|
|
|
|
2.5 (gtf), or 3. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item class |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Pass the name of a L compliant class that will be used to |
|
92
|
|
|
|
|
|
|
create the SeqFeature objects. The default is to use L. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item simplify |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Pass a boolean value to simplify the SeqFeature objects parsed from the GFF |
|
97
|
|
|
|
|
|
|
file and ignore extraneous attributes. |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=item do_gene |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene |
|
102
|
|
|
|
|
|
|
name under a single gene object. Default is true. |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item do_cds |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item do_exon |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item do_utr |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=item do_codon |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Pass a boolean (1 or 0) value to parse certain subfeatures. Exon subfeatures |
|
113
|
|
|
|
|
|
|
are always parsed, but C, C, C, C, |
|
114
|
|
|
|
|
|
|
and C features may be optionally parsed. Default is false. |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=back |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=item open_file |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
$parser->open_file($file) or die "unable to open $file!"; |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Pass the name of a GFF file to be parsed. The file may optionally be gzipped |
|
123
|
|
|
|
|
|
|
(F<.gz> extension). Do not open a new file when one has already opened a file. |
|
124
|
|
|
|
|
|
|
Create a new object for a new file, or concatenate the GFF files. |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item version |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Set or get the GFF version of the current file. Acceptable values include 1, 2, |
|
129
|
|
|
|
|
|
|
2.5 (gtf), or 3. Normally this is determined by file extension or |
|
130
|
|
|
|
|
|
|
C pragma on the first line, and should not need to be set by the |
|
131
|
|
|
|
|
|
|
user in most circumstances. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item simplify |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Pass a boolean true value to simplify the attributes of GFF3 and GTF files |
|
136
|
|
|
|
|
|
|
that may have considerable numbers of tags, e.g. Ensembl files. Only |
|
137
|
|
|
|
|
|
|
essential information, including name, ID, and parentage, is retained. |
|
138
|
|
|
|
|
|
|
Useful if you're trying to quickly parse annotation files for basic |
|
139
|
|
|
|
|
|
|
information. |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=back |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 Feature retrieval |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The following methods parse the GFF file lines into SeqFeature objects. |
|
146
|
|
|
|
|
|
|
It is best if these methods are not mixed; unexpected results may occur. |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=over 4 |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item next_top_feature |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
This method will return a top level parent SeqFeature object |
|
153
|
|
|
|
|
|
|
assembled with child features as sub-features. For example, a gene |
|
154
|
|
|
|
|
|
|
object with mRNA subfeatures, which in turn may have exon and/or CDS |
|
155
|
|
|
|
|
|
|
subfeatures. Child features are assembled based on the existence of |
|
156
|
|
|
|
|
|
|
proper Parent attributes in child features. If no Parent attributes are |
|
157
|
|
|
|
|
|
|
included in the GFF file, then this will behave as L. |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Child features (those containing a C attribute) |
|
160
|
|
|
|
|
|
|
are associated with the parent feature. A warning will be issued about lost |
|
161
|
|
|
|
|
|
|
children (orphans). Shared subfeatures, for example exons common to |
|
162
|
|
|
|
|
|
|
multiple transcripts, are associated properly with each parent. An opportunity |
|
163
|
|
|
|
|
|
|
to rescue orphans is available using the L method. |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Note that subfeatures may not necessarily be in ascending genomic order |
|
166
|
|
|
|
|
|
|
when associated with the feature, depending on their order in the GFF3 |
|
167
|
|
|
|
|
|
|
file and whether shared subfeatures are present or not. When calling |
|
168
|
|
|
|
|
|
|
subfeatures in your program, you may want to sort the subfeatures. For |
|
169
|
|
|
|
|
|
|
example |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
my @subfeatures = map { $_->[0] } |
|
172
|
|
|
|
|
|
|
sort { $a->[1] <=> $b->[1] } |
|
173
|
|
|
|
|
|
|
map { [$_, $_->start] } |
|
174
|
|
|
|
|
|
|
$parent->get_SeqFeatures; |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=item top_features |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This method will return an array of the top (parent) features defined in |
|
179
|
|
|
|
|
|
|
the GFF file. This is similar to the L method except that |
|
180
|
|
|
|
|
|
|
all features are returned at once. |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item next_feature |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
This method will return a SeqFeature object representation of |
|
185
|
|
|
|
|
|
|
the next feature (line) in the file. Parent - child relationships are |
|
186
|
|
|
|
|
|
|
NOT assembled; however, undefined parents in a GTF file may still be |
|
187
|
|
|
|
|
|
|
generated, just not returned. |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
This method is best used with simple GFF files with no hierarchies |
|
190
|
|
|
|
|
|
|
present. This may be used in a while loop until the end of the file |
|
191
|
|
|
|
|
|
|
is reached. Pragmas are ignored and comment lines and sequence are |
|
192
|
|
|
|
|
|
|
automatically skipped. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=back |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head2 Other methods |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Additional methods for working with the parser object and the parsed |
|
199
|
|
|
|
|
|
|
SeqFeature objects. |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=over 4 |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=item fh |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This method returns the L object of the opened GFF file. |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=item parse_file |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Parses the entire file into memory. This is automatically called when |
|
210
|
|
|
|
|
|
|
either L or L is called. |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item find_gene |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
my $gene = $parser->find_gene( |
|
215
|
|
|
|
|
|
|
name => $display_name, |
|
216
|
|
|
|
|
|
|
id => $primary_id, |
|
217
|
|
|
|
|
|
|
) or warn "gene $display_name can not be found!"; |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Pass a gene name, or an array of key =E values (C, C, |
|
220
|
|
|
|
|
|
|
C, C, and/or coordinate information), that can be used |
|
221
|
|
|
|
|
|
|
to find a gene already loaded into memory. Only useful after |
|
222
|
|
|
|
|
|
|
is called. Genes with a matching name are confirmed by a matching ID or |
|
223
|
|
|
|
|
|
|
overlapping coordinates, if available. Otherwise the first match is returned. |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item orphans |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
my @orphans = $parser->orphans; |
|
228
|
|
|
|
|
|
|
printf "we have %d orphans left over!", scalar @orpans; |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
This method will return an array of orphan SeqFeature objects that indicated |
|
231
|
|
|
|
|
|
|
they had a parent but said parent could not be found. Typically, this is an |
|
232
|
|
|
|
|
|
|
indication of an incomplete or malformed GFF3 file. Nevertheless, it might |
|
233
|
|
|
|
|
|
|
be a good idea to check this after retrieving all top features. |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item comments |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
This method will return an array of the comment or pragma lines that may have |
|
238
|
|
|
|
|
|
|
been in the parsed file. These may or may not be useful. |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=item typelist |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
Returns a comma-delimited string of the GFF primary tags (column 3) |
|
243
|
|
|
|
|
|
|
observed in the first 1000 lines of an opened file. Useful for |
|
244
|
|
|
|
|
|
|
checking what is in the GFF file. See |
|
245
|
|
|
|
|
|
|
L. |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=item from_gff_string |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
my $seqfeature = $parser->from_gff_string($string); |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
This method will parse a single GFF, GTF, or GFF3 formatted string or line |
|
252
|
|
|
|
|
|
|
of text and return a SeqFeature object. |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=item unescape |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
This method will unescape special characters in a text string. Certain |
|
257
|
|
|
|
|
|
|
characters, including ";" and "=", are reserved for GFF3 formatting and |
|
258
|
|
|
|
|
|
|
are not allowed, thus requiring them to be escaped. |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item seq_ids |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Returns an array or array reference of the names of the chromosomes or |
|
263
|
|
|
|
|
|
|
reference sequences present in the file. These may be defined by GFF3 |
|
264
|
|
|
|
|
|
|
sequence-region pragmas or inferred from the features. |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item seq_id_lengths |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
my $seq2len = $parser->seq_id_lengths; |
|
269
|
|
|
|
|
|
|
foreach (keys %$seq2len) { |
|
270
|
|
|
|
|
|
|
printf "chromosome %s is %d bp long\n", $_, $seq2len->{$_}; |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Returns a hash reference to the chromosomes or reference sequences and |
|
274
|
|
|
|
|
|
|
their corresponding lengths. In this case, the length is either defined |
|
275
|
|
|
|
|
|
|
by the C pragma or inferred by the greatest end position of |
|
276
|
|
|
|
|
|
|
the top features. |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=back |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
L, L, L |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=cut |
|
285
|
|
|
|
|
|
|
|
|
286
|
1
|
|
|
1
|
|
71575
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
25
|
|
|
287
|
1
|
|
|
1
|
|
4
|
use Carp qw(carp cluck croak); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
47
|
|
|
288
|
1
|
|
|
1
|
|
443
|
use Bio::ToolBox::Data::file; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
4216
|
|
|
289
|
|
|
|
|
|
|
my $SFCLASS = 'Bio::ToolBox::SeqFeature'; # alternative to Bio::SeqFeature::Lite |
|
290
|
|
|
|
|
|
|
eval "require $SFCLASS" or croak $@; |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
1; |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
sub new { |
|
295
|
11
|
|
|
11
|
1
|
11117
|
my $class = shift; |
|
296
|
11
|
|
|
|
|
148
|
my $self = { |
|
297
|
|
|
|
|
|
|
'fh' => undef, |
|
298
|
|
|
|
|
|
|
'top_features' => [], |
|
299
|
|
|
|
|
|
|
'orphans' => [], |
|
300
|
|
|
|
|
|
|
'duplicate_ids' => {}, |
|
301
|
|
|
|
|
|
|
'loaded' => {}, |
|
302
|
|
|
|
|
|
|
'eof' => 0, |
|
303
|
|
|
|
|
|
|
'do_gene' => 1, |
|
304
|
|
|
|
|
|
|
'do_exon' => 0, |
|
305
|
|
|
|
|
|
|
'do_cds' => 0, |
|
306
|
|
|
|
|
|
|
'do_utr' => 0, |
|
307
|
|
|
|
|
|
|
'do_codon' => 0, |
|
308
|
|
|
|
|
|
|
'gff3' => 0, |
|
309
|
|
|
|
|
|
|
'gtf' => 0, |
|
310
|
|
|
|
|
|
|
'comments' => [], |
|
311
|
|
|
|
|
|
|
'seq_ids' => {}, |
|
312
|
|
|
|
|
|
|
'simplify' => 0, |
|
313
|
|
|
|
|
|
|
'typelist' => '', |
|
314
|
|
|
|
|
|
|
'convertor_sub' => undef, |
|
315
|
|
|
|
|
|
|
}; |
|
316
|
11
|
|
|
|
|
29
|
bless $self, $class; |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# check for options |
|
319
|
11
|
100
|
|
|
|
35
|
if (@_) { |
|
320
|
8
|
50
|
|
|
|
25
|
if (scalar @_ == 1) { |
|
321
|
0
|
0
|
|
|
|
0
|
$self->open_file($_[0]) or croak "unable to open file!"; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
|
|
|
|
|
|
else { |
|
324
|
8
|
|
|
|
|
28
|
my %options = @_; |
|
325
|
8
|
100
|
|
|
|
24
|
if (exists $options{simplify}) { |
|
326
|
4
|
|
|
|
|
15
|
$self->simplify( $options{simplify} ); |
|
327
|
|
|
|
|
|
|
} |
|
328
|
8
|
100
|
|
|
|
22
|
if (exists $options{do_gene}) { |
|
329
|
4
|
|
|
|
|
13
|
$self->do_gene($options{do_gene}); |
|
330
|
|
|
|
|
|
|
} |
|
331
|
8
|
100
|
|
|
|
18
|
if (exists $options{do_exon}) { |
|
332
|
2
|
|
|
|
|
10
|
$self->do_exon($options{do_exon}); |
|
333
|
|
|
|
|
|
|
} |
|
334
|
8
|
100
|
|
|
|
20
|
if (exists $options{do_cds}) { |
|
335
|
4
|
|
|
|
|
13
|
$self->do_cds($options{do_cds}); |
|
336
|
|
|
|
|
|
|
} |
|
337
|
8
|
50
|
|
|
|
20
|
if (exists $options{do_utr}) { |
|
338
|
0
|
|
|
|
|
0
|
$self->do_utr($options{do_utr}); |
|
339
|
|
|
|
|
|
|
} |
|
340
|
8
|
50
|
|
|
|
20
|
if (exists $options{do_codon}) { |
|
341
|
0
|
|
|
|
|
0
|
$self->do_codon($options{do_codon}); |
|
342
|
|
|
|
|
|
|
} |
|
343
|
8
|
50
|
|
|
|
19
|
if (exists $options{version}) { |
|
344
|
0
|
|
|
|
|
0
|
$self->version($options{version}); |
|
345
|
|
|
|
|
|
|
} |
|
346
|
8
|
50
|
33
|
|
|
23
|
if (exists $options{file} or $options{table}) { |
|
347
|
8
|
|
33
|
|
|
16
|
$options{file} ||= $options{table}; |
|
348
|
8
|
50
|
|
|
|
30
|
$self->open_file( $options{file} ) or |
|
349
|
|
|
|
|
|
|
croak "unable to open file!"; |
|
350
|
|
|
|
|
|
|
} |
|
351
|
8
|
50
|
|
|
|
22
|
if (exists $options{class}) { |
|
352
|
8
|
|
|
|
|
16
|
my $class = $options{class}; |
|
353
|
8
|
50
|
|
|
|
526
|
if (eval "require $class; 1") { |
|
354
|
8
|
|
|
|
|
28
|
$SFCLASS = $class; |
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
else { |
|
357
|
0
|
|
|
|
|
0
|
croak $@; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
} |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# done |
|
364
|
11
|
|
|
|
|
45
|
return $self; |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub do_gene { |
|
368
|
593
|
|
|
593
|
1
|
1419
|
my $self = shift; |
|
369
|
593
|
100
|
|
|
|
919
|
if (@_) { |
|
370
|
7
|
|
|
|
|
16
|
$self->{'do_gene'} = shift; |
|
371
|
|
|
|
|
|
|
} |
|
372
|
593
|
|
|
|
|
1373
|
return $self->{'do_gene'}; |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub do_exon { |
|
376
|
188
|
|
|
188
|
1
|
218
|
my $self = shift; |
|
377
|
188
|
100
|
|
|
|
321
|
if (@_) { |
|
378
|
6
|
|
|
|
|
10
|
$self->{'do_exon'} = shift; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
188
|
|
|
|
|
339
|
return $self->{'do_exon'}; |
|
381
|
|
|
|
|
|
|
} |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub do_cds { |
|
384
|
272
|
|
|
272
|
1
|
324
|
my $self = shift; |
|
385
|
272
|
100
|
|
|
|
399
|
if (@_) { |
|
386
|
5
|
|
|
|
|
9
|
$self->{'do_cds'} = shift; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
272
|
|
|
|
|
419
|
return $self->{'do_cds'}; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub do_utr { |
|
392
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
393
|
0
|
0
|
|
|
|
0
|
if (@_) { |
|
394
|
0
|
|
|
|
|
0
|
$self->{'do_utr'} = shift; |
|
395
|
|
|
|
|
|
|
} |
|
396
|
0
|
|
|
|
|
0
|
return $self->{'do_utr'}; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
sub do_codon { |
|
400
|
34
|
|
|
34
|
1
|
40
|
my $self = shift; |
|
401
|
34
|
50
|
|
|
|
77
|
if (@_) { |
|
402
|
0
|
|
|
|
|
0
|
$self->{'do_codon'} = shift; |
|
403
|
|
|
|
|
|
|
} |
|
404
|
34
|
|
|
|
|
62
|
return $self->{'do_codon'}; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub do_name { |
|
408
|
|
|
|
|
|
|
# this does nothing other than maintain compatibility with ucsc parser |
|
409
|
0
|
|
|
0
|
0
|
0
|
return 0; |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
sub share { |
|
413
|
|
|
|
|
|
|
# this does nothing other than maintain compatibility with ucsc parser |
|
414
|
0
|
|
|
0
|
0
|
0
|
return 1; |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
sub simplify { |
|
418
|
85
|
|
|
85
|
1
|
91
|
my $self = shift; |
|
419
|
85
|
100
|
|
|
|
139
|
if (defined $_[0]) { |
|
420
|
7
|
|
|
|
|
12
|
$self->{simplify} = shift; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
85
|
|
|
|
|
854
|
return $self->{simplify}; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
sub version { |
|
426
|
24
|
|
|
24
|
1
|
1102
|
my $self = shift; |
|
427
|
24
|
100
|
|
|
|
47
|
if (@_) { |
|
428
|
11
|
|
|
|
|
21
|
my $v = shift; |
|
429
|
11
|
100
|
33
|
|
|
37
|
if ($v eq '3') { |
|
|
|
50
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
430
|
7
|
|
|
|
|
22
|
$self->{version} = $v; |
|
431
|
7
|
|
|
|
|
11
|
$self->{gff3} = 1; |
|
432
|
|
|
|
|
|
|
} |
|
433
|
|
|
|
|
|
|
elsif ($v eq '2.5' or $v eq '2.2') { |
|
434
|
4
|
|
|
|
|
11
|
$self->{version} = '2.5'; |
|
435
|
4
|
|
|
|
|
8
|
$self->{gtf} = 1; |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
elsif ($v eq '2' or $v eq '1') { |
|
438
|
0
|
|
|
|
|
0
|
$self->{version} = $v; |
|
439
|
|
|
|
|
|
|
} |
|
440
|
|
|
|
|
|
|
else { |
|
441
|
0
|
|
|
|
|
0
|
warn "unrecognized GFF version '$v'!\n"; |
|
442
|
|
|
|
|
|
|
} |
|
443
|
|
|
|
|
|
|
} |
|
444
|
24
|
|
|
|
|
299
|
return $self->{version}; |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
sub open_file { |
|
448
|
11
|
|
|
11
|
1
|
20
|
my $self = shift; |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# check file |
|
451
|
11
|
|
|
|
|
13
|
my $filename = shift; |
|
452
|
11
|
50
|
|
|
|
18
|
unless ($filename) { |
|
453
|
0
|
|
|
|
|
0
|
cluck("no file name passed!\n"); |
|
454
|
0
|
|
|
|
|
0
|
return; |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# check type list |
|
458
|
11
|
|
|
|
|
99
|
my $typelist = Bio::ToolBox::Data::file->sample_gff_type_list($filename); |
|
459
|
11
|
50
|
|
|
|
57
|
if ($typelist !~ /\w+/) { |
|
460
|
0
|
|
|
|
|
0
|
print "GFF file has no evident types!? $filename may not be a valid GFF file\n"; |
|
461
|
0
|
|
|
|
|
0
|
return; |
|
462
|
|
|
|
|
|
|
} |
|
463
|
11
|
|
|
|
|
37
|
$self->{typelist} = $typelist; |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# Open filehandle object |
|
466
|
11
|
50
|
|
|
|
84
|
my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or |
|
467
|
|
|
|
|
|
|
croak " cannot open file '$filename'!\n"; |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
# check gff version pragma |
|
470
|
11
|
|
|
|
|
220
|
my $first = $fh->getline; |
|
471
|
11
|
50
|
|
|
|
364
|
if ($first =~ /^##gff.version\s+([\d\.]+)\s*$/i) { |
|
472
|
|
|
|
|
|
|
# override any version that may have been inferred from the extension |
|
473
|
|
|
|
|
|
|
# based on the assumption that this pragma is correct |
|
474
|
11
|
|
|
|
|
41
|
$self->version($1); |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
else { |
|
477
|
|
|
|
|
|
|
# no pragma, reopen the file |
|
478
|
0
|
|
|
|
|
0
|
$fh->close; |
|
479
|
0
|
|
|
|
|
0
|
$fh = Bio::ToolBox::Data::file->open_to_read_fh($filename); |
|
480
|
|
|
|
|
|
|
# set version based on file type extension???? |
|
481
|
0
|
0
|
|
|
|
0
|
if ($filename =~ /\.gtf.*$/i) { |
|
|
|
0
|
|
|
|
|
|
|
482
|
0
|
|
|
|
|
0
|
$self->version('2.5'); |
|
483
|
0
|
|
|
|
|
0
|
$self->{gtf} = 1; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
elsif ($filename =~ /\.gff3.*$/i) { |
|
486
|
0
|
|
|
|
|
0
|
$self->version('3'); |
|
487
|
0
|
|
|
|
|
0
|
$self->{gff3} = 1; |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
} |
|
490
|
11
|
|
|
|
|
22
|
$self->{fh} = $fh; |
|
491
|
11
|
|
|
|
|
36
|
return 1; |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
sub fh { |
|
495
|
668
|
|
|
668
|
1
|
1966
|
return shift->{fh}; |
|
496
|
|
|
|
|
|
|
} |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub typelist { |
|
499
|
13
|
|
|
13
|
1
|
66
|
return shift->{typelist}; |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub next_feature { |
|
503
|
587
|
|
|
587
|
1
|
3053
|
my $self = shift; |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# check that we have an open filehandle |
|
506
|
587
|
50
|
|
|
|
837
|
unless ($self->fh) { |
|
507
|
0
|
|
|
|
|
0
|
croak("no GFF file loaded to parse!"); |
|
508
|
|
|
|
|
|
|
} |
|
509
|
587
|
50
|
|
|
|
868
|
return if $self->{'eof'}; |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# check convertor |
|
512
|
587
|
100
|
|
|
|
872
|
unless (defined $self->{convertor_sub}) { |
|
513
|
11
|
|
|
|
|
26
|
$self->_assign_gff_converter; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
# look for the next feature line |
|
517
|
587
|
|
|
|
|
9255
|
while (my $line = $self->{fh}->getline) { |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# check first character |
|
520
|
770
|
|
|
|
|
13217
|
my $firstchar = substr($line, 0, 1); |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# skip any comment and pragma lines that we might encounter |
|
523
|
770
|
100
|
|
|
|
1392
|
if ($firstchar eq '#') { |
|
|
|
50
|
|
|
|
|
|
|
524
|
25
|
50
|
|
|
|
63
|
if ($line =~ /^###$/) { |
|
|
|
50
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# a close pragma, all we can do is check for orphans |
|
526
|
|
|
|
|
|
|
# a properly written shouldn't have any orphans, but just in case |
|
527
|
0
|
|
|
|
|
0
|
$self->check_orphanage; |
|
528
|
0
|
|
|
|
|
0
|
next; |
|
529
|
|
|
|
|
|
|
} |
|
530
|
|
|
|
|
|
|
elsif ($line =~ /^##sequence.region/i) { |
|
531
|
|
|
|
|
|
|
# sequence region pragma |
|
532
|
0
|
|
|
|
|
0
|
my ($pragma, $seq_id, $start, $stop) = split /\s+/, $line; |
|
533
|
0
|
0
|
0
|
|
|
0
|
if (defined $seq_id and $start =~ /^\d+$/ and $stop =~ /^\d+$/) { |
|
|
|
|
0
|
|
|
|
|
|
534
|
|
|
|
|
|
|
# we're actually only concerned with the stop coordinate |
|
535
|
0
|
|
|
|
|
0
|
$self->{seq_ids}{$seq_id} = $stop; |
|
536
|
|
|
|
|
|
|
} |
|
537
|
|
|
|
|
|
|
else { |
|
538
|
0
|
|
|
|
|
0
|
warn "malformed sequence-region pragma! $line\n"; |
|
539
|
|
|
|
|
|
|
} |
|
540
|
0
|
|
|
|
|
0
|
next; |
|
541
|
|
|
|
|
|
|
} |
|
542
|
|
|
|
|
|
|
else { |
|
543
|
|
|
|
|
|
|
# must be some sort of pragma or a comment line, may be useful, keep it |
|
544
|
25
|
|
|
|
|
26
|
push @{$self->{comments}}, $line; |
|
|
25
|
|
|
|
|
44
|
|
|
545
|
25
|
|
|
|
|
290
|
next; |
|
546
|
|
|
|
|
|
|
} |
|
547
|
|
|
|
|
|
|
} |
|
548
|
|
|
|
|
|
|
elsif ($firstchar eq '>') { |
|
549
|
|
|
|
|
|
|
# fasta header line |
|
550
|
|
|
|
|
|
|
# this is almost always at the end of the file, and rarely is sequence put |
|
551
|
|
|
|
|
|
|
# into GFF files anyway, so let's assume it's the end of the file |
|
552
|
0
|
|
|
|
|
0
|
$self->{'eof'} = 1; |
|
553
|
0
|
|
|
|
|
0
|
return; |
|
554
|
|
|
|
|
|
|
} |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
# line must be a GFF feature |
|
557
|
745
|
|
|
|
|
870
|
chomp $line; |
|
558
|
745
|
|
|
|
|
2860
|
my @fields = split('\t', $line); |
|
559
|
745
|
50
|
|
|
|
1251
|
next unless scalar(@fields) == 9; |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# check the primary_tag and generate the SeqFeature object for known types |
|
562
|
745
|
|
|
|
|
1115
|
my $type = lc $fields[2]; |
|
563
|
745
|
100
|
|
|
|
1899
|
if ($type eq 'cds') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
564
|
265
|
100
|
|
|
|
474
|
if ($self->do_cds) { |
|
565
|
199
|
|
|
|
|
249
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
199
|
|
|
|
|
320
|
|
|
566
|
|
|
|
|
|
|
} else { |
|
567
|
66
|
|
|
|
|
868
|
next; |
|
568
|
|
|
|
|
|
|
} |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
elsif ($type eq 'exon') { |
|
571
|
178
|
50
|
|
|
|
327
|
if ($self->do_exon) { |
|
572
|
178
|
|
|
|
|
206
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
178
|
|
|
|
|
237
|
|
|
573
|
|
|
|
|
|
|
} else { |
|
574
|
0
|
|
|
|
|
0
|
next; |
|
575
|
|
|
|
|
|
|
} |
|
576
|
|
|
|
|
|
|
} |
|
577
|
|
|
|
|
|
|
elsif ($type =~ /utr|untranslated/) { |
|
578
|
0
|
0
|
|
|
|
0
|
if ($self->do_utr) { |
|
579
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
0
|
|
|
|
|
0
|
|
|
580
|
|
|
|
|
|
|
} else { |
|
581
|
0
|
|
|
|
|
0
|
next; |
|
582
|
|
|
|
|
|
|
} |
|
583
|
|
|
|
|
|
|
} |
|
584
|
|
|
|
|
|
|
elsif ($type =~ /codon/) { |
|
585
|
32
|
50
|
|
|
|
69
|
if ($self->do_codon) { |
|
586
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
0
|
|
|
|
|
0
|
|
|
587
|
|
|
|
|
|
|
} else { |
|
588
|
32
|
|
|
|
|
438
|
next; |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
} |
|
591
|
|
|
|
|
|
|
elsif ($type =~ /gene$/) { |
|
592
|
165
|
50
|
|
|
|
303
|
if ($self->do_gene) { |
|
593
|
165
|
|
|
|
|
219
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
165
|
|
|
|
|
235
|
|
|
594
|
|
|
|
|
|
|
} else { |
|
595
|
0
|
|
|
|
|
0
|
next; |
|
596
|
|
|
|
|
|
|
} |
|
597
|
|
|
|
|
|
|
} |
|
598
|
|
|
|
|
|
|
elsif ($type =~ /transcript|rna/) { |
|
599
|
36
|
|
|
|
|
53
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
36
|
|
|
|
|
79
|
|
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
elsif ($type =~ /chromosome|contig|scaffold/) { |
|
602
|
|
|
|
|
|
|
# gff3 files can record the chromosome as a gff record |
|
603
|
|
|
|
|
|
|
# process this as a region |
|
604
|
7
|
|
|
|
|
20
|
$self->{seq_ids}{$fields[0]} = $fields[4]; |
|
605
|
7
|
|
|
|
|
95
|
next; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
else { |
|
608
|
|
|
|
|
|
|
# everything else must be some non-standard weird element |
|
609
|
|
|
|
|
|
|
# only process this if the user wants everything |
|
610
|
62
|
100
|
|
|
|
105
|
return &{$self->{convertor_sub}}($self, \@fields) unless $self->simplify; |
|
|
2
|
|
|
|
|
6
|
|
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
} |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
# presumably reached the end of the file |
|
615
|
7
|
|
|
|
|
245
|
$self->{'eof'} = 1; |
|
616
|
7
|
|
|
|
|
21
|
return; |
|
617
|
|
|
|
|
|
|
} |
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub next_top_feature { |
|
620
|
70
|
|
|
70
|
1
|
87
|
my $self = shift; |
|
621
|
|
|
|
|
|
|
# check that we have an open filehandle |
|
622
|
70
|
50
|
|
|
|
89
|
unless ($self->fh) { |
|
623
|
0
|
|
|
|
|
0
|
croak("no GFF3 file loaded to parse!"); |
|
624
|
|
|
|
|
|
|
} |
|
625
|
70
|
50
|
|
|
|
101
|
unless ($self->{'eof'}) { |
|
626
|
0
|
0
|
|
|
|
0
|
$self->parse_file or croak "unable to parse file!"; |
|
627
|
|
|
|
|
|
|
} |
|
628
|
70
|
|
|
|
|
91
|
return shift @{ $self->{top_features} }; |
|
|
70
|
|
|
|
|
144
|
|
|
629
|
|
|
|
|
|
|
} |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
sub top_features { |
|
632
|
4
|
|
|
4
|
1
|
4328
|
my $self = shift; |
|
633
|
4
|
50
|
|
|
|
74
|
unless ($self->{'eof'}) { |
|
634
|
0
|
|
|
|
|
0
|
$self->parse_file; |
|
635
|
|
|
|
|
|
|
} |
|
636
|
4
|
|
|
|
|
10
|
my @features = @{ $self->{top_features} }; |
|
|
4
|
|
|
|
|
17
|
|
|
637
|
4
|
50
|
|
|
|
19
|
return wantarray ? @features : \@features; |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
*parse_table = \&parse_file; |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub parse_file { |
|
643
|
7
|
|
|
7
|
1
|
23
|
my $self = shift; |
|
644
|
|
|
|
|
|
|
# check that we have an open filehandle |
|
645
|
7
|
50
|
|
|
|
18
|
unless ($self->fh) { |
|
646
|
0
|
|
|
|
|
0
|
confess("no file loaded to parse!"); |
|
647
|
|
|
|
|
|
|
} |
|
648
|
7
|
50
|
|
|
|
20
|
return 1 if $self->{'eof'}; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# Each line will be processed into a SeqFeature object, and then checked |
|
651
|
|
|
|
|
|
|
# for parentage. Child objects with a Parent tag will be appropriately |
|
652
|
|
|
|
|
|
|
# associated with its parent, or put into an orphanage. Any orphans |
|
653
|
|
|
|
|
|
|
# left will be checked a final time for parents; if a parent can't be |
|
654
|
|
|
|
|
|
|
# found, it will be lost. Features without a parent are assumed to be |
|
655
|
|
|
|
|
|
|
# top-level features. |
|
656
|
|
|
|
|
|
|
|
|
657
|
7
|
50
|
|
|
|
14
|
printf " Parsing %s format file....\n", |
|
|
|
100
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
$self->version eq '3' ? 'GFF3' : |
|
659
|
|
|
|
|
|
|
$self->version =~ /2\../ ? 'GTF' : 'GFF'; |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
TOP_FEATURE_LOOP: |
|
663
|
7
|
|
|
|
|
40
|
while (my $feature = $self->next_feature) { |
|
664
|
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
### Process the feature |
|
666
|
|
|
|
|
|
|
# check the ID |
|
667
|
576
|
|
|
|
|
1059
|
my $id = $feature->primary_id; |
|
668
|
|
|
|
|
|
|
# if the seqfeature didn't have an ID specified from the file, then |
|
669
|
|
|
|
|
|
|
# Bio::ToolBox::SeqFeature will autogenerate one, but Bio::SeqFeature::Lite |
|
670
|
|
|
|
|
|
|
# will not - so we will likely lose that feature |
|
671
|
576
|
100
|
|
|
|
1778
|
if ($id) { |
|
672
|
|
|
|
|
|
|
# remember this feature since we have an ID |
|
673
|
404
|
50
|
|
|
|
609
|
if (exists $self->{loaded}{$id}) { |
|
674
|
|
|
|
|
|
|
# this ID should be unique in the GFF file |
|
675
|
|
|
|
|
|
|
# otherwise it might be a shared duplicate or a malformed GFF file |
|
676
|
|
|
|
|
|
|
# generally only a concern for top level features |
|
677
|
0
|
0
|
|
|
|
0
|
if ($feature->primary_tag =~ /gene|rna|transcript/) { |
|
678
|
0
|
|
|
|
|
0
|
$self->{duplicate_ids}{$id} += 1; |
|
679
|
|
|
|
|
|
|
} |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# store all of the features as an array |
|
682
|
0
|
0
|
|
|
|
0
|
if (ref($self->{loaded}{$id}) eq 'ARRAY') { |
|
683
|
|
|
|
|
|
|
# there's more than two duplicates! pile it on! |
|
684
|
0
|
|
|
|
|
0
|
push @{ $self->{loaded}{$id} }, $feature; |
|
|
0
|
|
|
|
|
0
|
|
|
685
|
|
|
|
|
|
|
} |
|
686
|
|
|
|
|
|
|
else { |
|
687
|
0
|
|
|
|
|
0
|
my $existing = $self->{loaded}{$id}; |
|
688
|
0
|
|
|
|
|
0
|
$self->{loaded}{$id} = [$existing, $feature]; |
|
689
|
|
|
|
|
|
|
} |
|
690
|
|
|
|
|
|
|
} |
|
691
|
|
|
|
|
|
|
else { |
|
692
|
|
|
|
|
|
|
# unique ID, so remember it |
|
693
|
404
|
|
|
|
|
688
|
$self->{loaded}{$id} = $feature; |
|
694
|
|
|
|
|
|
|
} |
|
695
|
|
|
|
|
|
|
} |
|
696
|
|
|
|
|
|
|
# if the feature didn't have an ID, we'll just assume it is |
|
697
|
|
|
|
|
|
|
# a child of another feature, otherwise it may get lost |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# look for parents and children |
|
700
|
576
|
100
|
|
|
|
918
|
if ($feature->has_tag('Parent')) { |
|
701
|
|
|
|
|
|
|
# must be a child |
|
702
|
|
|
|
|
|
|
# there may be more than one parent, per the GFF3 specification |
|
703
|
411
|
|
|
|
|
1288
|
foreach my $parent_id ( $feature->get_tag_values('Parent') ) { |
|
704
|
411
|
50
|
|
|
|
1821
|
if (exists $self->{loaded}{$parent_id}) { |
|
705
|
|
|
|
|
|
|
# we've seen this id |
|
706
|
|
|
|
|
|
|
# associate the child with the parent |
|
707
|
411
|
|
|
|
|
467
|
my $parent = $self->{loaded}{$parent_id}; |
|
708
|
411
|
|
|
|
|
802
|
$parent->add_SeqFeature($feature); |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
# check boundaries for gtf genes |
|
711
|
|
|
|
|
|
|
# gtf genes may not be explicitly defined so must correct as necessary |
|
712
|
|
|
|
|
|
|
# gff3 files shouldn't have this issue |
|
713
|
411
|
100
|
|
|
|
12852
|
if ($self->{gtf}) { |
|
714
|
312
|
50
|
|
|
|
435
|
if ($feature->start < $parent->start) { |
|
715
|
0
|
|
|
|
|
0
|
$parent->start( $feature->start ); |
|
716
|
|
|
|
|
|
|
} |
|
717
|
312
|
100
|
|
|
|
1864
|
if ($feature->end > $parent->end) { |
|
718
|
1
|
|
|
|
|
2
|
$parent->end( $feature->end ); |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
# check parent's parent too |
|
721
|
312
|
100
|
|
|
|
1824
|
if ($parent->has_tag('Parent')) { |
|
722
|
|
|
|
|
|
|
# in all likelihood parent is a transcript and there is a |
|
723
|
|
|
|
|
|
|
# gene that probably also needs fixin' |
|
724
|
278
|
|
|
|
|
858
|
my ($grandparent_id) = $parent->get_tag_values('Parent'); |
|
725
|
278
|
|
|
|
|
1155
|
my $grandparent = $self->{loaded}{$grandparent_id}; |
|
726
|
278
|
50
|
|
|
|
386
|
if ($feature->start < $grandparent->start) { |
|
727
|
0
|
|
|
|
|
0
|
$grandparent->start( $feature->start ); |
|
728
|
|
|
|
|
|
|
} |
|
729
|
278
|
50
|
|
|
|
1620
|
if ($feature->end > $grandparent->end) { |
|
730
|
0
|
|
|
|
|
0
|
$grandparent->end( $feature->end ); |
|
731
|
|
|
|
|
|
|
} |
|
732
|
|
|
|
|
|
|
} |
|
733
|
|
|
|
|
|
|
} |
|
734
|
|
|
|
|
|
|
} |
|
735
|
|
|
|
|
|
|
else { |
|
736
|
|
|
|
|
|
|
# can't find the parent, maybe not loaded yet? |
|
737
|
|
|
|
|
|
|
# put 'em in the orphanage |
|
738
|
0
|
|
|
|
|
0
|
$self->_add_orphan($feature); |
|
739
|
|
|
|
|
|
|
} |
|
740
|
|
|
|
|
|
|
} |
|
741
|
|
|
|
|
|
|
} |
|
742
|
|
|
|
|
|
|
else { |
|
743
|
|
|
|
|
|
|
# must be a parent |
|
744
|
165
|
|
|
|
|
285
|
push @{ $self->{top_features} }, $feature; |
|
|
165
|
|
|
|
|
415
|
|
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
} |
|
747
|
|
|
|
|
|
|
# Finished loading the GFF lines |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# check for orphans |
|
750
|
7
|
50
|
|
|
|
16
|
if (scalar @{ $self->{orphans} }) { |
|
|
7
|
|
|
|
|
27
|
|
|
751
|
0
|
|
|
|
|
0
|
$self->check_orphanage; |
|
752
|
|
|
|
|
|
|
# report |
|
753
|
0
|
0
|
|
|
|
0
|
if (scalar @{ $self->{orphans} }) { |
|
|
0
|
|
|
|
|
0
|
|
|
754
|
|
|
|
|
|
|
printf " GFF errors: %d features could not be associated with reported parents!\n", |
|
755
|
0
|
|
|
|
|
0
|
scalar @{ $self->{orphans} }; |
|
|
0
|
|
|
|
|
0
|
|
|
756
|
|
|
|
|
|
|
} |
|
757
|
|
|
|
|
|
|
} |
|
758
|
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# report on duplicate IDs |
|
760
|
7
|
50
|
|
|
|
11
|
if (keys %{ $self->{duplicate_ids} }) { |
|
|
7
|
|
|
|
|
964
|
|
|
761
|
|
|
|
|
|
|
printf " GFF errors: %d IDs were duplicated\n", |
|
762
|
0
|
|
|
|
|
0
|
scalar(keys %{ $self->{duplicate_ids} }); |
|
|
0
|
|
|
|
|
0
|
|
|
763
|
|
|
|
|
|
|
} |
|
764
|
|
|
|
|
|
|
|
|
765
|
7
|
|
|
|
|
37
|
return 1; |
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
sub _make_gene_parent { |
|
770
|
|
|
|
|
|
|
# for generating GTF gene parent features |
|
771
|
12
|
|
|
12
|
|
17
|
my ($self, $fields, $gene_id) = @_; |
|
772
|
12
|
|
|
|
|
43
|
my $gene = $SFCLASS->new( |
|
773
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
|
774
|
|
|
|
|
|
|
-primary_tag => 'gene', |
|
775
|
|
|
|
|
|
|
-start => $fields->[3], |
|
776
|
|
|
|
|
|
|
-end => $fields->[4], |
|
777
|
|
|
|
|
|
|
-strand => $fields->[6], |
|
778
|
|
|
|
|
|
|
-primary_id => $gene_id, |
|
779
|
|
|
|
|
|
|
); |
|
780
|
|
|
|
|
|
|
|
|
781
|
12
|
50
|
|
|
|
262
|
if ($fields->[8] =~ /gene_name "([^"]+)";?/) { |
|
782
|
12
|
|
|
|
|
28
|
$gene->display_name($1); |
|
783
|
|
|
|
|
|
|
} |
|
784
|
|
|
|
|
|
|
else { |
|
785
|
0
|
|
|
|
|
0
|
$gene->display_name($gene_id); |
|
786
|
|
|
|
|
|
|
} |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# add extra information if possible |
|
789
|
12
|
100
|
|
|
|
50
|
unless ($self->simplify) { |
|
790
|
2
|
50
|
|
|
|
13
|
if ($fields->[8] =~ /gene_biotype "([^"]+)";?/) { |
|
|
|
50
|
|
|
|
|
|
|
791
|
0
|
|
|
|
|
0
|
$gene->add_tag_value('gene_biotype', $1); |
|
792
|
|
|
|
|
|
|
} |
|
793
|
|
|
|
|
|
|
elsif ($fields->[8] =~ /gene_type "([^"]+)";?/) { |
|
794
|
0
|
|
|
|
|
0
|
$gene->add_tag_value('gene_type', $1); |
|
795
|
|
|
|
|
|
|
} |
|
796
|
2
|
50
|
|
|
|
8
|
if ($fields->[8] =~ /gene_source "([^"]+)";?/) { |
|
797
|
0
|
|
|
|
|
0
|
$gene->source($1); |
|
798
|
|
|
|
|
|
|
} |
|
799
|
|
|
|
|
|
|
} |
|
800
|
12
|
|
|
|
|
22
|
return $gene; |
|
801
|
|
|
|
|
|
|
} |
|
802
|
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
sub _make_rna_parent { |
|
805
|
|
|
|
|
|
|
# for generating GTF gene parent features |
|
806
|
0
|
|
|
0
|
|
0
|
my ($self, $fields, $transcript_id) = @_; |
|
807
|
0
|
|
|
|
|
0
|
my $rna = $SFCLASS->new( |
|
808
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
|
809
|
|
|
|
|
|
|
-primary_tag => 'transcript', |
|
810
|
|
|
|
|
|
|
-start => $fields->[3], |
|
811
|
|
|
|
|
|
|
-end => $fields->[4], |
|
812
|
|
|
|
|
|
|
-strand => $fields->[6], |
|
813
|
|
|
|
|
|
|
-primary_id => $transcript_id, |
|
814
|
|
|
|
|
|
|
); |
|
815
|
|
|
|
|
|
|
|
|
816
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_name "([^"]+)";?/) { |
|
817
|
0
|
|
|
|
|
0
|
$rna->display_name($1); |
|
818
|
|
|
|
|
|
|
} |
|
819
|
|
|
|
|
|
|
else { |
|
820
|
0
|
|
|
|
|
0
|
$rna->display_name($transcript_id); |
|
821
|
|
|
|
|
|
|
} |
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# add extra information if possible |
|
824
|
0
|
0
|
|
|
|
0
|
unless ($self->simplify) { |
|
825
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_biotype "([^"]+)";?/) { |
|
|
|
0
|
|
|
|
|
|
|
826
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('transcript_biotype', $1); |
|
827
|
|
|
|
|
|
|
} |
|
828
|
|
|
|
|
|
|
elsif ($fields->[8] =~ /transcript_type "([^"]+)";?/) { |
|
829
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('transcript_type', $1); |
|
830
|
|
|
|
|
|
|
} |
|
831
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_source "([^"]+)";?/) { |
|
832
|
0
|
|
|
|
|
0
|
$rna->source($1); |
|
833
|
|
|
|
|
|
|
} |
|
834
|
|
|
|
|
|
|
} |
|
835
|
0
|
|
|
|
|
0
|
return $rna; |
|
836
|
|
|
|
|
|
|
} |
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
sub find_gene { |
|
840
|
37
|
|
|
37
|
1
|
11305
|
my $self = shift; |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# check that we have gene2seqf table |
|
843
|
37
|
100
|
|
|
|
63
|
unless (exists $self->{gene2seqf}) { |
|
844
|
5
|
50
|
|
|
|
16
|
croak "must parse file first!" unless $self->{'eof'}; |
|
845
|
5
|
|
|
|
|
14
|
$self->{gene2seqf} = {}; |
|
846
|
5
|
|
|
|
|
7
|
foreach (@{ $self->{top_features} }) { |
|
|
5
|
|
|
|
|
14
|
|
|
847
|
107
|
|
|
|
|
142
|
my $name = lc $_->display_name; |
|
848
|
107
|
50
|
|
|
|
353
|
if (exists $self->{gene2seqf}->{$name}) { |
|
849
|
0
|
|
|
|
|
0
|
push @{ $self->{gene2seqf}->{$name} }, $_; |
|
|
0
|
|
|
|
|
0
|
|
|
850
|
|
|
|
|
|
|
} |
|
851
|
|
|
|
|
|
|
else { |
|
852
|
107
|
|
|
|
|
210
|
$self->{gene2seqf}->{$name} = [$_]; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
|
|
|
|
|
|
} |
|
855
|
|
|
|
|
|
|
} |
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
# get the name and coordinates from arguments |
|
858
|
37
|
|
|
|
|
45
|
my ($name, $id, $chrom, $start, $end, $strand); |
|
859
|
37
|
50
|
|
|
|
79
|
if (scalar @_ == 0) { |
|
|
|
100
|
|
|
|
|
|
|
860
|
0
|
|
|
|
|
0
|
carp "must provide information to find_gene method!"; |
|
861
|
0
|
|
|
|
|
0
|
return; |
|
862
|
|
|
|
|
|
|
} |
|
863
|
|
|
|
|
|
|
elsif (scalar @_ == 1) { |
|
864
|
4
|
|
|
|
|
11
|
$name = $_[0]; |
|
865
|
|
|
|
|
|
|
} |
|
866
|
|
|
|
|
|
|
else { |
|
867
|
33
|
|
|
|
|
54
|
my %opt = @_; |
|
868
|
33
|
|
0
|
|
|
51
|
$name = $opt{name} || $opt{display_name} || undef; |
|
869
|
33
|
|
0
|
|
|
51
|
$id = $opt{id} || $opt{primary_id} || undef; |
|
870
|
33
|
|
50
|
|
|
107
|
$chrom = $opt{chrom} || $opt{seq_id} || undef; |
|
871
|
33
|
|
50
|
|
|
59
|
$start = $opt{start} || undef; |
|
872
|
33
|
|
50
|
|
|
81
|
$end = $opt{stop} || $opt{end} || undef; |
|
873
|
33
|
|
50
|
|
|
63
|
$strand = $opt{strand} || 0; |
|
874
|
|
|
|
|
|
|
} |
|
875
|
37
|
50
|
|
|
|
51
|
unless ($name) { |
|
876
|
0
|
|
|
|
|
0
|
carp "name is required for find_gene!"; |
|
877
|
0
|
|
|
|
|
0
|
return; |
|
878
|
|
|
|
|
|
|
} |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
# check if a gene with this name exists |
|
881
|
37
|
50
|
|
|
|
88
|
if (exists $self->{gene2seqf}->{lc $name} ) { |
|
882
|
|
|
|
|
|
|
# we found a matching gene |
|
883
|
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
# pull out the gene seqfeature(s) array reference |
|
885
|
|
|
|
|
|
|
# there may be more than one gene |
|
886
|
37
|
|
|
|
|
52
|
my $genes = $self->{gene2seqf}->{ lc $name }; |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
# go through a series of checks to find the appropriate |
|
889
|
37
|
100
|
|
|
|
50
|
if ($id) { |
|
890
|
33
|
|
|
|
|
51
|
foreach my $g (@$genes) { |
|
891
|
33
|
50
|
|
|
|
83
|
if ($g->primary_id eq $id) { |
|
892
|
33
|
|
|
|
|
55
|
return $g; |
|
893
|
|
|
|
|
|
|
} |
|
894
|
|
|
|
|
|
|
} |
|
895
|
0
|
|
|
|
|
0
|
return; # none of these matched despite having an ID |
|
896
|
|
|
|
|
|
|
} |
|
897
|
4
|
0
|
33
|
|
|
11
|
if ($chrom and $start and $end) { |
|
|
|
|
33
|
|
|
|
|
|
898
|
0
|
|
|
|
|
0
|
foreach my $g (@$genes) { |
|
899
|
0
|
0
|
0
|
|
|
0
|
if ( |
|
|
|
|
0
|
|
|
|
|
|
900
|
|
|
|
|
|
|
# overlap method borrowed from Bio::RangeI |
|
901
|
|
|
|
|
|
|
($g->strand == $strand) and not ( |
|
902
|
|
|
|
|
|
|
$g->start > $end or |
|
903
|
|
|
|
|
|
|
$g->end < $start |
|
904
|
|
|
|
|
|
|
) |
|
905
|
|
|
|
|
|
|
) { |
|
906
|
|
|
|
|
|
|
# gene and transcript overlap on the same strand |
|
907
|
|
|
|
|
|
|
# we found the intersecting gene |
|
908
|
0
|
|
|
|
|
0
|
return $g; |
|
909
|
|
|
|
|
|
|
} |
|
910
|
|
|
|
|
|
|
} |
|
911
|
0
|
|
|
|
|
0
|
return; # none of these matched despite having coordinate info |
|
912
|
|
|
|
|
|
|
} |
|
913
|
4
|
50
|
|
|
|
10
|
if (scalar @$genes == 1) { |
|
|
|
0
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
# going on trust here that this is the one |
|
915
|
4
|
|
|
|
|
10
|
return $genes->[0]; |
|
916
|
|
|
|
|
|
|
} |
|
917
|
|
|
|
|
|
|
elsif (scalar @$genes > 1) { |
|
918
|
0
|
|
|
|
|
0
|
carp "more than one gene named $name found!"; |
|
919
|
0
|
|
|
|
|
0
|
return $genes->[0]; |
|
920
|
|
|
|
|
|
|
} |
|
921
|
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
# nothing suitable found |
|
923
|
0
|
|
|
|
|
0
|
return; |
|
924
|
|
|
|
|
|
|
} |
|
925
|
|
|
|
|
|
|
} |
|
926
|
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
sub from_gff_string { |
|
929
|
0
|
|
|
0
|
1
|
0
|
my ($self, $string) = @_; |
|
930
|
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
# check string |
|
932
|
0
|
|
|
|
|
0
|
chomp $string; |
|
933
|
0
|
0
|
|
|
|
0
|
unless ($string) { |
|
934
|
0
|
|
|
|
|
0
|
cluck("must pass a string!\n"); |
|
935
|
0
|
|
|
|
|
0
|
return; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
0
|
|
|
|
|
0
|
my @fields = split('\t', $string); |
|
938
|
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
# check convertor |
|
940
|
0
|
0
|
|
|
|
0
|
unless (defined $self->{convertor_sub}) { |
|
941
|
0
|
|
|
|
|
0
|
$self->_assign_gff_converter; |
|
942
|
|
|
|
|
|
|
} |
|
943
|
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
# parse appropriately |
|
945
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
|
0
|
|
|
|
|
0
|
|
|
946
|
|
|
|
|
|
|
} |
|
947
|
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
sub _assign_gff_converter { |
|
950
|
11
|
|
|
11
|
|
18
|
my $self = shift; |
|
951
|
11
|
100
|
|
|
|
31
|
if ($self->{gff3}) { |
|
|
|
50
|
|
|
|
|
|
|
952
|
7
|
|
|
|
|
21
|
$self->{convertor_sub} = \&_gff3_to_seqf; |
|
953
|
|
|
|
|
|
|
} |
|
954
|
|
|
|
|
|
|
elsif ($self->{gtf}) { |
|
955
|
4
|
100
|
|
|
|
9
|
if ($self->simplify) { |
|
956
|
2
|
|
|
|
|
6
|
$self->{convertor_sub} = \&_gtf_to_seqf_simple; |
|
957
|
|
|
|
|
|
|
} |
|
958
|
|
|
|
|
|
|
else { |
|
959
|
2
|
|
|
|
|
6
|
$self->{convertor_sub} = \&_gtf_to_seqf_full; |
|
960
|
|
|
|
|
|
|
} |
|
961
|
|
|
|
|
|
|
# double check we have transcript information |
|
962
|
4
|
50
|
|
|
|
8
|
if ($self->typelist !~ /transcript|rna/i) { |
|
963
|
|
|
|
|
|
|
# we will have to rely on exon and/or cds information to get transcript |
|
964
|
0
|
0
|
0
|
|
|
0
|
unless ($self->do_exon or $self->do_cds) { |
|
965
|
0
|
|
|
|
|
0
|
$self->do_exon(1); |
|
966
|
|
|
|
|
|
|
} |
|
967
|
|
|
|
|
|
|
} |
|
968
|
4
|
50
|
33
|
|
|
11
|
if ($self->do_gene and $self->typelist !~ /gene/i) { |
|
969
|
4
|
|
|
|
|
8
|
$self->do_exon(1); |
|
970
|
|
|
|
|
|
|
} |
|
971
|
|
|
|
|
|
|
} |
|
972
|
|
|
|
|
|
|
else { |
|
973
|
0
|
|
|
|
|
0
|
$self->{convertor_sub} = \&_gff2_to_seqf; |
|
974
|
|
|
|
|
|
|
} |
|
975
|
|
|
|
|
|
|
} |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
sub _gff3_to_seqf { |
|
979
|
266
|
|
|
266
|
|
330
|
my ($self, $fields) = @_; |
|
980
|
266
|
|
|
|
|
313
|
my $group = $fields->[8]; |
|
981
|
266
|
|
|
|
|
369
|
my $feature = $self->_gff_to_seqf($fields); |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
# process groups |
|
984
|
266
|
|
|
|
|
1575
|
foreach my $g (split(/\s*;\s*/, $group)) { |
|
985
|
1382
|
|
|
|
|
3306
|
my ($tag, $value) = split /=/, $g; |
|
986
|
1382
|
|
|
|
|
1836
|
$tag = $self->unescape($tag); |
|
987
|
1382
|
|
|
|
|
2012
|
my @values = map { $self->unescape($_) } split(/,/, $value); |
|
|
1470
|
|
|
|
|
1696
|
|
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
# determine the appropriate attribute based on tag name |
|
990
|
1382
|
100
|
|
|
|
3618
|
if ($tag eq 'Name') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
991
|
266
|
|
|
|
|
489
|
$feature->display_name($values[0]); |
|
992
|
|
|
|
|
|
|
} |
|
993
|
|
|
|
|
|
|
elsif ($tag eq 'ID') { |
|
994
|
167
|
|
|
|
|
330
|
$feature->primary_id($values[0]); |
|
995
|
|
|
|
|
|
|
} |
|
996
|
|
|
|
|
|
|
elsif ($tag eq 'Parent') { |
|
997
|
|
|
|
|
|
|
# always record Parent except for transcripts when genes are not wanted |
|
998
|
99
|
50
|
33
|
|
|
154
|
unless (not $self->do_gene and $feature->primary_tag =~ /rna|transcript/i) { |
|
999
|
99
|
|
|
|
|
121
|
foreach (@values) { |
|
1000
|
99
|
|
|
|
|
177
|
$feature->add_tag_value($tag, $_); |
|
1001
|
|
|
|
|
|
|
} |
|
1002
|
|
|
|
|
|
|
} |
|
1003
|
|
|
|
|
|
|
} |
|
1004
|
|
|
|
|
|
|
elsif (lc $tag eq 'exon_id') { |
|
1005
|
|
|
|
|
|
|
# ensembl GFF3 store the exon id but doesn't record it as the ID, why? |
|
1006
|
0
|
|
|
|
|
0
|
$feature->primary_id($values[0]); |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
elsif (not $self->{simplify}) { |
|
1009
|
2
|
|
|
|
|
4
|
foreach (@values) { |
|
1010
|
2
|
|
|
|
|
8
|
$feature->add_tag_value($tag, $_); |
|
1011
|
|
|
|
|
|
|
} |
|
1012
|
|
|
|
|
|
|
} |
|
1013
|
|
|
|
|
|
|
} |
|
1014
|
|
|
|
|
|
|
|
|
1015
|
266
|
|
|
|
|
965
|
return $feature; |
|
1016
|
|
|
|
|
|
|
} |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
sub _gtf_to_seqf_simple { |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
312
|
|
|
312
|
|
431
|
my ($self, $fields) = @_; |
|
1022
|
312
|
|
|
|
|
388
|
my $group = $fields->[8]; |
|
1023
|
312
|
|
|
|
|
451
|
my $feature = $self->_gff_to_seqf($fields); |
|
1024
|
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
# extract essential tags |
|
1026
|
312
|
|
|
|
|
345
|
my ($gene_id, $transcript_id); |
|
1027
|
312
|
50
|
|
|
|
1104
|
if ($group =~ /gene_id "([^"]+)";?/i) { |
|
1028
|
312
|
|
|
|
|
560
|
$gene_id = $1; |
|
1029
|
|
|
|
|
|
|
} |
|
1030
|
312
|
50
|
|
|
|
793
|
if ($group =~ /transcript_id "([^"]+)";?/i) { |
|
1031
|
312
|
|
|
|
|
441
|
$transcript_id = $1; |
|
1032
|
|
|
|
|
|
|
} |
|
1033
|
312
|
50
|
33
|
|
|
462
|
unless ($gene_id or $transcript_id) { |
|
1034
|
|
|
|
|
|
|
# improperly formatted GTF file without one of these two items, nothing more to do |
|
1035
|
0
|
|
|
|
|
0
|
return $feature; |
|
1036
|
|
|
|
|
|
|
} |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
# common subfeatures including exon, CDS, UTR, and codons |
|
1039
|
312
|
100
|
|
|
|
1013
|
if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1040
|
278
|
|
|
|
|
661
|
$feature->add_tag_value('Parent', $transcript_id); |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
# exon id if present |
|
1043
|
278
|
50
|
66
|
|
|
1489
|
if ($fields->[2] eq 'exon' and $group =~ /exon_id "([^"]+)";?/i) { |
|
1044
|
0
|
|
|
|
|
0
|
$feature->primary_id($1); |
|
1045
|
|
|
|
|
|
|
} |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
# check gene parent |
|
1048
|
278
|
50
|
33
|
|
|
463
|
if ($self->do_gene and not exists $self->{loaded}{$gene_id}) { |
|
1049
|
0
|
|
|
|
|
0
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
|
1050
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id} = $gene; |
|
1051
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $gene; |
|
|
0
|
|
|
|
|
0
|
|
|
1052
|
|
|
|
|
|
|
} |
|
1053
|
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# check transcript parent |
|
1055
|
278
|
50
|
|
|
|
455
|
unless (exists $self->{loaded}{$transcript_id}) { |
|
1056
|
0
|
|
|
|
|
0
|
my $rna = $self->_make_rna_parent($fields, $transcript_id); |
|
1057
|
0
|
|
|
|
|
0
|
$self->{loaded}{$transcript_id} = $rna; |
|
1058
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
|
1059
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('Parent', $gene_id); |
|
1060
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id}->add_SeqFeature($rna); |
|
1061
|
|
|
|
|
|
|
} |
|
1062
|
|
|
|
|
|
|
else { |
|
1063
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $rna; |
|
|
0
|
|
|
|
|
0
|
|
|
1064
|
|
|
|
|
|
|
} |
|
1065
|
|
|
|
|
|
|
} |
|
1066
|
|
|
|
|
|
|
} |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
# a transcript feature |
|
1069
|
|
|
|
|
|
|
elsif ($fields->[2] =~ /rna|transcript/i) { |
|
1070
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
|
1071
|
|
|
|
|
|
|
# but are not required and often absent |
|
1072
|
34
|
|
|
|
|
79
|
$feature->primary_id($transcript_id); |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
# transcript information |
|
1075
|
34
|
50
|
|
|
|
192
|
if ($group =~ /transcript_name "([^"]+)";?/i) { |
|
1076
|
34
|
|
|
|
|
69
|
$feature->display_name($1); |
|
1077
|
|
|
|
|
|
|
} |
|
1078
|
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
# check whether parent was loaded and add gene information if not |
|
1080
|
34
|
50
|
|
|
|
158
|
if ($self->do_gene) { |
|
1081
|
34
|
|
|
|
|
77
|
$feature->add_tag_value('Parent', $gene_id); |
|
1082
|
34
|
100
|
|
|
|
155
|
if (not exists $self->{loaded}{$gene_id}) { |
|
1083
|
10
|
|
|
|
|
24
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
|
1084
|
10
|
|
|
|
|
18
|
$self->{loaded}{$gene_id} = $gene; |
|
1085
|
10
|
|
|
|
|
14
|
push @{ $self->{top_features} }, $gene; |
|
|
10
|
|
|
|
|
18
|
|
|
1086
|
|
|
|
|
|
|
} |
|
1087
|
|
|
|
|
|
|
} |
|
1088
|
|
|
|
|
|
|
} |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
# a gene feature |
|
1091
|
|
|
|
|
|
|
elsif ($fields->[2] eq 'gene') { |
|
1092
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
|
1093
|
|
|
|
|
|
|
# but are not required and often absent |
|
1094
|
0
|
|
|
|
|
0
|
$feature->primary_id($gene_id); |
|
1095
|
0
|
0
|
|
|
|
0
|
if ($group =~ /gene_name "([^"]+)";?/i) { |
|
1096
|
0
|
|
|
|
|
0
|
$feature->display_name($1); |
|
1097
|
|
|
|
|
|
|
} |
|
1098
|
|
|
|
|
|
|
} |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
# anything else, like CNS (conserved noncoding sequence) doesn't get any |
|
1101
|
|
|
|
|
|
|
# further special attributes, as far as I can tell |
|
1102
|
312
|
|
|
|
|
1394
|
return $feature; |
|
1103
|
|
|
|
|
|
|
} |
|
1104
|
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
sub _gtf_to_seqf_full { |
|
1107
|
|
|
|
|
|
|
|
|
1108
|
2
|
|
|
2
|
|
6
|
my ($self, $fields) = @_; |
|
1109
|
2
|
|
|
|
|
4
|
my $group = $fields->[8]; |
|
1110
|
2
|
|
|
|
|
5
|
my $feature = $self->_gff_to_seqf($fields); |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# process the group tags |
|
1113
|
2
|
|
|
|
|
3
|
my %attributes; |
|
1114
|
2
|
|
|
|
|
9
|
foreach my $g (split('; ', $group)) { # supposed to be "; " as delimiter |
|
1115
|
10
|
|
|
|
|
19
|
my ($tag, $value, @bits) = split ' ', $g; |
|
1116
|
10
|
|
|
|
|
25
|
$value =~ s/[";]//g; # remove the flanking double quotes, assume no internal quotes |
|
1117
|
10
|
|
|
|
|
20
|
$attributes{$tag} = $value; |
|
1118
|
|
|
|
|
|
|
} |
|
1119
|
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
# assign special tags based on the feature type |
|
1121
|
2
|
50
|
|
|
|
16
|
if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1122
|
0
|
|
|
|
|
0
|
$feature->add_tag_value('Parent', $attributes{'transcript_id'}); |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
# exon id if present |
|
1125
|
0
|
0
|
0
|
|
|
0
|
if ($fields->[2] eq 'exon' and exists $attributes{'exon_id'}) { |
|
1126
|
0
|
|
|
|
|
0
|
$feature->primary_id($attributes{'exon_id'}); |
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
# check gene parent |
|
1130
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
|
1131
|
0
|
|
0
|
|
|
0
|
my $gene_id = $attributes{gene_id} || undef; |
|
1132
|
0
|
0
|
0
|
|
|
0
|
if ($gene_id and not exists $self->{loaded}{$gene_id}) { |
|
1133
|
0
|
|
|
|
|
0
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
|
1134
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id} = $gene; |
|
1135
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $gene; |
|
|
0
|
|
|
|
|
0
|
|
|
1136
|
|
|
|
|
|
|
} |
|
1137
|
|
|
|
|
|
|
} |
|
1138
|
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
# check transcript parent |
|
1140
|
0
|
|
0
|
|
|
0
|
my $transcript_id = $attributes{transcript_id} || undef; |
|
1141
|
0
|
0
|
0
|
|
|
0
|
if ($transcript_id and not exists $self->{loaded}{$transcript_id}) { |
|
1142
|
0
|
|
|
|
|
0
|
my $rna = $self->_make_rna_parent($fields, $transcript_id); |
|
1143
|
0
|
|
|
|
|
0
|
$self->{loaded}{$transcript_id} = $rna; |
|
1144
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
|
1145
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('Parent', $attributes{gene_id}); |
|
1146
|
0
|
|
|
|
|
0
|
$self->{loaded}{ $attributes{gene_id} }->add_SeqFeature($rna); |
|
1147
|
|
|
|
|
|
|
} |
|
1148
|
|
|
|
|
|
|
else { |
|
1149
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $rna; |
|
|
0
|
|
|
|
|
0
|
|
|
1150
|
|
|
|
|
|
|
} |
|
1151
|
|
|
|
|
|
|
} |
|
1152
|
|
|
|
|
|
|
} |
|
1153
|
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
# transcript |
|
1155
|
|
|
|
|
|
|
elsif ($fields->[2] =~ /transcript|rna/i) { |
|
1156
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
# transcript information |
|
1159
|
2
|
|
|
|
|
10
|
$feature->primary_id($attributes{transcript_id}); |
|
1160
|
2
|
|
|
|
|
10
|
delete $attributes{transcript_id}; |
|
1161
|
2
|
50
|
|
|
|
5
|
if (exists $attributes{transcript_name}) { |
|
1162
|
2
|
|
|
|
|
7
|
$feature->display_name($attributes{transcript_name}); |
|
1163
|
2
|
|
|
|
|
10
|
delete $attributes{transcript_name}; |
|
1164
|
|
|
|
|
|
|
} |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
# check gene parent |
|
1167
|
2
|
50
|
|
|
|
5
|
if ($self->do_gene) { |
|
1168
|
2
|
|
50
|
|
|
5
|
my $gene_id = $attributes{gene_id} || undef; |
|
1169
|
2
|
50
|
33
|
|
|
9
|
if ($gene_id and not exists $self->{loaded}{$gene_id}) { |
|
1170
|
2
|
|
|
|
|
8
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
|
1171
|
2
|
|
|
|
|
4
|
$self->{loaded}{$gene_id} = $gene; |
|
1172
|
2
|
|
|
|
|
4
|
push @{ $self->{top_features} }, $gene; |
|
|
2
|
|
|
|
|
6
|
|
|
1173
|
|
|
|
|
|
|
} |
|
1174
|
2
|
|
|
|
|
8
|
$feature->add_tag_value('Parent', $gene_id); |
|
1175
|
|
|
|
|
|
|
} |
|
1176
|
|
|
|
|
|
|
} |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
# gene |
|
1179
|
|
|
|
|
|
|
elsif ($fields->[2] eq 'gene') { |
|
1180
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
|
1181
|
|
|
|
|
|
|
# but are not required and often absent |
|
1182
|
0
|
|
|
|
|
0
|
$feature->primary_id($attributes{gene_id}); |
|
1183
|
0
|
|
|
|
|
0
|
delete $attributes{gene_id}; |
|
1184
|
0
|
0
|
|
|
|
0
|
if (exists $attributes{gene_name}) { |
|
1185
|
0
|
|
|
|
|
0
|
$feature->display_name($attributes{gene_name}); |
|
1186
|
0
|
|
|
|
|
0
|
delete $attributes{gene_name}; |
|
1187
|
|
|
|
|
|
|
} |
|
1188
|
|
|
|
|
|
|
} |
|
1189
|
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
# store remaining attributes, if any |
|
1191
|
2
|
|
|
|
|
14
|
foreach my $key (keys %attributes) { |
|
1192
|
6
|
|
|
|
|
19
|
$feature->add_tag_value($key, $attributes{$key}); |
|
1193
|
|
|
|
|
|
|
} |
|
1194
|
2
|
|
|
|
|
14
|
return $feature; |
|
1195
|
|
|
|
|
|
|
} |
|
1196
|
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
sub _gff2_to_seqf { |
|
1199
|
|
|
|
|
|
|
# generic gff1 or gff2 format or poorly defined gff3 or gtf file |
|
1200
|
|
|
|
|
|
|
# hope for the best! |
|
1201
|
0
|
|
|
0
|
|
0
|
my ($self, $fields) = @_; |
|
1202
|
0
|
|
|
|
|
0
|
my $feature = $self->_gff_to_seqf($fields); |
|
1203
|
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
# process groups |
|
1205
|
|
|
|
|
|
|
# we have no uniform method of combining features, so we'll leave the tags |
|
1206
|
|
|
|
|
|
|
# as is and hope for the best |
|
1207
|
0
|
|
|
|
|
0
|
foreach my $g (split(/\s*;\s*/, $fields->[8])) { |
|
1208
|
0
|
|
|
|
|
0
|
my ($tag, $value) = split /\s+/, $g; |
|
1209
|
0
|
0
|
0
|
|
|
0
|
next unless ($tag and $value); |
|
1210
|
0
|
|
|
|
|
0
|
$feature->add_tag_value($tag, $value); |
|
1211
|
|
|
|
|
|
|
} |
|
1212
|
0
|
|
|
|
|
0
|
return $feature; |
|
1213
|
|
|
|
|
|
|
} |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
sub _gff_to_seqf { |
|
1217
|
580
|
|
|
580
|
|
709
|
my ($self, $fields) = @_; |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# generate the basic SeqFeature |
|
1220
|
580
|
|
|
|
|
2006
|
my $feature = $SFCLASS->new( |
|
1221
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
|
1222
|
|
|
|
|
|
|
-source => $fields->[1], |
|
1223
|
|
|
|
|
|
|
-primary_tag => $fields->[2], |
|
1224
|
|
|
|
|
|
|
-start => $fields->[3], |
|
1225
|
|
|
|
|
|
|
-end => $fields->[4], |
|
1226
|
|
|
|
|
|
|
-strand => $fields->[6], |
|
1227
|
|
|
|
|
|
|
); |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
# add more attributes if they're not null |
|
1230
|
580
|
50
|
|
|
|
10084
|
if ($fields->[5] ne '.') { |
|
1231
|
0
|
|
|
|
|
0
|
$feature->score($fields->[5]); |
|
1232
|
|
|
|
|
|
|
} |
|
1233
|
580
|
100
|
|
|
|
895
|
if ($fields->[7] ne '.') { |
|
1234
|
199
|
|
|
|
|
352
|
$feature->phase($fields->[7]); |
|
1235
|
|
|
|
|
|
|
} |
|
1236
|
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
# finished |
|
1238
|
580
|
|
|
|
|
979
|
return $feature; |
|
1239
|
|
|
|
|
|
|
} |
|
1240
|
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
sub unescape { |
|
1243
|
|
|
|
|
|
|
# Borrowed unashamedly from bioperl Bio::Tools::GFF |
|
1244
|
|
|
|
|
|
|
# which in turn was borrowed from Bio::DB::GFF |
|
1245
|
2852
|
|
|
2852
|
1
|
2624
|
my $self = shift; |
|
1246
|
2852
|
|
|
|
|
2644
|
my $v = shift; |
|
1247
|
2852
|
|
|
|
|
3101
|
$v =~ tr/+/ /; |
|
1248
|
2852
|
|
|
|
|
3636
|
$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; |
|
|
5574
|
|
|
|
|
10458
|
|
|
1249
|
2852
|
|
|
|
|
4421
|
return $v; |
|
1250
|
|
|
|
|
|
|
} |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
sub _add_orphan { |
|
1254
|
0
|
|
|
0
|
|
|
my ($self, $feature) = @_; |
|
1255
|
0
|
|
|
|
|
|
push @{ $self->{orphans} }, $feature; |
|
|
0
|
|
|
|
|
|
|
|
1256
|
0
|
|
|
|
|
|
return 1; |
|
1257
|
|
|
|
|
|
|
} |
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
sub check_orphanage { |
|
1261
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
1262
|
0
|
0
|
|
|
|
|
return unless scalar @{ $self->{orphans} }; |
|
|
0
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
# go through the list of orphans |
|
1265
|
0
|
|
|
|
|
|
my @reunited; # list of indices to delete after reuniting orphan with parent |
|
1266
|
0
|
|
|
|
|
|
for (my $i = 0; $i < scalar @{ $self->{orphans} }; $i++) { |
|
|
0
|
|
|
|
|
|
|
|
1267
|
0
|
|
|
|
|
|
my $orphan = $self->{orphans}->[$i]; |
|
1268
|
0
|
|
|
|
|
|
my $success = 0; |
|
1269
|
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# find the parent |
|
1271
|
0
|
|
|
|
|
|
foreach my $parent ($orphan->get_tag_values('Parent') ) { |
|
1272
|
0
|
0
|
|
|
|
|
if (exists $self->{loaded}{$parent}) { |
|
1273
|
|
|
|
|
|
|
# we have loaded the parent |
|
1274
|
|
|
|
|
|
|
# associate each orphan feature with the parent |
|
1275
|
0
|
|
|
|
|
|
$self->{loaded}{$parent}->add_SeqFeature($orphan); |
|
1276
|
0
|
|
|
|
|
|
$success++; |
|
1277
|
|
|
|
|
|
|
} |
|
1278
|
|
|
|
|
|
|
} |
|
1279
|
|
|
|
|
|
|
# delete the orphan from the array if it found it's long lost parent |
|
1280
|
0
|
0
|
|
|
|
|
push @reunited, $i if $success; |
|
1281
|
|
|
|
|
|
|
} |
|
1282
|
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# clean up the orphanage |
|
1284
|
0
|
|
|
|
|
|
while (@reunited) { |
|
1285
|
0
|
|
|
|
|
|
my $i = pop @reunited; |
|
1286
|
0
|
|
|
|
|
|
splice(@{ $self->{orphans} }, $i, 1); |
|
|
0
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
} |
|
1288
|
|
|
|
|
|
|
} |
|
1289
|
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
sub orphans { |
|
1291
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
1292
|
0
|
|
|
|
|
|
my @orphans; |
|
1293
|
0
|
|
|
|
|
|
foreach (@{ $self->{orphans} }) { |
|
|
0
|
|
|
|
|
|
|
|
1294
|
0
|
|
|
|
|
|
push @orphans, $_; |
|
1295
|
|
|
|
|
|
|
} |
|
1296
|
0
|
0
|
|
|
|
|
return wantarray ? @orphans : \@orphans; |
|
1297
|
|
|
|
|
|
|
} |
|
1298
|
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
sub comments { |
|
1301
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
1302
|
0
|
|
|
|
|
|
my @comments; |
|
1303
|
0
|
|
|
|
|
|
foreach (@{ $self->{comments} }) { |
|
|
0
|
|
|
|
|
|
|
|
1304
|
0
|
|
|
|
|
|
push @comments, $_; |
|
1305
|
|
|
|
|
|
|
} |
|
1306
|
0
|
0
|
|
|
|
|
return wantarray ? @comments : \@comments; |
|
1307
|
|
|
|
|
|
|
} |
|
1308
|
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
sub seq_ids { |
|
1311
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
1312
|
0
|
0
|
|
|
|
|
unless (scalar keys %{$self->{seq_ids}}) { |
|
|
0
|
|
|
|
|
|
|
|
1313
|
0
|
|
|
|
|
|
$self->_get_seq_ids; |
|
1314
|
|
|
|
|
|
|
} |
|
1315
|
0
|
|
|
|
|
|
my @s = keys %{$self->{seq_ids}}; |
|
|
0
|
|
|
|
|
|
|
|
1316
|
0
|
0
|
|
|
|
|
return wantarray ? @s : \@s; |
|
1317
|
|
|
|
|
|
|
} |
|
1318
|
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
sub seq_id_lengths { |
|
1321
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
1322
|
0
|
0
|
|
|
|
|
unless (scalar keys %{$self->{seq_ids}}) { |
|
|
0
|
|
|
|
|
|
|
|
1323
|
0
|
|
|
|
|
|
$self->_get_seq_ids; |
|
1324
|
|
|
|
|
|
|
} |
|
1325
|
0
|
|
|
|
|
|
return $self->{seq_ids}; |
|
1326
|
|
|
|
|
|
|
} |
|
1327
|
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
sub _get_seq_ids { |
|
1329
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
1330
|
0
|
0
|
|
|
|
|
return unless $self->{'eof'}; |
|
1331
|
0
|
|
|
|
|
|
foreach (@{ $self->{top_features} }) { |
|
|
0
|
|
|
|
|
|
|
|
1332
|
0
|
|
|
|
|
|
my $s = $_->seq_id; |
|
1333
|
0
|
0
|
|
|
|
|
unless (exists $self->{seq_ids}{$s}) { |
|
1334
|
0
|
|
|
|
|
|
$self->{seq_ids}{$s} = 1; |
|
1335
|
|
|
|
|
|
|
} |
|
1336
|
0
|
0
|
|
|
|
|
$self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s}; |
|
1337
|
|
|
|
|
|
|
} |
|
1338
|
|
|
|
|
|
|
} |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
__END__ |