File Coverage

blib/lib/Bio/ToolBox/parser/gff.pm

Criterion	Covered	Total	%
statement	278	488	56.9
branch	149	306	48.6
condition	17	81	20.9
subroutine	25	38	65.7
pod	22	25	88.0
total	491	938	52.3

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::parser::gff;
2
3							our $VERSION = '1.66';
4
5							=head1 NAME
6
7							Bio::ToolBox::parser::gff - parse GFF3, GTF, and GFF files
8
9							=head1 SYNOPSIS
10
11							use Bio::ToolBox::parser::gff;
12							my $filename = 'file.gff3';
13
14							my $parser = Bio::ToolBox::parser::gff->new(
15							file => $filename,
16							do_gene => 1,
17							do_exon => 1,
18							) or die "unable to open gff file!\n";
19
20							while (my $feature = $parser->next_top_feature() ) {
21							# each $feature is a SeqFeature object
22							my @children = $feature->get_SeqFeatures();
23							}
24
25							=head1 DESCRIPTION
26
27							This module parses a GFF file into SeqFeature objects. It natively
28							handles GFF3, GTF, and general GFF files.
29
30							For both GFF3 and GTF files, fully nested gene models, typically
31							gene =E transcript =E (exon, CDS, etc), may be built using the appropriate
32							attribute tags. For GFF3 files, these include ID and Parent tags; for GTF
33							these include C and C tags.
34
35							For GFF3 files, any feature without a C tag is assumed to be a
36							parent. Children features referencing a parent feature that has not been
37							loaded are considered orphans. Orphans are attempted to be re-associated
38							with missing parents after the file is completely parsed. Any orphans left
39							may be collected. Files with orphans are considered poorly formatted or
40							incomplete and should be fixed. Multiple parentage, for example exons
41							shared between different transcripts of the same gene, are fully supported.
42
43							Embedded Fasta sequences are ignored, as are most comment and pragma lines.
44
45							The SeqFeature objects that are returned are L
46							objects. Refer to that documentation for more information.
47
48							=head1 METHODS
49
50							=head2 Initialize and modify the parser.
51
52							These are class methods to initialize the parser with an annotation file
53							and modify the parsing behavior. Most parameters can be set either upon
54							initialization or as class methods on the object. Unpredictable behavior
55							may occur if you implement these in the midst of parsing a file.
56
57							Do not open subsequent files with the same object. Always create a new
58							object to parse a new file
59
60							=over 4
61
62							=item new
63
64							my $parser = Bio::ToolBox::parser::gff->new($filename);
65							my $parser = Bio::ToolBox::parser::gff->new(
66							file => 'file.gtf.gz',
67							do_gene => 1,
68							do_utr => 1,
69							);
70
71							Initialize a new gff parser object. Pass a single value (a GFF file name)
72							to open a file. Alternatively, pass an array of key value pairs to control
73							how the file is parsed. Options include the following.
74
75							=over 4
76
77							=item file
78
79							Provide a GFF file name to be parsed. It should have a gff, gtf, or gff3 file
80							extension. The file may be gzip compressed.
81
82							=item version
83
84							Specify the version. Normally this is not needed, as version can be determined
85							either from the file extension (in the case of gtf and gff3) or from the
86							C<##gff-version> pragma at the top of the file. Acceptable values include 1, 2,
87							2.5 (gtf), or 3.
88
89							=item class
90
91							Pass the name of a L compliant class that will be used to
92							create the SeqFeature objects. The default is to use L.
93
94							=item simplify
95
96							Pass a boolean value to simplify the SeqFeature objects parsed from the GFF
97							file and ignore extraneous attributes.
98
99							=item do_gene
100
101							Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
102							name under a single gene object. Default is true.
103
104							=item do_cds
105
106							=item do_exon
107
108							=item do_utr
109
110							=item do_codon
111
112							Pass a boolean (1 or 0) value to parse certain subfeatures. Exon subfeatures
113							are always parsed, but C, C, C, C,
114							and C features may be optionally parsed. Default is false.
115
116							=back
117
118							=item open_file
119
120							$parser->open_file($file) or die "unable to open $file!";
121
122							Pass the name of a GFF file to be parsed. The file may optionally be gzipped
123							(F<.gz> extension). Do not open a new file when one has already opened a file.
124							Create a new object for a new file, or concatenate the GFF files.
125
126							=item version
127
128							Set or get the GFF version of the current file. Acceptable values include 1, 2,
129							2.5 (gtf), or 3. Normally this is determined by file extension or
130							C pragma on the first line, and should not need to be set by the
131							user in most circumstances.
132
133							=item simplify
134
135							Pass a boolean true value to simplify the attributes of GFF3 and GTF files
136							that may have considerable numbers of tags, e.g. Ensembl files. Only
137							essential information, including name, ID, and parentage, is retained.
138							Useful if you're trying to quickly parse annotation files for basic
139							information.
140
141							=back
142
143							=head2 Feature retrieval
144
145							The following methods parse the GFF file lines into SeqFeature objects.
146							It is best if these methods are not mixed; unexpected results may occur.
147
148							=over 4
149
150							=item next_top_feature
151
152							This method will return a top level parent SeqFeature object
153							assembled with child features as sub-features. For example, a gene
154							object with mRNA subfeatures, which in turn may have exon and/or CDS
155							subfeatures. Child features are assembled based on the existence of
156							proper Parent attributes in child features. If no Parent attributes are
157							included in the GFF file, then this will behave as L.
158
159							Child features (those containing a C attribute)
160							are associated with the parent feature. A warning will be issued about lost
161							children (orphans). Shared subfeatures, for example exons common to
162							multiple transcripts, are associated properly with each parent. An opportunity
163							to rescue orphans is available using the L method.
164
165							Note that subfeatures may not necessarily be in ascending genomic order
166							when associated with the feature, depending on their order in the GFF3
167							file and whether shared subfeatures are present or not. When calling
168							subfeatures in your program, you may want to sort the subfeatures. For
169							example
170
171							my @subfeatures = map { $_->[0] }
172							sort { $a->[1] <=> $b->[1] }
173							map { [$_, $_->start] }
174							$parent->get_SeqFeatures;
175
176							=item top_features
177
178							This method will return an array of the top (parent) features defined in
179							the GFF file. This is similar to the L method except that
180							all features are returned at once.
181
182							=item next_feature
183
184							This method will return a SeqFeature object representation of
185							the next feature (line) in the file. Parent - child relationships are
186							NOT assembled; however, undefined parents in a GTF file may still be
187							generated, just not returned.
188
189							This method is best used with simple GFF files with no hierarchies
190							present. This may be used in a while loop until the end of the file
191							is reached. Pragmas are ignored and comment lines and sequence are
192							automatically skipped.
193
194							=back
195
196							=head2 Other methods
197
198							Additional methods for working with the parser object and the parsed
199							SeqFeature objects.
200
201							=over 4
202
203							=item fh
204
205							This method returns the L object of the opened GFF file.
206
207							=item parse_file
208
209							Parses the entire file into memory. This is automatically called when
210							either L or L is called.
211
212							=item find_gene
213
214							my $gene = $parser->find_gene(
215							name => $display_name,
216							id => $primary_id,
217							) or warn "gene $display_name can not be found!";
218
219							Pass a gene name, or an array of key =E values (C, C,
220							C, C, and/or coordinate information), that can be used
221							to find a gene already loaded into memory. Only useful after
222							is called. Genes with a matching name are confirmed by a matching ID or
223							overlapping coordinates, if available. Otherwise the first match is returned.
224
225							=item orphans
226
227							my @orphans = $parser->orphans;
228							printf "we have %d orphans left over!", scalar @orpans;
229
230							This method will return an array of orphan SeqFeature objects that indicated
231							they had a parent but said parent could not be found. Typically, this is an
232							indication of an incomplete or malformed GFF3 file. Nevertheless, it might
233							be a good idea to check this after retrieving all top features.
234
235							=item comments
236
237							This method will return an array of the comment or pragma lines that may have
238							been in the parsed file. These may or may not be useful.
239
240							=item typelist
241
242							Returns a comma-delimited string of the GFF primary tags (column 3)
243							observed in the first 1000 lines of an opened file. Useful for
244							checking what is in the GFF file. See
245							L.
246
247							=item from_gff_string
248
249							my $seqfeature = $parser->from_gff_string($string);
250
251							This method will parse a single GFF, GTF, or GFF3 formatted string or line
252							of text and return a SeqFeature object.
253
254							=item unescape
255
256							This method will unescape special characters in a text string. Certain
257							characters, including ";" and "=", are reserved for GFF3 formatting and
258							are not allowed, thus requiring them to be escaped.
259
260							=item seq_ids
261
262							Returns an array or array reference of the names of the chromosomes or
263							reference sequences present in the file. These may be defined by GFF3
264							sequence-region pragmas or inferred from the features.
265
266							=item seq_id_lengths
267
268							my $seq2len = $parser->seq_id_lengths;
269							foreach (keys %$seq2len) {
270							printf "chromosome %s is %d bp long\n", $_, $seq2len->{$_};
271							}
272
273							Returns a hash reference to the chromosomes or reference sequences and
274							their corresponding lengths. In this case, the length is either defined
275							by the C pragma or inferred by the greatest end position of
276							the top features.
277
278							=back
279
280							=head1 SEE ALSO
281
282							L, L, L
283
284							=cut
285
286	1			1		71575	use strict;
	1					2
	1					25
287	1			1		4	use Carp qw(carp cluck croak);
	1					2
	1					47
288	1			1		443	use Bio::ToolBox::Data::file;
	1					2
	1					4216
289							my $SFCLASS = 'Bio::ToolBox::SeqFeature'; # alternative to Bio::SeqFeature::Lite
290							eval "require $SFCLASS" or croak $@;
291
292							1;
293
294							sub new {
295	11			11	1	11117	my $class = shift;
296	11					148	my $self = {
297							'fh' => undef,
298							'top_features' => [],
299							'orphans' => [],
300							'duplicate_ids' => {},
301							'loaded' => {},
302							'eof' => 0,
303							'do_gene' => 1,
304							'do_exon' => 0,
305							'do_cds' => 0,
306							'do_utr' => 0,
307							'do_codon' => 0,
308							'gff3' => 0,
309							'gtf' => 0,
310							'comments' => [],
311							'seq_ids' => {},
312							'simplify' => 0,
313							'typelist' => '',
314							'convertor_sub' => undef,
315							};
316	11					29	bless $self, $class;
317
318							# check for options
319	11	100				35	if (@_) {
320	8	50				25	if (scalar @_ == 1) {
321	0	0				0	$self->open_file($_[0]) or croak "unable to open file!";
322							}
323							else {
324	8					28	my %options = @_;
325	8	100				24	if (exists $options{simplify}) {
326	4					15	$self->simplify( $options{simplify} );
327							}
328	8	100				22	if (exists $options{do_gene}) {
329	4					13	$self->do_gene($options{do_gene});
330							}
331	8	100				18	if (exists $options{do_exon}) {
332	2					10	$self->do_exon($options{do_exon});
333							}
334	8	100				20	if (exists $options{do_cds}) {
335	4					13	$self->do_cds($options{do_cds});
336							}
337	8	50				20	if (exists $options{do_utr}) {
338	0					0	$self->do_utr($options{do_utr});
339							}
340	8	50				20	if (exists $options{do_codon}) {
341	0					0	$self->do_codon($options{do_codon});
342							}
343	8	50				19	if (exists $options{version}) {
344	0					0	$self->version($options{version});
345							}
346	8	50	33			23	if (exists $options{file} or $options{table}) {
347	8		33			16	$options{file} \|\|= $options{table};
348	8	50				30	$self->open_file( $options{file} ) or
349							croak "unable to open file!";
350							}
351	8	50				22	if (exists $options{class}) {
352	8					16	my $class = $options{class};
353	8	50				526	if (eval "require $class; 1") {
354	8					28	$SFCLASS = $class;
355							}
356							else {
357	0					0	croak $@;
358							}
359							}
360							}
361							}
362
363							# done
364	11					45	return $self;
365							}
366
367							sub do_gene {
368	593			593	1	1419	my $self = shift;
369	593	100				919	if (@_) {
370	7					16	$self->{'do_gene'} = shift;
371							}
372	593					1373	return $self->{'do_gene'};
373							}
374
375							sub do_exon {
376	188			188	1	218	my $self = shift;
377	188	100				321	if (@_) {
378	6					10	$self->{'do_exon'} = shift;
379							}
380	188					339	return $self->{'do_exon'};
381							}
382
383							sub do_cds {
384	272			272	1	324	my $self = shift;
385	272	100				399	if (@_) {
386	5					9	$self->{'do_cds'} = shift;
387							}
388	272					419	return $self->{'do_cds'};
389							}
390
391							sub do_utr {
392	0			0	1	0	my $self = shift;
393	0	0				0	if (@_) {
394	0					0	$self->{'do_utr'} = shift;
395							}
396	0					0	return $self->{'do_utr'};
397							}
398
399							sub do_codon {
400	34			34	1	40	my $self = shift;
401	34	50				77	if (@_) {
402	0					0	$self->{'do_codon'} = shift;
403							}
404	34					62	return $self->{'do_codon'};
405							}
406
407							sub do_name {
408							# this does nothing other than maintain compatibility with ucsc parser
409	0			0	0	0	return 0;
410							}
411
412							sub share {
413							# this does nothing other than maintain compatibility with ucsc parser
414	0			0	0	0	return 1;
415							}
416
417							sub simplify {
418	85			85	1	91	my $self = shift;
419	85	100				139	if (defined $_[0]) {
420	7					12	$self->{simplify} = shift;
421							}
422	85					854	return $self->{simplify};
423							}
424
425							sub version {
426	24			24	1	1102	my $self = shift;
427	24	100				47	if (@_) {
428	11					21	my $v = shift;
429	11	100	33			37	if ($v eq '3') {
		50	0
		0
430	7					22	$self->{version} = $v;
431	7					11	$self->{gff3} = 1;
432							}
433							elsif ($v eq '2.5' or $v eq '2.2') {
434	4					11	$self->{version} = '2.5';
435	4					8	$self->{gtf} = 1;
436							}
437							elsif ($v eq '2' or $v eq '1') {
438	0					0	$self->{version} = $v;
439							}
440							else {
441	0					0	warn "unrecognized GFF version '$v'!\n";
442							}
443							}
444	24					299	return $self->{version};
445							}
446
447							sub open_file {
448	11			11	1	20	my $self = shift;
449
450							# check file
451	11					13	my $filename = shift;
452	11	50				18	unless ($filename) {
453	0					0	cluck("no file name passed!\n");
454	0					0	return;
455							}
456
457							# check type list
458	11					99	my $typelist = Bio::ToolBox::Data::file->sample_gff_type_list($filename);
459	11	50				57	if ($typelist !~ /\w+/) {
460	0					0	print "GFF file has no evident types!? $filename may not be a valid GFF file\n";
461	0					0	return;
462							}
463	11					37	$self->{typelist} = $typelist;
464
465							# Open filehandle object
466	11	50				84	my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
467							croak " cannot open file '$filename'!\n";
468
469							# check gff version pragma
470	11					220	my $first = $fh->getline;
471	11	50				364	if ($first =~ /^##gff.version\s+([\d\.]+)\s*$/i) {
472							# override any version that may have been inferred from the extension
473							# based on the assumption that this pragma is correct
474	11					41	$self->version($1);
475							}
476							else {
477							# no pragma, reopen the file
478	0					0	$fh->close;
479	0					0	$fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
480							# set version based on file type extension????
481	0	0				0	if ($filename =~ /\.gtf.*$/i) {
		0
482	0					0	$self->version('2.5');
483	0					0	$self->{gtf} = 1;
484							}
485							elsif ($filename =~ /\.gff3.*$/i) {
486	0					0	$self->version('3');
487	0					0	$self->{gff3} = 1;
488							}
489							}
490	11					22	$self->{fh} = $fh;
491	11					36	return 1;
492							}
493
494							sub fh {
495	668			668	1	1966	return shift->{fh};
496							}
497
498							sub typelist {
499	13			13	1	66	return shift->{typelist};
500							}
501
502							sub next_feature {
503	587			587	1	3053	my $self = shift;
504
505							# check that we have an open filehandle
506	587	50				837	unless ($self->fh) {
507	0					0	croak("no GFF file loaded to parse!");
508							}
509	587	50				868	return if $self->{'eof'};
510
511							# check convertor
512	587	100				872	unless (defined $self->{convertor_sub}) {
513	11					26	$self->_assign_gff_converter;
514							}
515
516							# look for the next feature line
517	587					9255	while (my $line = $self->{fh}->getline) {
518
519							# check first character
520	770					13217	my $firstchar = substr($line, 0, 1);
521
522							# skip any comment and pragma lines that we might encounter
523	770	100				1392	if ($firstchar eq '#') {
		50
524	25	50				63	if ($line =~ /^###$/) {
		50
525							# a close pragma, all we can do is check for orphans
526							# a properly written shouldn't have any orphans, but just in case
527	0					0	$self->check_orphanage;
528	0					0	next;
529							}
530							elsif ($line =~ /^##sequence.region/i) {
531							# sequence region pragma
532	0					0	my ($pragma, $seq_id, $start, $stop) = split /\s+/, $line;
533	0	0	0			0	if (defined $seq_id and $start =~ /^\d+$/ and $stop =~ /^\d+$/) {
			0
534							# we're actually only concerned with the stop coordinate
535	0					0	$self->{seq_ids}{$seq_id} = $stop;
536							}
537							else {
538	0					0	warn "malformed sequence-region pragma! $line\n";
539							}
540	0					0	next;
541							}
542							else {
543							# must be some sort of pragma or a comment line, may be useful, keep it
544	25					26	push @{$self->{comments}}, $line;
	25					44
545	25					290	next;
546							}
547							}
548							elsif ($firstchar eq '>') {
549							# fasta header line
550							# this is almost always at the end of the file, and rarely is sequence put
551							# into GFF files anyway, so let's assume it's the end of the file
552	0					0	$self->{'eof'} = 1;
553	0					0	return;
554							}
555
556							# line must be a GFF feature
557	745					870	chomp $line;
558	745					2860	my @fields = split('\t', $line);
559	745	50				1251	next unless scalar(@fields) == 9;
560
561							# check the primary_tag and generate the SeqFeature object for known types
562	745					1115	my $type = lc $fields[2];
563	745	100				1899	if ($type eq 'cds') {
		100
		50
		100
		100
		100
		100
564	265	100				474	if ($self->do_cds) {
565	199					249	return &{$self->{convertor_sub}}($self, \@fields);
	199					320
566							} else {
567	66					868	next;
568							}
569							}
570							elsif ($type eq 'exon') {
571	178	50				327	if ($self->do_exon) {
572	178					206	return &{$self->{convertor_sub}}($self, \@fields);
	178					237
573							} else {
574	0					0	next;
575							}
576							}
577							elsif ($type =~ /utr\|untranslated/) {
578	0	0				0	if ($self->do_utr) {
579	0					0	return &{$self->{convertor_sub}}($self, \@fields);
	0					0
580							} else {
581	0					0	next;
582							}
583							}
584							elsif ($type =~ /codon/) {
585	32	50				69	if ($self->do_codon) {
586	0					0	return &{$self->{convertor_sub}}($self, \@fields);
	0					0
587							} else {
588	32					438	next;
589							}
590							}
591							elsif ($type =~ /gene$/) {
592	165	50				303	if ($self->do_gene) {
593	165					219	return &{$self->{convertor_sub}}($self, \@fields);
	165					235
594							} else {
595	0					0	next;
596							}
597							}
598							elsif ($type =~ /transcript\|rna/) {
599	36					53	return &{$self->{convertor_sub}}($self, \@fields);
	36					79
600							}
601							elsif ($type =~ /chromosome\|contig\|scaffold/) {
602							# gff3 files can record the chromosome as a gff record
603							# process this as a region
604	7					20	$self->{seq_ids}{$fields[0]} = $fields[4];
605	7					95	next;
606							}
607							else {
608							# everything else must be some non-standard weird element
609							# only process this if the user wants everything
610	62	100				105	return &{$self->{convertor_sub}}($self, \@fields) unless $self->simplify;
	2					6
611							}
612							}
613
614							# presumably reached the end of the file
615	7					245	$self->{'eof'} = 1;
616	7					21	return;
617							}
618
619							sub next_top_feature {
620	70			70	1	87	my $self = shift;
621							# check that we have an open filehandle
622	70	50				89	unless ($self->fh) {
623	0					0	croak("no GFF3 file loaded to parse!");
624							}
625	70	50				101	unless ($self->{'eof'}) {
626	0	0				0	$self->parse_file or croak "unable to parse file!";
627							}
628	70					91	return shift @{ $self->{top_features} };
	70					144
629							}
630
631							sub top_features {
632	4			4	1	4328	my $self = shift;
633	4	50				74	unless ($self->{'eof'}) {
634	0					0	$self->parse_file;
635							}
636	4					10	my @features = @{ $self->{top_features} };
	4					17
637	4	50				19	return wantarray ? @features : \@features;
638							}
639
640							*parse_table = \&parse_file;
641
642							sub parse_file {
643	7			7	1	23	my $self = shift;
644							# check that we have an open filehandle
645	7	50				18	unless ($self->fh) {
646	0					0	confess("no file loaded to parse!");
647							}
648	7	50				20	return 1 if $self->{'eof'};
649
650							# Each line will be processed into a SeqFeature object, and then checked
651							# for parentage. Child objects with a Parent tag will be appropriately
652							# associated with its parent, or put into an orphanage. Any orphans
653							# left will be checked a final time for parents; if a parent can't be
654							# found, it will be lost. Features without a parent are assumed to be
655							# top-level features.
656
657	7	50				14	printf " Parsing %s format file....\n",
		100
658							$self->version eq '3' ? 'GFF3' :
659							$self->version =~ /2\../ ? 'GTF' : 'GFF';
660
661
662							TOP_FEATURE_LOOP:
663	7					40	while (my $feature = $self->next_feature) {
664
665							### Process the feature
666							# check the ID
667	576					1059	my $id = $feature->primary_id;
668							# if the seqfeature didn't have an ID specified from the file, then
669							# Bio::ToolBox::SeqFeature will autogenerate one, but Bio::SeqFeature::Lite
670							# will not - so we will likely lose that feature
671	576	100				1778	if ($id) {
672							# remember this feature since we have an ID
673	404	50				609	if (exists $self->{loaded}{$id}) {
674							# this ID should be unique in the GFF file
675							# otherwise it might be a shared duplicate or a malformed GFF file
676							# generally only a concern for top level features
677	0	0				0	if ($feature->primary_tag =~ /gene\|rna\|transcript/) {
678	0					0	$self->{duplicate_ids}{$id} += 1;
679							}
680
681							# store all of the features as an array
682	0	0				0	if (ref($self->{loaded}{$id}) eq 'ARRAY') {
683							# there's more than two duplicates! pile it on!
684	0					0	push @{ $self->{loaded}{$id} }, $feature;
	0					0
685							}
686							else {
687	0					0	my $existing = $self->{loaded}{$id};
688	0					0	$self->{loaded}{$id} = [$existing, $feature];
689							}
690							}
691							else {
692							# unique ID, so remember it
693	404					688	$self->{loaded}{$id} = $feature;
694							}
695							}
696							# if the feature didn't have an ID, we'll just assume it is
697							# a child of another feature, otherwise it may get lost
698
699							# look for parents and children
700	576	100				918	if ($feature->has_tag('Parent')) {
701							# must be a child
702							# there may be more than one parent, per the GFF3 specification
703	411					1288	foreach my $parent_id ( $feature->get_tag_values('Parent') ) {
704	411	50				1821	if (exists $self->{loaded}{$parent_id}) {
705							# we've seen this id
706							# associate the child with the parent
707	411					467	my $parent = $self->{loaded}{$parent_id};
708	411					802	$parent->add_SeqFeature($feature);
709
710							# check boundaries for gtf genes
711							# gtf genes may not be explicitly defined so must correct as necessary
712							# gff3 files shouldn't have this issue
713	411	100				12852	if ($self->{gtf}) {
714	312	50				435	if ($feature->start < $parent->start) {
715	0					0	$parent->start( $feature->start );
716							}
717	312	100				1864	if ($feature->end > $parent->end) {
718	1					2	$parent->end( $feature->end );
719							}
720							# check parent's parent too
721	312	100				1824	if ($parent->has_tag('Parent')) {
722							# in all likelihood parent is a transcript and there is a
723							# gene that probably also needs fixin'
724	278					858	my ($grandparent_id) = $parent->get_tag_values('Parent');
725	278					1155	my $grandparent = $self->{loaded}{$grandparent_id};
726	278	50				386	if ($feature->start < $grandparent->start) {
727	0					0	$grandparent->start( $feature->start );
728							}
729	278	50				1620	if ($feature->end > $grandparent->end) {
730	0					0	$grandparent->end( $feature->end );
731							}
732							}
733							}
734							}
735							else {
736							# can't find the parent, maybe not loaded yet?
737							# put 'em in the orphanage
738	0					0	$self->_add_orphan($feature);
739							}
740							}
741							}
742							else {
743							# must be a parent
744	165					285	push @{ $self->{top_features} }, $feature;
	165					415
745							}
746							}
747							# Finished loading the GFF lines
748
749							# check for orphans
750	7	50				16	if (scalar @{ $self->{orphans} }) {
	7					27
751	0					0	$self->check_orphanage;
752							# report
753	0	0				0	if (scalar @{ $self->{orphans} }) {
	0					0
754							printf " GFF errors: %d features could not be associated with reported parents!\n",
755	0					0	scalar @{ $self->{orphans} };
	0					0
756							}
757							}
758
759							# report on duplicate IDs
760	7	50				11	if (keys %{ $self->{duplicate_ids} }) {
	7					964
761							printf " GFF errors: %d IDs were duplicated\n",
762	0					0	scalar(keys %{ $self->{duplicate_ids} });
	0					0
763							}
764
765	7					37	return 1;
766							}
767
768
769							sub _make_gene_parent {
770							# for generating GTF gene parent features
771	12			12		17	my ($self, $fields, $gene_id) = @_;
772	12					43	my $gene = $SFCLASS->new(
773							-seq_id => $fields->[0],
774							-primary_tag => 'gene',
775							-start => $fields->[3],
776							-end => $fields->[4],
777							-strand => $fields->[6],
778							-primary_id => $gene_id,
779							);
780
781	12	50				262	if ($fields->[8] =~ /gene_name "([^"]+)";?/) {
782	12					28	$gene->display_name($1);
783							}
784							else {
785	0					0	$gene->display_name($gene_id);
786							}
787
788							# add extra information if possible
789	12	100				50	unless ($self->simplify) {
790	2	50				13	if ($fields->[8] =~ /gene_biotype "([^"]+)";?/) {
		50
791	0					0	$gene->add_tag_value('gene_biotype', $1);
792							}
793							elsif ($fields->[8] =~ /gene_type "([^"]+)";?/) {
794	0					0	$gene->add_tag_value('gene_type', $1);
795							}
796	2	50				8	if ($fields->[8] =~ /gene_source "([^"]+)";?/) {
797	0					0	$gene->source($1);
798							}
799							}
800	12					22	return $gene;
801							}
802
803
804							sub _make_rna_parent {
805							# for generating GTF gene parent features
806	0			0		0	my ($self, $fields, $transcript_id) = @_;
807	0					0	my $rna = $SFCLASS->new(
808							-seq_id => $fields->[0],
809							-primary_tag => 'transcript',
810							-start => $fields->[3],
811							-end => $fields->[4],
812							-strand => $fields->[6],
813							-primary_id => $transcript_id,
814							);
815
816	0	0				0	if ($fields->[8] =~ /transcript_name "([^"]+)";?/) {
817	0					0	$rna->display_name($1);
818							}
819							else {
820	0					0	$rna->display_name($transcript_id);
821							}
822
823							# add extra information if possible
824	0	0				0	unless ($self->simplify) {
825	0	0				0	if ($fields->[8] =~ /transcript_biotype "([^"]+)";?/) {
		0
826	0					0	$rna->add_tag_value('transcript_biotype', $1);
827							}
828							elsif ($fields->[8] =~ /transcript_type "([^"]+)";?/) {
829	0					0	$rna->add_tag_value('transcript_type', $1);
830							}
831	0	0				0	if ($fields->[8] =~ /transcript_source "([^"]+)";?/) {
832	0					0	$rna->source($1);
833							}
834							}
835	0					0	return $rna;
836							}
837
838
839							sub find_gene {
840	37			37	1	11305	my $self = shift;
841
842							# check that we have gene2seqf table
843	37	100				63	unless (exists $self->{gene2seqf}) {
844	5	50				16	croak "must parse file first!" unless $self->{'eof'};
845	5					14	$self->{gene2seqf} = {};
846	5					7	foreach (@{ $self->{top_features} }) {
	5					14
847	107					142	my $name = lc $_->display_name;
848	107	50				353	if (exists $self->{gene2seqf}->{$name}) {
849	0					0	push @{ $self->{gene2seqf}->{$name} }, $_;
	0					0
850							}
851							else {
852	107					210	$self->{gene2seqf}->{$name} = [$_];
853							}
854							}
855							}
856
857							# get the name and coordinates from arguments
858	37					45	my ($name, $id, $chrom, $start, $end, $strand);
859	37	50				79	if (scalar @_ == 0) {
		100
860	0					0	carp "must provide information to find_gene method!";
861	0					0	return;
862							}
863							elsif (scalar @_ == 1) {
864	4					11	$name = $_[0];
865							}
866							else {
867	33					54	my %opt = @_;
868	33		0			51	$name = $opt{name} \|\| $opt{display_name} \|\| undef;
869	33		0			51	$id = $opt{id} \|\| $opt{primary_id} \|\| undef;
870	33		50			107	$chrom = $opt{chrom} \|\| $opt{seq_id} \|\| undef;
871	33		50			59	$start = $opt{start} \|\| undef;
872	33		50			81	$end = $opt{stop} \|\| $opt{end} \|\| undef;
873	33		50			63	$strand = $opt{strand} \|\| 0;
874							}
875	37	50				51	unless ($name) {
876	0					0	carp "name is required for find_gene!";
877	0					0	return;
878							}
879
880							# check if a gene with this name exists
881	37	50				88	if (exists $self->{gene2seqf}->{lc $name} ) {
882							# we found a matching gene
883
884							# pull out the gene seqfeature(s) array reference
885							# there may be more than one gene
886	37					52	my $genes = $self->{gene2seqf}->{ lc $name };
887
888							# go through a series of checks to find the appropriate
889	37	100				50	if ($id) {
890	33					51	foreach my $g (@$genes) {
891	33	50				83	if ($g->primary_id eq $id) {
892	33					55	return $g;
893							}
894							}
895	0					0	return; # none of these matched despite having an ID
896							}
897	4	0	33			11	if ($chrom and $start and $end) {
			33
898	0					0	foreach my $g (@$genes) {
899	0	0	0			0	if (
			0
900							# overlap method borrowed from Bio::RangeI
901							($g->strand == $strand) and not (
902							$g->start > $end or
903							$g->end < $start
904							)
905							) {
906							# gene and transcript overlap on the same strand
907							# we found the intersecting gene
908	0					0	return $g;
909							}
910							}
911	0					0	return; # none of these matched despite having coordinate info
912							}
913	4	50				10	if (scalar @$genes == 1) {
		0
914							# going on trust here that this is the one
915	4					10	return $genes->[0];
916							}
917							elsif (scalar @$genes > 1) {
918	0					0	carp "more than one gene named $name found!";
919	0					0	return $genes->[0];
920							}
921
922							# nothing suitable found
923	0					0	return;
924							}
925							}
926
927
928							sub from_gff_string {
929	0			0	1	0	my ($self, $string) = @_;
930
931							# check string
932	0					0	chomp $string;
933	0	0				0	unless ($string) {
934	0					0	cluck("must pass a string!\n");
935	0					0	return;
936							}
937	0					0	my @fields = split('\t', $string);
938
939							# check convertor
940	0	0				0	unless (defined $self->{convertor_sub}) {
941	0					0	$self->_assign_gff_converter;
942							}
943
944							# parse appropriately
945	0					0	return &{$self->{convertor_sub}}($self, \@fields);
	0					0
946							}
947
948
949							sub _assign_gff_converter {
950	11			11		18	my $self = shift;
951	11	100				31	if ($self->{gff3}) {
		50
952	7					21	$self->{convertor_sub} = \&_gff3_to_seqf;
953							}
954							elsif ($self->{gtf}) {
955	4	100				9	if ($self->simplify) {
956	2					6	$self->{convertor_sub} = \&_gtf_to_seqf_simple;
957							}
958							else {
959	2					6	$self->{convertor_sub} = \&_gtf_to_seqf_full;
960							}
961							# double check we have transcript information
962	4	50				8	if ($self->typelist !~ /transcript\|rna/i) {
963							# we will have to rely on exon and/or cds information to get transcript
964	0	0	0			0	unless ($self->do_exon or $self->do_cds) {
965	0					0	$self->do_exon(1);
966							}
967							}
968	4	50	33			11	if ($self->do_gene and $self->typelist !~ /gene/i) {
969	4					8	$self->do_exon(1);
970							}
971							}
972							else {
973	0					0	$self->{convertor_sub} = \&_gff2_to_seqf;
974							}
975							}
976
977
978							sub _gff3_to_seqf {
979	266			266		330	my ($self, $fields) = @_;
980	266					313	my $group = $fields->[8];
981	266					369	my $feature = $self->_gff_to_seqf($fields);
982
983							# process groups
984	266					1575	foreach my $g (split(/\s;\s/, $group)) {
985	1382					3306	my ($tag, $value) = split /=/, $g;
986	1382					1836	$tag = $self->unescape($tag);
987	1382					2012	my @values = map { $self->unescape($_) } split(/,/, $value);
	1470					1696
988
989							# determine the appropriate attribute based on tag name
990	1382	100				3618	if ($tag eq 'Name') {
		100
		100
		50
		100
991	266					489	$feature->display_name($values[0]);
992							}
993							elsif ($tag eq 'ID') {
994	167					330	$feature->primary_id($values[0]);
995							}
996							elsif ($tag eq 'Parent') {
997							# always record Parent except for transcripts when genes are not wanted
998	99	50	33			154	unless (not $self->do_gene and $feature->primary_tag =~ /rna\|transcript/i) {
999	99					121	foreach (@values) {
1000	99					177	$feature->add_tag_value($tag, $_);
1001							}
1002							}
1003							}
1004							elsif (lc $tag eq 'exon_id') {
1005							# ensembl GFF3 store the exon id but doesn't record it as the ID, why?
1006	0					0	$feature->primary_id($values[0]);
1007							}
1008							elsif (not $self->{simplify}) {
1009	2					4	foreach (@values) {
1010	2					8	$feature->add_tag_value($tag, $_);
1011							}
1012							}
1013							}
1014
1015	266					965	return $feature;
1016							}
1017
1018
1019							sub _gtf_to_seqf_simple {
1020
1021	312			312		431	my ($self, $fields) = @_;
1022	312					388	my $group = $fields->[8];
1023	312					451	my $feature = $self->_gff_to_seqf($fields);
1024
1025							# extract essential tags
1026	312					345	my ($gene_id, $transcript_id);
1027	312	50				1104	if ($group =~ /gene_id "([^"]+)";?/i) {
1028	312					560	$gene_id = $1;
1029							}
1030	312	50				793	if ($group =~ /transcript_id "([^"]+)";?/i) {
1031	312					441	$transcript_id = $1;
1032							}
1033	312	50	33			462	unless ($gene_id or $transcript_id) {
1034							# improperly formatted GTF file without one of these two items, nothing more to do
1035	0					0	return $feature;
1036							}
1037
1038							# common subfeatures including exon, CDS, UTR, and codons
1039	312	100				1013	if ($fields->[2] =~ /cds\|exon\|utr\|codon\|untranslated/i) {
		50
		0
1040	278					661	$feature->add_tag_value('Parent', $transcript_id);
1041
1042							# exon id if present
1043	278	50	66			1489	if ($fields->[2] eq 'exon' and $group =~ /exon_id "([^"]+)";?/i) {
1044	0					0	$feature->primary_id($1);
1045							}
1046
1047							# check gene parent
1048	278	50	33			463	if ($self->do_gene and not exists $self->{loaded}{$gene_id}) {
1049	0					0	my $gene = $self->_make_gene_parent($fields, $gene_id);
1050	0					0	$self->{loaded}{$gene_id} = $gene;
1051	0					0	push @{ $self->{top_features} }, $gene;
	0					0
1052							}
1053
1054							# check transcript parent
1055	278	50				455	unless (exists $self->{loaded}{$transcript_id}) {
1056	0					0	my $rna = $self->_make_rna_parent($fields, $transcript_id);
1057	0					0	$self->{loaded}{$transcript_id} = $rna;
1058	0	0				0	if ($self->do_gene) {
1059	0					0	$rna->add_tag_value('Parent', $gene_id);
1060	0					0	$self->{loaded}{$gene_id}->add_SeqFeature($rna);
1061							}
1062							else {
1063	0					0	push @{ $self->{top_features} }, $rna;
	0					0
1064							}
1065							}
1066							}
1067
1068							# a transcript feature
1069							elsif ($fields->[2] =~ /rna\|transcript/i) {
1070							# these are sometimes present in GTF files, such as from Ensembl
1071							# but are not required and often absent
1072	34					79	$feature->primary_id($transcript_id);
1073
1074							# transcript information
1075	34	50				192	if ($group =~ /transcript_name "([^"]+)";?/i) {
1076	34					69	$feature->display_name($1);
1077							}
1078
1079							# check whether parent was loaded and add gene information if not
1080	34	50				158	if ($self->do_gene) {
1081	34					77	$feature->add_tag_value('Parent', $gene_id);
1082	34	100				155	if (not exists $self->{loaded}{$gene_id}) {
1083	10					24	my $gene = $self->_make_gene_parent($fields, $gene_id);
1084	10					18	$self->{loaded}{$gene_id} = $gene;
1085	10					14	push @{ $self->{top_features} }, $gene;
	10					18
1086							}
1087							}
1088							}
1089
1090							# a gene feature
1091							elsif ($fields->[2] eq 'gene') {
1092							# these are sometimes present in GTF files, such as from Ensembl
1093							# but are not required and often absent
1094	0					0	$feature->primary_id($gene_id);
1095	0	0				0	if ($group =~ /gene_name "([^"]+)";?/i) {
1096	0					0	$feature->display_name($1);
1097							}
1098							}
1099
1100							# anything else, like CNS (conserved noncoding sequence) doesn't get any
1101							# further special attributes, as far as I can tell
1102	312					1394	return $feature;
1103							}
1104
1105
1106							sub _gtf_to_seqf_full {
1107
1108	2			2		6	my ($self, $fields) = @_;
1109	2					4	my $group = $fields->[8];
1110	2					5	my $feature = $self->_gff_to_seqf($fields);
1111
1112							# process the group tags
1113	2					3	my %attributes;
1114	2					9	foreach my $g (split('; ', $group)) { # supposed to be "; " as delimiter
1115	10					19	my ($tag, $value, @bits) = split ' ', $g;
1116	10					25	$value =~ s/[";]//g; # remove the flanking double quotes, assume no internal quotes
1117	10					20	$attributes{$tag} = $value;
1118							}
1119
1120							# assign special tags based on the feature type
1121	2	50				16	if ($fields->[2] =~ /cds\|exon\|utr\|codon\|untranslated/i) {
		50
		0
1122	0					0	$feature->add_tag_value('Parent', $attributes{'transcript_id'});
1123
1124							# exon id if present
1125	0	0	0			0	if ($fields->[2] eq 'exon' and exists $attributes{'exon_id'}) {
1126	0					0	$feature->primary_id($attributes{'exon_id'});
1127							}
1128
1129							# check gene parent
1130	0	0				0	if ($self->do_gene) {
1131	0		0			0	my $gene_id = $attributes{gene_id} \|\| undef;
1132	0	0	0			0	if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1133	0					0	my $gene = $self->_make_gene_parent($fields, $gene_id);
1134	0					0	$self->{loaded}{$gene_id} = $gene;
1135	0					0	push @{ $self->{top_features} }, $gene;
	0					0
1136							}
1137							}
1138
1139							# check transcript parent
1140	0		0			0	my $transcript_id = $attributes{transcript_id} \|\| undef;
1141	0	0	0			0	if ($transcript_id and not exists $self->{loaded}{$transcript_id}) {
1142	0					0	my $rna = $self->_make_rna_parent($fields, $transcript_id);
1143	0					0	$self->{loaded}{$transcript_id} = $rna;
1144	0	0				0	if ($self->do_gene) {
1145	0					0	$rna->add_tag_value('Parent', $attributes{gene_id});
1146	0					0	$self->{loaded}{ $attributes{gene_id} }->add_SeqFeature($rna);
1147							}
1148							else {
1149	0					0	push @{ $self->{top_features} }, $rna;
	0					0
1150							}
1151							}
1152							}
1153
1154							# transcript
1155							elsif ($fields->[2] =~ /transcript\|rna/i) {
1156							# these are sometimes present in GTF files, such as from Ensembl
1157
1158							# transcript information
1159	2					10	$feature->primary_id($attributes{transcript_id});
1160	2					10	delete $attributes{transcript_id};
1161	2	50				5	if (exists $attributes{transcript_name}) {
1162	2					7	$feature->display_name($attributes{transcript_name});
1163	2					10	delete $attributes{transcript_name};
1164							}
1165
1166							# check gene parent
1167	2	50				5	if ($self->do_gene) {
1168	2		50			5	my $gene_id = $attributes{gene_id} \|\| undef;
1169	2	50	33			9	if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1170	2					8	my $gene = $self->_make_gene_parent($fields, $gene_id);
1171	2					4	$self->{loaded}{$gene_id} = $gene;
1172	2					4	push @{ $self->{top_features} }, $gene;
	2					6
1173							}
1174	2					8	$feature->add_tag_value('Parent', $gene_id);
1175							}
1176							}
1177
1178							# gene
1179							elsif ($fields->[2] eq 'gene') {
1180							# these are sometimes present in GTF files, such as from Ensembl
1181							# but are not required and often absent
1182	0					0	$feature->primary_id($attributes{gene_id});
1183	0					0	delete $attributes{gene_id};
1184	0	0				0	if (exists $attributes{gene_name}) {
1185	0					0	$feature->display_name($attributes{gene_name});
1186	0					0	delete $attributes{gene_name};
1187							}
1188							}
1189
1190							# store remaining attributes, if any
1191	2					14	foreach my $key (keys %attributes) {
1192	6					19	$feature->add_tag_value($key, $attributes{$key});
1193							}
1194	2					14	return $feature;
1195							}
1196
1197
1198							sub _gff2_to_seqf {
1199							# generic gff1 or gff2 format or poorly defined gff3 or gtf file
1200							# hope for the best!
1201	0			0		0	my ($self, $fields) = @_;
1202	0					0	my $feature = $self->_gff_to_seqf($fields);
1203
1204							# process groups
1205							# we have no uniform method of combining features, so we'll leave the tags
1206							# as is and hope for the best
1207	0					0	foreach my $g (split(/\s;\s/, $fields->[8])) {
1208	0					0	my ($tag, $value) = split /\s+/, $g;
1209	0	0	0			0	next unless ($tag and $value);
1210	0					0	$feature->add_tag_value($tag, $value);
1211							}
1212	0					0	return $feature;
1213							}
1214
1215
1216							sub _gff_to_seqf {
1217	580			580		709	my ($self, $fields) = @_;
1218
1219							# generate the basic SeqFeature
1220	580					2006	my $feature = $SFCLASS->new(
1221							-seq_id => $fields->[0],
1222							-source => $fields->[1],
1223							-primary_tag => $fields->[2],
1224							-start => $fields->[3],
1225							-end => $fields->[4],
1226							-strand => $fields->[6],
1227							);
1228
1229							# add more attributes if they're not null
1230	580	50				10084	if ($fields->[5] ne '.') {
1231	0					0	$feature->score($fields->[5]);
1232							}
1233	580	100				895	if ($fields->[7] ne '.') {
1234	199					352	$feature->phase($fields->[7]);
1235							}
1236
1237							# finished
1238	580					979	return $feature;
1239							}
1240
1241
1242							sub unescape {
1243							# Borrowed unashamedly from bioperl Bio::Tools::GFF
1244							# which in turn was borrowed from Bio::DB::GFF
1245	2852			2852	1	2624	my $self = shift;
1246	2852					2644	my $v = shift;
1247	2852					3101	$v =~ tr/+/ /;
1248	2852					3636	$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
	5574					10458
1249	2852					4421	return $v;
1250							}
1251
1252
1253							sub _add_orphan {
1254	0			0			my ($self, $feature) = @_;
1255	0						push @{ $self->{orphans} }, $feature;
	0
1256	0						return 1;
1257							}
1258
1259
1260							sub check_orphanage {
1261	0			0	0		my $self = shift;
1262	0	0					return unless scalar @{ $self->{orphans} };
	0
1263
1264							# go through the list of orphans
1265	0						my @reunited; # list of indices to delete after reuniting orphan with parent
1266	0						for (my $i = 0; $i < scalar @{ $self->{orphans} }; $i++) {
	0
1267	0						my $orphan = $self->{orphans}->[$i];
1268	0						my $success = 0;
1269
1270							# find the parent
1271	0						foreach my $parent ($orphan->get_tag_values('Parent') ) {
1272	0	0					if (exists $self->{loaded}{$parent}) {
1273							# we have loaded the parent
1274							# associate each orphan feature with the parent
1275	0						$self->{loaded}{$parent}->add_SeqFeature($orphan);
1276	0						$success++;
1277							}
1278							}
1279							# delete the orphan from the array if it found it's long lost parent
1280	0	0					push @reunited, $i if $success;
1281							}
1282
1283							# clean up the orphanage
1284	0						while (@reunited) {
1285	0						my $i = pop @reunited;
1286	0						splice(@{ $self->{orphans} }, $i, 1);
	0
1287							}
1288							}
1289
1290							sub orphans {
1291	0			0	1		my $self = shift;
1292	0						my @orphans;
1293	0						foreach (@{ $self->{orphans} }) {
	0
1294	0						push @orphans, $_;
1295							}
1296	0	0					return wantarray ? @orphans : \@orphans;
1297							}
1298
1299
1300							sub comments {
1301	0			0	1		my $self = shift;
1302	0						my @comments;
1303	0						foreach (@{ $self->{comments} }) {
	0
1304	0						push @comments, $_;
1305							}
1306	0	0					return wantarray ? @comments : \@comments;
1307							}
1308
1309
1310							sub seq_ids {
1311	0			0	1		my $self = shift;
1312	0	0					unless (scalar keys %{$self->{seq_ids}}) {
	0
1313	0						$self->_get_seq_ids;
1314							}
1315	0						my @s = keys %{$self->{seq_ids}};
	0
1316	0	0					return wantarray ? @s : \@s;
1317							}
1318
1319
1320							sub seq_id_lengths {
1321	0			0	1		my $self = shift;
1322	0	0					unless (scalar keys %{$self->{seq_ids}}) {
	0
1323	0						$self->_get_seq_ids;
1324							}
1325	0						return $self->{seq_ids};
1326							}
1327
1328							sub _get_seq_ids {
1329	0			0			my $self = shift;
1330	0	0					return unless $self->{'eof'};
1331	0						foreach (@{ $self->{top_features} }) {
	0
1332	0						my $s = $_->seq_id;
1333	0	0					unless (exists $self->{seq_ids}{$s}) {
1334	0						$self->{seq_ids}{$s} = 1;
1335							}
1336	0	0					$self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
1337							}
1338							}
1339
1340							__END__