File Coverage

blib/lib/Bio/ToolBox/parser/gff.pm
Criterion Covered Total %
statement 278 488 56.9
branch 149 306 48.6
condition 17 81 20.9
subroutine 25 38 65.7
pod 22 25 88.0
total 491 938 52.3


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::gff;
2              
3             our $VERSION = '1.66';
4              
5             =head1 NAME
6              
7             Bio::ToolBox::parser::gff - parse GFF3, GTF, and GFF files
8              
9             =head1 SYNOPSIS
10              
11             use Bio::ToolBox::parser::gff;
12             my $filename = 'file.gff3';
13            
14             my $parser = Bio::ToolBox::parser::gff->new(
15             file => $filename,
16             do_gene => 1,
17             do_exon => 1,
18             ) or die "unable to open gff file!\n";
19            
20             while (my $feature = $parser->next_top_feature() ) {
21             # each $feature is a SeqFeature object
22             my @children = $feature->get_SeqFeatures();
23             }
24              
25             =head1 DESCRIPTION
26              
27             This module parses a GFF file into SeqFeature objects. It natively
28             handles GFF3, GTF, and general GFF files.
29              
30             For both GFF3 and GTF files, fully nested gene models, typically
31             gene =E transcript =E (exon, CDS, etc), may be built using the appropriate
32             attribute tags. For GFF3 files, these include ID and Parent tags; for GTF
33             these include C and C tags.
34              
35             For GFF3 files, any feature without a C tag is assumed to be a
36             parent. Children features referencing a parent feature that has not been
37             loaded are considered orphans. Orphans are attempted to be re-associated
38             with missing parents after the file is completely parsed. Any orphans left
39             may be collected. Files with orphans are considered poorly formatted or
40             incomplete and should be fixed. Multiple parentage, for example exons
41             shared between different transcripts of the same gene, are fully supported.
42              
43             Embedded Fasta sequences are ignored, as are most comment and pragma lines.
44              
45             The SeqFeature objects that are returned are L
46             objects. Refer to that documentation for more information.
47              
48             =head1 METHODS
49              
50             =head2 Initialize and modify the parser.
51              
52             These are class methods to initialize the parser with an annotation file
53             and modify the parsing behavior. Most parameters can be set either upon
54             initialization or as class methods on the object. Unpredictable behavior
55             may occur if you implement these in the midst of parsing a file.
56              
57             Do not open subsequent files with the same object. Always create a new
58             object to parse a new file
59              
60             =over 4
61              
62             =item new
63              
64             my $parser = Bio::ToolBox::parser::gff->new($filename);
65             my $parser = Bio::ToolBox::parser::gff->new(
66             file => 'file.gtf.gz',
67             do_gene => 1,
68             do_utr => 1,
69             );
70              
71             Initialize a new gff parser object. Pass a single value (a GFF file name)
72             to open a file. Alternatively, pass an array of key value pairs to control
73             how the file is parsed. Options include the following.
74              
75             =over 4
76              
77             =item file
78              
79             Provide a GFF file name to be parsed. It should have a gff, gtf, or gff3 file
80             extension. The file may be gzip compressed.
81              
82             =item version
83              
84             Specify the version. Normally this is not needed, as version can be determined
85             either from the file extension (in the case of gtf and gff3) or from the
86             C<##gff-version> pragma at the top of the file. Acceptable values include 1, 2,
87             2.5 (gtf), or 3.
88              
89             =item class
90              
91             Pass the name of a L compliant class that will be used to
92             create the SeqFeature objects. The default is to use L.
93              
94             =item simplify
95              
96             Pass a boolean value to simplify the SeqFeature objects parsed from the GFF
97             file and ignore extraneous attributes.
98              
99             =item do_gene
100              
101             Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
102             name under a single gene object. Default is true.
103              
104             =item do_cds
105              
106             =item do_exon
107              
108             =item do_utr
109              
110             =item do_codon
111              
112             Pass a boolean (1 or 0) value to parse certain subfeatures. Exon subfeatures
113             are always parsed, but C, C, C, C,
114             and C features may be optionally parsed. Default is false.
115              
116             =back
117              
118             =item open_file
119              
120             $parser->open_file($file) or die "unable to open $file!";
121              
122             Pass the name of a GFF file to be parsed. The file may optionally be gzipped
123             (F<.gz> extension). Do not open a new file when one has already opened a file.
124             Create a new object for a new file, or concatenate the GFF files.
125              
126             =item version
127              
128             Set or get the GFF version of the current file. Acceptable values include 1, 2,
129             2.5 (gtf), or 3. Normally this is determined by file extension or
130             C pragma on the first line, and should not need to be set by the
131             user in most circumstances.
132              
133             =item simplify
134              
135             Pass a boolean true value to simplify the attributes of GFF3 and GTF files
136             that may have considerable numbers of tags, e.g. Ensembl files. Only
137             essential information, including name, ID, and parentage, is retained.
138             Useful if you're trying to quickly parse annotation files for basic
139             information.
140              
141             =back
142              
143             =head2 Feature retrieval
144              
145             The following methods parse the GFF file lines into SeqFeature objects.
146             It is best if these methods are not mixed; unexpected results may occur.
147              
148             =over 4
149              
150             =item next_top_feature
151              
152             This method will return a top level parent SeqFeature object
153             assembled with child features as sub-features. For example, a gene
154             object with mRNA subfeatures, which in turn may have exon and/or CDS
155             subfeatures. Child features are assembled based on the existence of
156             proper Parent attributes in child features. If no Parent attributes are
157             included in the GFF file, then this will behave as L.
158              
159             Child features (those containing a C attribute)
160             are associated with the parent feature. A warning will be issued about lost
161             children (orphans). Shared subfeatures, for example exons common to
162             multiple transcripts, are associated properly with each parent. An opportunity
163             to rescue orphans is available using the L method.
164              
165             Note that subfeatures may not necessarily be in ascending genomic order
166             when associated with the feature, depending on their order in the GFF3
167             file and whether shared subfeatures are present or not. When calling
168             subfeatures in your program, you may want to sort the subfeatures. For
169             example
170              
171             my @subfeatures = map { $_->[0] }
172             sort { $a->[1] <=> $b->[1] }
173             map { [$_, $_->start] }
174             $parent->get_SeqFeatures;
175              
176             =item top_features
177              
178             This method will return an array of the top (parent) features defined in
179             the GFF file. This is similar to the L method except that
180             all features are returned at once.
181              
182             =item next_feature
183              
184             This method will return a SeqFeature object representation of
185             the next feature (line) in the file. Parent - child relationships are
186             NOT assembled; however, undefined parents in a GTF file may still be
187             generated, just not returned.
188              
189             This method is best used with simple GFF files with no hierarchies
190             present. This may be used in a while loop until the end of the file
191             is reached. Pragmas are ignored and comment lines and sequence are
192             automatically skipped.
193              
194             =back
195              
196             =head2 Other methods
197              
198             Additional methods for working with the parser object and the parsed
199             SeqFeature objects.
200              
201             =over 4
202              
203             =item fh
204              
205             This method returns the L object of the opened GFF file.
206              
207             =item parse_file
208              
209             Parses the entire file into memory. This is automatically called when
210             either L or L is called.
211              
212             =item find_gene
213              
214             my $gene = $parser->find_gene(
215             name => $display_name,
216             id => $primary_id,
217             ) or warn "gene $display_name can not be found!";
218              
219             Pass a gene name, or an array of key =E values (C, C,
220             C, C, and/or coordinate information), that can be used
221             to find a gene already loaded into memory. Only useful after
222             is called. Genes with a matching name are confirmed by a matching ID or
223             overlapping coordinates, if available. Otherwise the first match is returned.
224              
225             =item orphans
226              
227             my @orphans = $parser->orphans;
228             printf "we have %d orphans left over!", scalar @orpans;
229              
230             This method will return an array of orphan SeqFeature objects that indicated
231             they had a parent but said parent could not be found. Typically, this is an
232             indication of an incomplete or malformed GFF3 file. Nevertheless, it might
233             be a good idea to check this after retrieving all top features.
234              
235             =item comments
236              
237             This method will return an array of the comment or pragma lines that may have
238             been in the parsed file. These may or may not be useful.
239              
240             =item typelist
241              
242             Returns a comma-delimited string of the GFF primary tags (column 3)
243             observed in the first 1000 lines of an opened file. Useful for
244             checking what is in the GFF file. See
245             L.
246              
247             =item from_gff_string
248              
249             my $seqfeature = $parser->from_gff_string($string);
250              
251             This method will parse a single GFF, GTF, or GFF3 formatted string or line
252             of text and return a SeqFeature object.
253              
254             =item unescape
255              
256             This method will unescape special characters in a text string. Certain
257             characters, including ";" and "=", are reserved for GFF3 formatting and
258             are not allowed, thus requiring them to be escaped.
259              
260             =item seq_ids
261              
262             Returns an array or array reference of the names of the chromosomes or
263             reference sequences present in the file. These may be defined by GFF3
264             sequence-region pragmas or inferred from the features.
265              
266             =item seq_id_lengths
267              
268             my $seq2len = $parser->seq_id_lengths;
269             foreach (keys %$seq2len) {
270             printf "chromosome %s is %d bp long\n", $_, $seq2len->{$_};
271             }
272              
273             Returns a hash reference to the chromosomes or reference sequences and
274             their corresponding lengths. In this case, the length is either defined
275             by the C pragma or inferred by the greatest end position of
276             the top features.
277              
278             =back
279              
280             =head1 SEE ALSO
281              
282             L, L, L
283              
284             =cut
285              
286 1     1   71575 use strict;
  1         2  
  1         25  
287 1     1   4 use Carp qw(carp cluck croak);
  1         2  
  1         47  
288 1     1   443 use Bio::ToolBox::Data::file;
  1         2  
  1         4216  
289             my $SFCLASS = 'Bio::ToolBox::SeqFeature'; # alternative to Bio::SeqFeature::Lite
290             eval "require $SFCLASS" or croak $@;
291              
292             1;
293              
294             sub new {
295 11     11 1 11117 my $class = shift;
296 11         148 my $self = {
297             'fh' => undef,
298             'top_features' => [],
299             'orphans' => [],
300             'duplicate_ids' => {},
301             'loaded' => {},
302             'eof' => 0,
303             'do_gene' => 1,
304             'do_exon' => 0,
305             'do_cds' => 0,
306             'do_utr' => 0,
307             'do_codon' => 0,
308             'gff3' => 0,
309             'gtf' => 0,
310             'comments' => [],
311             'seq_ids' => {},
312             'simplify' => 0,
313             'typelist' => '',
314             'convertor_sub' => undef,
315             };
316 11         29 bless $self, $class;
317            
318             # check for options
319 11 100       35 if (@_) {
320 8 50       25 if (scalar @_ == 1) {
321 0 0       0 $self->open_file($_[0]) or croak "unable to open file!";
322             }
323             else {
324 8         28 my %options = @_;
325 8 100       24 if (exists $options{simplify}) {
326 4         15 $self->simplify( $options{simplify} );
327             }
328 8 100       22 if (exists $options{do_gene}) {
329 4         13 $self->do_gene($options{do_gene});
330             }
331 8 100       18 if (exists $options{do_exon}) {
332 2         10 $self->do_exon($options{do_exon});
333             }
334 8 100       20 if (exists $options{do_cds}) {
335 4         13 $self->do_cds($options{do_cds});
336             }
337 8 50       20 if (exists $options{do_utr}) {
338 0         0 $self->do_utr($options{do_utr});
339             }
340 8 50       20 if (exists $options{do_codon}) {
341 0         0 $self->do_codon($options{do_codon});
342             }
343 8 50       19 if (exists $options{version}) {
344 0         0 $self->version($options{version});
345             }
346 8 50 33     23 if (exists $options{file} or $options{table}) {
347 8   33     16 $options{file} ||= $options{table};
348 8 50       30 $self->open_file( $options{file} ) or
349             croak "unable to open file!";
350             }
351 8 50       22 if (exists $options{class}) {
352 8         16 my $class = $options{class};
353 8 50       526 if (eval "require $class; 1") {
354 8         28 $SFCLASS = $class;
355             }
356             else {
357 0         0 croak $@;
358             }
359             }
360             }
361             }
362            
363             # done
364 11         45 return $self;
365             }
366              
367             sub do_gene {
368 593     593 1 1419 my $self = shift;
369 593 100       919 if (@_) {
370 7         16 $self->{'do_gene'} = shift;
371             }
372 593         1373 return $self->{'do_gene'};
373             }
374              
375             sub do_exon {
376 188     188 1 218 my $self = shift;
377 188 100       321 if (@_) {
378 6         10 $self->{'do_exon'} = shift;
379             }
380 188         339 return $self->{'do_exon'};
381             }
382              
383             sub do_cds {
384 272     272 1 324 my $self = shift;
385 272 100       399 if (@_) {
386 5         9 $self->{'do_cds'} = shift;
387             }
388 272         419 return $self->{'do_cds'};
389             }
390              
391             sub do_utr {
392 0     0 1 0 my $self = shift;
393 0 0       0 if (@_) {
394 0         0 $self->{'do_utr'} = shift;
395             }
396 0         0 return $self->{'do_utr'};
397             }
398              
399             sub do_codon {
400 34     34 1 40 my $self = shift;
401 34 50       77 if (@_) {
402 0         0 $self->{'do_codon'} = shift;
403             }
404 34         62 return $self->{'do_codon'};
405             }
406              
407             sub do_name {
408             # this does nothing other than maintain compatibility with ucsc parser
409 0     0 0 0 return 0;
410             }
411              
412             sub share {
413             # this does nothing other than maintain compatibility with ucsc parser
414 0     0 0 0 return 1;
415             }
416              
417             sub simplify {
418 85     85 1 91 my $self = shift;
419 85 100       139 if (defined $_[0]) {
420 7         12 $self->{simplify} = shift;
421             }
422 85         854 return $self->{simplify};
423             }
424              
425             sub version {
426 24     24 1 1102 my $self = shift;
427 24 100       47 if (@_) {
428 11         21 my $v = shift;
429 11 100 33     37 if ($v eq '3') {
    50 0        
    0          
430 7         22 $self->{version} = $v;
431 7         11 $self->{gff3} = 1;
432             }
433             elsif ($v eq '2.5' or $v eq '2.2') {
434 4         11 $self->{version} = '2.5';
435 4         8 $self->{gtf} = 1;
436             }
437             elsif ($v eq '2' or $v eq '1') {
438 0         0 $self->{version} = $v;
439             }
440             else {
441 0         0 warn "unrecognized GFF version '$v'!\n";
442             }
443             }
444 24         299 return $self->{version};
445             }
446              
447             sub open_file {
448 11     11 1 20 my $self = shift;
449            
450             # check file
451 11         13 my $filename = shift;
452 11 50       18 unless ($filename) {
453 0         0 cluck("no file name passed!\n");
454 0         0 return;
455             }
456            
457             # check type list
458 11         99 my $typelist = Bio::ToolBox::Data::file->sample_gff_type_list($filename);
459 11 50       57 if ($typelist !~ /\w+/) {
460 0         0 print "GFF file has no evident types!? $filename may not be a valid GFF file\n";
461 0         0 return;
462             }
463 11         37 $self->{typelist} = $typelist;
464            
465             # Open filehandle object
466 11 50       84 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
467             croak " cannot open file '$filename'!\n";
468            
469             # check gff version pragma
470 11         220 my $first = $fh->getline;
471 11 50       364 if ($first =~ /^##gff.version\s+([\d\.]+)\s*$/i) {
472             # override any version that may have been inferred from the extension
473             # based on the assumption that this pragma is correct
474 11         41 $self->version($1);
475             }
476             else {
477             # no pragma, reopen the file
478 0         0 $fh->close;
479 0         0 $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
480             # set version based on file type extension????
481 0 0       0 if ($filename =~ /\.gtf.*$/i) {
    0          
482 0         0 $self->version('2.5');
483 0         0 $self->{gtf} = 1;
484             }
485             elsif ($filename =~ /\.gff3.*$/i) {
486 0         0 $self->version('3');
487 0         0 $self->{gff3} = 1;
488             }
489             }
490 11         22 $self->{fh} = $fh;
491 11         36 return 1;
492             }
493              
494             sub fh {
495 668     668 1 1966 return shift->{fh};
496             }
497              
498             sub typelist {
499 13     13 1 66 return shift->{typelist};
500             }
501              
502             sub next_feature {
503 587     587 1 3053 my $self = shift;
504            
505             # check that we have an open filehandle
506 587 50       837 unless ($self->fh) {
507 0         0 croak("no GFF file loaded to parse!");
508             }
509 587 50       868 return if $self->{'eof'};
510            
511             # check convertor
512 587 100       872 unless (defined $self->{convertor_sub}) {
513 11         26 $self->_assign_gff_converter;
514             }
515            
516             # look for the next feature line
517 587         9255 while (my $line = $self->{fh}->getline) {
518            
519             # check first character
520 770         13217 my $firstchar = substr($line, 0, 1);
521            
522             # skip any comment and pragma lines that we might encounter
523 770 100       1392 if ($firstchar eq '#') {
    50          
524 25 50       63 if ($line =~ /^###$/) {
    50          
525             # a close pragma, all we can do is check for orphans
526             # a properly written shouldn't have any orphans, but just in case
527 0         0 $self->check_orphanage;
528 0         0 next;
529             }
530             elsif ($line =~ /^##sequence.region/i) {
531             # sequence region pragma
532 0         0 my ($pragma, $seq_id, $start, $stop) = split /\s+/, $line;
533 0 0 0     0 if (defined $seq_id and $start =~ /^\d+$/ and $stop =~ /^\d+$/) {
      0        
534             # we're actually only concerned with the stop coordinate
535 0         0 $self->{seq_ids}{$seq_id} = $stop;
536             }
537             else {
538 0         0 warn "malformed sequence-region pragma! $line\n";
539             }
540 0         0 next;
541             }
542             else {
543             # must be some sort of pragma or a comment line, may be useful, keep it
544 25         26 push @{$self->{comments}}, $line;
  25         44  
545 25         290 next;
546             }
547             }
548             elsif ($firstchar eq '>') {
549             # fasta header line
550             # this is almost always at the end of the file, and rarely is sequence put
551             # into GFF files anyway, so let's assume it's the end of the file
552 0         0 $self->{'eof'} = 1;
553 0         0 return;
554             }
555            
556             # line must be a GFF feature
557 745         870 chomp $line;
558 745         2860 my @fields = split('\t', $line);
559 745 50       1251 next unless scalar(@fields) == 9;
560            
561             # check the primary_tag and generate the SeqFeature object for known types
562 745         1115 my $type = lc $fields[2];
563 745 100       1899 if ($type eq 'cds') {
    100          
    50          
    100          
    100          
    100          
    100          
564 265 100       474 if ($self->do_cds) {
565 199         249 return &{$self->{convertor_sub}}($self, \@fields);
  199         320  
566             } else {
567 66         868 next;
568             }
569             }
570             elsif ($type eq 'exon') {
571 178 50       327 if ($self->do_exon) {
572 178         206 return &{$self->{convertor_sub}}($self, \@fields);
  178         237  
573             } else {
574 0         0 next;
575             }
576             }
577             elsif ($type =~ /utr|untranslated/) {
578 0 0       0 if ($self->do_utr) {
579 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
580             } else {
581 0         0 next;
582             }
583             }
584             elsif ($type =~ /codon/) {
585 32 50       69 if ($self->do_codon) {
586 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
587             } else {
588 32         438 next;
589             }
590             }
591             elsif ($type =~ /gene$/) {
592 165 50       303 if ($self->do_gene) {
593 165         219 return &{$self->{convertor_sub}}($self, \@fields);
  165         235  
594             } else {
595 0         0 next;
596             }
597             }
598             elsif ($type =~ /transcript|rna/) {
599 36         53 return &{$self->{convertor_sub}}($self, \@fields);
  36         79  
600             }
601             elsif ($type =~ /chromosome|contig|scaffold/) {
602             # gff3 files can record the chromosome as a gff record
603             # process this as a region
604 7         20 $self->{seq_ids}{$fields[0]} = $fields[4];
605 7         95 next;
606             }
607             else {
608             # everything else must be some non-standard weird element
609             # only process this if the user wants everything
610 62 100       105 return &{$self->{convertor_sub}}($self, \@fields) unless $self->simplify;
  2         6  
611             }
612             }
613            
614             # presumably reached the end of the file
615 7         245 $self->{'eof'} = 1;
616 7         21 return;
617             }
618              
619             sub next_top_feature {
620 70     70 1 87 my $self = shift;
621             # check that we have an open filehandle
622 70 50       89 unless ($self->fh) {
623 0         0 croak("no GFF3 file loaded to parse!");
624             }
625 70 50       101 unless ($self->{'eof'}) {
626 0 0       0 $self->parse_file or croak "unable to parse file!";
627             }
628 70         91 return shift @{ $self->{top_features} };
  70         144  
629             }
630              
631             sub top_features {
632 4     4 1 4328 my $self = shift;
633 4 50       74 unless ($self->{'eof'}) {
634 0         0 $self->parse_file;
635             }
636 4         10 my @features = @{ $self->{top_features} };
  4         17  
637 4 50       19 return wantarray ? @features : \@features;
638             }
639              
640             *parse_table = \&parse_file;
641              
642             sub parse_file {
643 7     7 1 23 my $self = shift;
644             # check that we have an open filehandle
645 7 50       18 unless ($self->fh) {
646 0         0 confess("no file loaded to parse!");
647             }
648 7 50       20 return 1 if $self->{'eof'};
649            
650             # Each line will be processed into a SeqFeature object, and then checked
651             # for parentage. Child objects with a Parent tag will be appropriately
652             # associated with its parent, or put into an orphanage. Any orphans
653             # left will be checked a final time for parents; if a parent can't be
654             # found, it will be lost. Features without a parent are assumed to be
655             # top-level features.
656            
657 7 50       14 printf " Parsing %s format file....\n",
    100          
658             $self->version eq '3' ? 'GFF3' :
659             $self->version =~ /2\../ ? 'GTF' : 'GFF';
660            
661            
662             TOP_FEATURE_LOOP:
663 7         40 while (my $feature = $self->next_feature) {
664            
665             ### Process the feature
666             # check the ID
667 576         1059 my $id = $feature->primary_id;
668             # if the seqfeature didn't have an ID specified from the file, then
669             # Bio::ToolBox::SeqFeature will autogenerate one, but Bio::SeqFeature::Lite
670             # will not - so we will likely lose that feature
671 576 100       1778 if ($id) {
672             # remember this feature since we have an ID
673 404 50       609 if (exists $self->{loaded}{$id}) {
674             # this ID should be unique in the GFF file
675             # otherwise it might be a shared duplicate or a malformed GFF file
676             # generally only a concern for top level features
677 0 0       0 if ($feature->primary_tag =~ /gene|rna|transcript/) {
678 0         0 $self->{duplicate_ids}{$id} += 1;
679             }
680            
681             # store all of the features as an array
682 0 0       0 if (ref($self->{loaded}{$id}) eq 'ARRAY') {
683             # there's more than two duplicates! pile it on!
684 0         0 push @{ $self->{loaded}{$id} }, $feature;
  0         0  
685             }
686             else {
687 0         0 my $existing = $self->{loaded}{$id};
688 0         0 $self->{loaded}{$id} = [$existing, $feature];
689             }
690             }
691             else {
692             # unique ID, so remember it
693 404         688 $self->{loaded}{$id} = $feature;
694             }
695             }
696             # if the feature didn't have an ID, we'll just assume it is
697             # a child of another feature, otherwise it may get lost
698            
699             # look for parents and children
700 576 100       918 if ($feature->has_tag('Parent')) {
701             # must be a child
702             # there may be more than one parent, per the GFF3 specification
703 411         1288 foreach my $parent_id ( $feature->get_tag_values('Parent') ) {
704 411 50       1821 if (exists $self->{loaded}{$parent_id}) {
705             # we've seen this id
706             # associate the child with the parent
707 411         467 my $parent = $self->{loaded}{$parent_id};
708 411         802 $parent->add_SeqFeature($feature);
709            
710             # check boundaries for gtf genes
711             # gtf genes may not be explicitly defined so must correct as necessary
712             # gff3 files shouldn't have this issue
713 411 100       12852 if ($self->{gtf}) {
714 312 50       435 if ($feature->start < $parent->start) {
715 0         0 $parent->start( $feature->start );
716             }
717 312 100       1864 if ($feature->end > $parent->end) {
718 1         2 $parent->end( $feature->end );
719             }
720             # check parent's parent too
721 312 100       1824 if ($parent->has_tag('Parent')) {
722             # in all likelihood parent is a transcript and there is a
723             # gene that probably also needs fixin'
724 278         858 my ($grandparent_id) = $parent->get_tag_values('Parent');
725 278         1155 my $grandparent = $self->{loaded}{$grandparent_id};
726 278 50       386 if ($feature->start < $grandparent->start) {
727 0         0 $grandparent->start( $feature->start );
728             }
729 278 50       1620 if ($feature->end > $grandparent->end) {
730 0         0 $grandparent->end( $feature->end );
731             }
732             }
733             }
734             }
735             else {
736             # can't find the parent, maybe not loaded yet?
737             # put 'em in the orphanage
738 0         0 $self->_add_orphan($feature);
739             }
740             }
741             }
742             else {
743             # must be a parent
744 165         285 push @{ $self->{top_features} }, $feature;
  165         415  
745             }
746             }
747             # Finished loading the GFF lines
748            
749             # check for orphans
750 7 50       16 if (scalar @{ $self->{orphans} }) {
  7         27  
751 0         0 $self->check_orphanage;
752             # report
753 0 0       0 if (scalar @{ $self->{orphans} }) {
  0         0  
754             printf " GFF errors: %d features could not be associated with reported parents!\n",
755 0         0 scalar @{ $self->{orphans} };
  0         0  
756             }
757             }
758            
759             # report on duplicate IDs
760 7 50       11 if (keys %{ $self->{duplicate_ids} }) {
  7         964  
761             printf " GFF errors: %d IDs were duplicated\n",
762 0         0 scalar(keys %{ $self->{duplicate_ids} });
  0         0  
763             }
764            
765 7         37 return 1;
766             }
767              
768              
769             sub _make_gene_parent {
770             # for generating GTF gene parent features
771 12     12   17 my ($self, $fields, $gene_id) = @_;
772 12         43 my $gene = $SFCLASS->new(
773             -seq_id => $fields->[0],
774             -primary_tag => 'gene',
775             -start => $fields->[3],
776             -end => $fields->[4],
777             -strand => $fields->[6],
778             -primary_id => $gene_id,
779             );
780            
781 12 50       262 if ($fields->[8] =~ /gene_name "([^"]+)";?/) {
782 12         28 $gene->display_name($1);
783             }
784             else {
785 0         0 $gene->display_name($gene_id);
786             }
787            
788             # add extra information if possible
789 12 100       50 unless ($self->simplify) {
790 2 50       13 if ($fields->[8] =~ /gene_biotype "([^"]+)";?/) {
    50          
791 0         0 $gene->add_tag_value('gene_biotype', $1);
792             }
793             elsif ($fields->[8] =~ /gene_type "([^"]+)";?/) {
794 0         0 $gene->add_tag_value('gene_type', $1);
795             }
796 2 50       8 if ($fields->[8] =~ /gene_source "([^"]+)";?/) {
797 0         0 $gene->source($1);
798             }
799             }
800 12         22 return $gene;
801             }
802              
803              
804             sub _make_rna_parent {
805             # for generating GTF gene parent features
806 0     0   0 my ($self, $fields, $transcript_id) = @_;
807 0         0 my $rna = $SFCLASS->new(
808             -seq_id => $fields->[0],
809             -primary_tag => 'transcript',
810             -start => $fields->[3],
811             -end => $fields->[4],
812             -strand => $fields->[6],
813             -primary_id => $transcript_id,
814             );
815            
816 0 0       0 if ($fields->[8] =~ /transcript_name "([^"]+)";?/) {
817 0         0 $rna->display_name($1);
818             }
819             else {
820 0         0 $rna->display_name($transcript_id);
821             }
822            
823             # add extra information if possible
824 0 0       0 unless ($self->simplify) {
825 0 0       0 if ($fields->[8] =~ /transcript_biotype "([^"]+)";?/) {
    0          
826 0         0 $rna->add_tag_value('transcript_biotype', $1);
827             }
828             elsif ($fields->[8] =~ /transcript_type "([^"]+)";?/) {
829 0         0 $rna->add_tag_value('transcript_type', $1);
830             }
831 0 0       0 if ($fields->[8] =~ /transcript_source "([^"]+)";?/) {
832 0         0 $rna->source($1);
833             }
834             }
835 0         0 return $rna;
836             }
837              
838              
839             sub find_gene {
840 37     37 1 11305 my $self = shift;
841            
842             # check that we have gene2seqf table
843 37 100       63 unless (exists $self->{gene2seqf}) {
844 5 50       16 croak "must parse file first!" unless $self->{'eof'};
845 5         14 $self->{gene2seqf} = {};
846 5         7 foreach (@{ $self->{top_features} }) {
  5         14  
847 107         142 my $name = lc $_->display_name;
848 107 50       353 if (exists $self->{gene2seqf}->{$name}) {
849 0         0 push @{ $self->{gene2seqf}->{$name} }, $_;
  0         0  
850             }
851             else {
852 107         210 $self->{gene2seqf}->{$name} = [$_];
853             }
854             }
855             }
856            
857             # get the name and coordinates from arguments
858 37         45 my ($name, $id, $chrom, $start, $end, $strand);
859 37 50       79 if (scalar @_ == 0) {
    100          
860 0         0 carp "must provide information to find_gene method!";
861 0         0 return;
862             }
863             elsif (scalar @_ == 1) {
864 4         11 $name = $_[0];
865             }
866             else {
867 33         54 my %opt = @_;
868 33   0     51 $name = $opt{name} || $opt{display_name} || undef;
869 33   0     51 $id = $opt{id} || $opt{primary_id} || undef;
870 33   50     107 $chrom = $opt{chrom} || $opt{seq_id} || undef;
871 33   50     59 $start = $opt{start} || undef;
872 33   50     81 $end = $opt{stop} || $opt{end} || undef;
873 33   50     63 $strand = $opt{strand} || 0;
874             }
875 37 50       51 unless ($name) {
876 0         0 carp "name is required for find_gene!";
877 0         0 return;
878             }
879            
880             # check if a gene with this name exists
881 37 50       88 if (exists $self->{gene2seqf}->{lc $name} ) {
882             # we found a matching gene
883            
884             # pull out the gene seqfeature(s) array reference
885             # there may be more than one gene
886 37         52 my $genes = $self->{gene2seqf}->{ lc $name };
887            
888             # go through a series of checks to find the appropriate
889 37 100       50 if ($id) {
890 33         51 foreach my $g (@$genes) {
891 33 50       83 if ($g->primary_id eq $id) {
892 33         55 return $g;
893             }
894             }
895 0         0 return; # none of these matched despite having an ID
896             }
897 4 0 33     11 if ($chrom and $start and $end) {
      33        
898 0         0 foreach my $g (@$genes) {
899 0 0 0     0 if (
      0        
900             # overlap method borrowed from Bio::RangeI
901             ($g->strand == $strand) and not (
902             $g->start > $end or
903             $g->end < $start
904             )
905             ) {
906             # gene and transcript overlap on the same strand
907             # we found the intersecting gene
908 0         0 return $g;
909             }
910             }
911 0         0 return; # none of these matched despite having coordinate info
912             }
913 4 50       10 if (scalar @$genes == 1) {
    0          
914             # going on trust here that this is the one
915 4         10 return $genes->[0];
916             }
917             elsif (scalar @$genes > 1) {
918 0         0 carp "more than one gene named $name found!";
919 0         0 return $genes->[0];
920             }
921            
922             # nothing suitable found
923 0         0 return;
924             }
925             }
926              
927              
928             sub from_gff_string {
929 0     0 1 0 my ($self, $string) = @_;
930            
931             # check string
932 0         0 chomp $string;
933 0 0       0 unless ($string) {
934 0         0 cluck("must pass a string!\n");
935 0         0 return;
936             }
937 0         0 my @fields = split('\t', $string);
938            
939             # check convertor
940 0 0       0 unless (defined $self->{convertor_sub}) {
941 0         0 $self->_assign_gff_converter;
942             }
943            
944             # parse appropriately
945 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
946             }
947              
948              
949             sub _assign_gff_converter {
950 11     11   18 my $self = shift;
951 11 100       31 if ($self->{gff3}) {
    50          
952 7         21 $self->{convertor_sub} = \&_gff3_to_seqf;
953             }
954             elsif ($self->{gtf}) {
955 4 100       9 if ($self->simplify) {
956 2         6 $self->{convertor_sub} = \&_gtf_to_seqf_simple;
957             }
958             else {
959 2         6 $self->{convertor_sub} = \&_gtf_to_seqf_full;
960             }
961             # double check we have transcript information
962 4 50       8 if ($self->typelist !~ /transcript|rna/i) {
963             # we will have to rely on exon and/or cds information to get transcript
964 0 0 0     0 unless ($self->do_exon or $self->do_cds) {
965 0         0 $self->do_exon(1);
966             }
967             }
968 4 50 33     11 if ($self->do_gene and $self->typelist !~ /gene/i) {
969 4         8 $self->do_exon(1);
970             }
971             }
972             else {
973 0         0 $self->{convertor_sub} = \&_gff2_to_seqf;
974             }
975             }
976              
977              
978             sub _gff3_to_seqf {
979 266     266   330 my ($self, $fields) = @_;
980 266         313 my $group = $fields->[8];
981 266         369 my $feature = $self->_gff_to_seqf($fields);
982            
983             # process groups
984 266         1575 foreach my $g (split(/\s*;\s*/, $group)) {
985 1382         3306 my ($tag, $value) = split /=/, $g;
986 1382         1836 $tag = $self->unescape($tag);
987 1382         2012 my @values = map { $self->unescape($_) } split(/,/, $value);
  1470         1696  
988            
989             # determine the appropriate attribute based on tag name
990 1382 100       3618 if ($tag eq 'Name') {
    100          
    100          
    50          
    100          
991 266         489 $feature->display_name($values[0]);
992             }
993             elsif ($tag eq 'ID') {
994 167         330 $feature->primary_id($values[0]);
995             }
996             elsif ($tag eq 'Parent') {
997             # always record Parent except for transcripts when genes are not wanted
998 99 50 33     154 unless (not $self->do_gene and $feature->primary_tag =~ /rna|transcript/i) {
999 99         121 foreach (@values) {
1000 99         177 $feature->add_tag_value($tag, $_);
1001             }
1002             }
1003             }
1004             elsif (lc $tag eq 'exon_id') {
1005             # ensembl GFF3 store the exon id but doesn't record it as the ID, why?
1006 0         0 $feature->primary_id($values[0]);
1007             }
1008             elsif (not $self->{simplify}) {
1009 2         4 foreach (@values) {
1010 2         8 $feature->add_tag_value($tag, $_);
1011             }
1012             }
1013             }
1014            
1015 266         965 return $feature;
1016             }
1017              
1018              
1019             sub _gtf_to_seqf_simple {
1020              
1021 312     312   431 my ($self, $fields) = @_;
1022 312         388 my $group = $fields->[8];
1023 312         451 my $feature = $self->_gff_to_seqf($fields);
1024            
1025             # extract essential tags
1026 312         345 my ($gene_id, $transcript_id);
1027 312 50       1104 if ($group =~ /gene_id "([^"]+)";?/i) {
1028 312         560 $gene_id = $1;
1029             }
1030 312 50       793 if ($group =~ /transcript_id "([^"]+)";?/i) {
1031 312         441 $transcript_id = $1;
1032             }
1033 312 50 33     462 unless ($gene_id or $transcript_id) {
1034             # improperly formatted GTF file without one of these two items, nothing more to do
1035 0         0 return $feature;
1036             }
1037            
1038             # common subfeatures including exon, CDS, UTR, and codons
1039 312 100       1013 if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) {
    50          
    0          
1040 278         661 $feature->add_tag_value('Parent', $transcript_id);
1041            
1042             # exon id if present
1043 278 50 66     1489 if ($fields->[2] eq 'exon' and $group =~ /exon_id "([^"]+)";?/i) {
1044 0         0 $feature->primary_id($1);
1045             }
1046            
1047             # check gene parent
1048 278 50 33     463 if ($self->do_gene and not exists $self->{loaded}{$gene_id}) {
1049 0         0 my $gene = $self->_make_gene_parent($fields, $gene_id);
1050 0         0 $self->{loaded}{$gene_id} = $gene;
1051 0         0 push @{ $self->{top_features} }, $gene;
  0         0  
1052             }
1053            
1054             # check transcript parent
1055 278 50       455 unless (exists $self->{loaded}{$transcript_id}) {
1056 0         0 my $rna = $self->_make_rna_parent($fields, $transcript_id);
1057 0         0 $self->{loaded}{$transcript_id} = $rna;
1058 0 0       0 if ($self->do_gene) {
1059 0         0 $rna->add_tag_value('Parent', $gene_id);
1060 0         0 $self->{loaded}{$gene_id}->add_SeqFeature($rna);
1061             }
1062             else {
1063 0         0 push @{ $self->{top_features} }, $rna;
  0         0  
1064             }
1065             }
1066             }
1067            
1068             # a transcript feature
1069             elsif ($fields->[2] =~ /rna|transcript/i) {
1070             # these are sometimes present in GTF files, such as from Ensembl
1071             # but are not required and often absent
1072 34         79 $feature->primary_id($transcript_id);
1073            
1074             # transcript information
1075 34 50       192 if ($group =~ /transcript_name "([^"]+)";?/i) {
1076 34         69 $feature->display_name($1);
1077             }
1078            
1079             # check whether parent was loaded and add gene information if not
1080 34 50       158 if ($self->do_gene) {
1081 34         77 $feature->add_tag_value('Parent', $gene_id);
1082 34 100       155 if (not exists $self->{loaded}{$gene_id}) {
1083 10         24 my $gene = $self->_make_gene_parent($fields, $gene_id);
1084 10         18 $self->{loaded}{$gene_id} = $gene;
1085 10         14 push @{ $self->{top_features} }, $gene;
  10         18  
1086             }
1087             }
1088             }
1089            
1090             # a gene feature
1091             elsif ($fields->[2] eq 'gene') {
1092             # these are sometimes present in GTF files, such as from Ensembl
1093             # but are not required and often absent
1094 0         0 $feature->primary_id($gene_id);
1095 0 0       0 if ($group =~ /gene_name "([^"]+)";?/i) {
1096 0         0 $feature->display_name($1);
1097             }
1098             }
1099            
1100             # anything else, like CNS (conserved noncoding sequence) doesn't get any
1101             # further special attributes, as far as I can tell
1102 312         1394 return $feature;
1103             }
1104              
1105              
1106             sub _gtf_to_seqf_full {
1107            
1108 2     2   6 my ($self, $fields) = @_;
1109 2         4 my $group = $fields->[8];
1110 2         5 my $feature = $self->_gff_to_seqf($fields);
1111            
1112             # process the group tags
1113 2         3 my %attributes;
1114 2         9 foreach my $g (split('; ', $group)) { # supposed to be "; " as delimiter
1115 10         19 my ($tag, $value, @bits) = split ' ', $g;
1116 10         25 $value =~ s/[";]//g; # remove the flanking double quotes, assume no internal quotes
1117 10         20 $attributes{$tag} = $value;
1118             }
1119            
1120             # assign special tags based on the feature type
1121 2 50       16 if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) {
    50          
    0          
1122 0         0 $feature->add_tag_value('Parent', $attributes{'transcript_id'});
1123            
1124             # exon id if present
1125 0 0 0     0 if ($fields->[2] eq 'exon' and exists $attributes{'exon_id'}) {
1126 0         0 $feature->primary_id($attributes{'exon_id'});
1127             }
1128            
1129             # check gene parent
1130 0 0       0 if ($self->do_gene) {
1131 0   0     0 my $gene_id = $attributes{gene_id} || undef;
1132 0 0 0     0 if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1133 0         0 my $gene = $self->_make_gene_parent($fields, $gene_id);
1134 0         0 $self->{loaded}{$gene_id} = $gene;
1135 0         0 push @{ $self->{top_features} }, $gene;
  0         0  
1136             }
1137             }
1138            
1139             # check transcript parent
1140 0   0     0 my $transcript_id = $attributes{transcript_id} || undef;
1141 0 0 0     0 if ($transcript_id and not exists $self->{loaded}{$transcript_id}) {
1142 0         0 my $rna = $self->_make_rna_parent($fields, $transcript_id);
1143 0         0 $self->{loaded}{$transcript_id} = $rna;
1144 0 0       0 if ($self->do_gene) {
1145 0         0 $rna->add_tag_value('Parent', $attributes{gene_id});
1146 0         0 $self->{loaded}{ $attributes{gene_id} }->add_SeqFeature($rna);
1147             }
1148             else {
1149 0         0 push @{ $self->{top_features} }, $rna;
  0         0  
1150             }
1151             }
1152             }
1153            
1154             # transcript
1155             elsif ($fields->[2] =~ /transcript|rna/i) {
1156             # these are sometimes present in GTF files, such as from Ensembl
1157            
1158             # transcript information
1159 2         10 $feature->primary_id($attributes{transcript_id});
1160 2         10 delete $attributes{transcript_id};
1161 2 50       5 if (exists $attributes{transcript_name}) {
1162 2         7 $feature->display_name($attributes{transcript_name});
1163 2         10 delete $attributes{transcript_name};
1164             }
1165            
1166             # check gene parent
1167 2 50       5 if ($self->do_gene) {
1168 2   50     5 my $gene_id = $attributes{gene_id} || undef;
1169 2 50 33     9 if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1170 2         8 my $gene = $self->_make_gene_parent($fields, $gene_id);
1171 2         4 $self->{loaded}{$gene_id} = $gene;
1172 2         4 push @{ $self->{top_features} }, $gene;
  2         6  
1173             }
1174 2         8 $feature->add_tag_value('Parent', $gene_id);
1175             }
1176             }
1177            
1178             # gene
1179             elsif ($fields->[2] eq 'gene') {
1180             # these are sometimes present in GTF files, such as from Ensembl
1181             # but are not required and often absent
1182 0         0 $feature->primary_id($attributes{gene_id});
1183 0         0 delete $attributes{gene_id};
1184 0 0       0 if (exists $attributes{gene_name}) {
1185 0         0 $feature->display_name($attributes{gene_name});
1186 0         0 delete $attributes{gene_name};
1187             }
1188             }
1189            
1190             # store remaining attributes, if any
1191 2         14 foreach my $key (keys %attributes) {
1192 6         19 $feature->add_tag_value($key, $attributes{$key});
1193             }
1194 2         14 return $feature;
1195             }
1196              
1197              
1198             sub _gff2_to_seqf {
1199             # generic gff1 or gff2 format or poorly defined gff3 or gtf file
1200             # hope for the best!
1201 0     0   0 my ($self, $fields) = @_;
1202 0         0 my $feature = $self->_gff_to_seqf($fields);
1203            
1204             # process groups
1205             # we have no uniform method of combining features, so we'll leave the tags
1206             # as is and hope for the best
1207 0         0 foreach my $g (split(/\s*;\s*/, $fields->[8])) {
1208 0         0 my ($tag, $value) = split /\s+/, $g;
1209 0 0 0     0 next unless ($tag and $value);
1210 0         0 $feature->add_tag_value($tag, $value);
1211             }
1212 0         0 return $feature;
1213             }
1214              
1215              
1216             sub _gff_to_seqf {
1217 580     580   709 my ($self, $fields) = @_;
1218            
1219             # generate the basic SeqFeature
1220 580         2006 my $feature = $SFCLASS->new(
1221             -seq_id => $fields->[0],
1222             -source => $fields->[1],
1223             -primary_tag => $fields->[2],
1224             -start => $fields->[3],
1225             -end => $fields->[4],
1226             -strand => $fields->[6],
1227             );
1228            
1229             # add more attributes if they're not null
1230 580 50       10084 if ($fields->[5] ne '.') {
1231 0         0 $feature->score($fields->[5]);
1232             }
1233 580 100       895 if ($fields->[7] ne '.') {
1234 199         352 $feature->phase($fields->[7]);
1235             }
1236            
1237             # finished
1238 580         979 return $feature;
1239             }
1240              
1241              
1242             sub unescape {
1243             # Borrowed unashamedly from bioperl Bio::Tools::GFF
1244             # which in turn was borrowed from Bio::DB::GFF
1245 2852     2852 1 2624 my $self = shift;
1246 2852         2644 my $v = shift;
1247 2852         3101 $v =~ tr/+/ /;
1248 2852         3636 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  5574         10458  
1249 2852         4421 return $v;
1250             }
1251              
1252              
1253             sub _add_orphan {
1254 0     0     my ($self, $feature) = @_;
1255 0           push @{ $self->{orphans} }, $feature;
  0            
1256 0           return 1;
1257             }
1258              
1259              
1260             sub check_orphanage {
1261 0     0 0   my $self = shift;
1262 0 0         return unless scalar @{ $self->{orphans} };
  0            
1263            
1264             # go through the list of orphans
1265 0           my @reunited; # list of indices to delete after reuniting orphan with parent
1266 0           for (my $i = 0; $i < scalar @{ $self->{orphans} }; $i++) {
  0            
1267 0           my $orphan = $self->{orphans}->[$i];
1268 0           my $success = 0;
1269            
1270             # find the parent
1271 0           foreach my $parent ($orphan->get_tag_values('Parent') ) {
1272 0 0         if (exists $self->{loaded}{$parent}) {
1273             # we have loaded the parent
1274             # associate each orphan feature with the parent
1275 0           $self->{loaded}{$parent}->add_SeqFeature($orphan);
1276 0           $success++;
1277             }
1278             }
1279             # delete the orphan from the array if it found it's long lost parent
1280 0 0         push @reunited, $i if $success;
1281             }
1282            
1283             # clean up the orphanage
1284 0           while (@reunited) {
1285 0           my $i = pop @reunited;
1286 0           splice(@{ $self->{orphans} }, $i, 1);
  0            
1287             }
1288             }
1289              
1290             sub orphans {
1291 0     0 1   my $self = shift;
1292 0           my @orphans;
1293 0           foreach (@{ $self->{orphans} }) {
  0            
1294 0           push @orphans, $_;
1295             }
1296 0 0         return wantarray ? @orphans : \@orphans;
1297             }
1298              
1299              
1300             sub comments {
1301 0     0 1   my $self = shift;
1302 0           my @comments;
1303 0           foreach (@{ $self->{comments} }) {
  0            
1304 0           push @comments, $_;
1305             }
1306 0 0         return wantarray ? @comments : \@comments;
1307             }
1308              
1309              
1310             sub seq_ids {
1311 0     0 1   my $self = shift;
1312 0 0         unless (scalar keys %{$self->{seq_ids}}) {
  0            
1313 0           $self->_get_seq_ids;
1314             }
1315 0           my @s = keys %{$self->{seq_ids}};
  0            
1316 0 0         return wantarray ? @s : \@s;
1317             }
1318              
1319              
1320             sub seq_id_lengths {
1321 0     0 1   my $self = shift;
1322 0 0         unless (scalar keys %{$self->{seq_ids}}) {
  0            
1323 0           $self->_get_seq_ids;
1324             }
1325 0           return $self->{seq_ids};
1326             }
1327              
1328             sub _get_seq_ids {
1329 0     0     my $self = shift;
1330 0 0         return unless $self->{'eof'};
1331 0           foreach (@{ $self->{top_features} }) {
  0            
1332 0           my $s = $_->seq_id;
1333 0 0         unless (exists $self->{seq_ids}{$s}) {
1334 0           $self->{seq_ids}{$s} = 1;
1335             }
1336 0 0         $self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
1337             }
1338             }
1339              
1340             __END__