File Coverage

blib/lib/Bio/FeatureIO/gff.pm
Criterion Covered Total %
statement 242 402 60.2
branch 88 228 38.6
condition 23 80 28.7
subroutine 26 34 76.4
pod 11 11 100.0
total 390 755 51.6


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             Bio::FeatureIO::gff - read/write GFF feature files
6              
7             =head1 SYNOPSIS
8              
9             my $feature; #get a Bio::SeqFeature::Annotated somehow
10             my $featureOut = Bio::FeatureIO->new(
11             -format => 'gff',
12             -version => 3,
13             -fh => \*STDOUT,
14             -validate_terms => 1, #boolean. validate ontology terms online? default 0 (false).
15             );
16             $featureOut->write_feature($feature);
17              
18             =head1 DESCRIPTION
19              
20             Currently implemented:
21              
22             version read? write?
23             ------------------------------
24             GFF 1 N N
25             GFF 2 N N
26             GFF 2.5 (GTF) N Y
27             GFF 3 Y Y
28              
29             =head1 FEEDBACK
30              
31             =head2 Mailing Lists
32              
33             User feedback is an integral part of the evolution of this and other
34             Bioperl modules. Send your comments and suggestions preferably to
35             the Bioperl mailing list. Your participation is much appreciated.
36              
37             bioperl-l@bioperl.org - General discussion
38             http://bioperl.org/wiki/Mailing_list - About the mailing lists
39              
40             =head2 Support
41              
42             Please direct usage questions or support issues to the mailing list:
43              
44             I
45              
46             rather than to the module maintainer directly. Many experienced and
47             reponsive experts will be able look at the problem and quickly
48             address it. Please include a thorough description of the problem
49             with code and data examples if at all possible.
50              
51             =head2 Reporting Bugs
52              
53             Report bugs to the Bioperl bug tracking system to help us keep track
54             of the bugs and their resolution. Bug reports can be submitted via
55             the web:
56              
57             http://bugzilla.open-bio.org/
58              
59             =head1 AUTHOR
60              
61             Allen Day,
62              
63             =head1 CONTRIBUTORS
64              
65             Steffen Grossmann,
66             Scott Cain,
67             Rob Edwards
68              
69             =head1 APPENDIX
70              
71             The rest of the documentation details each of the object methods.
72             Internal methods are usually preceded with a _
73              
74             =cut
75              
76              
77             # Let the code begin...
78              
79             package Bio::FeatureIO::gff;
80             BEGIN {
81 2     2   1680 $Bio::FeatureIO::gff::AUTHORITY = 'cpan:BIOPERLML';
82             }
83             $Bio::FeatureIO::gff::VERSION = '1.6.905';
84 2     2   15 use strict;
  2         3  
  2         50  
85              
86             #these are alphabetical, keep them that way.
87 2     2   622 use Bio::Annotation::DBLink;
  2         3406  
  2         47  
88 2     2   517 use Bio::Annotation::OntologyTerm;
  2         25672  
  2         52  
89 2     2   535 use Bio::Annotation::SimpleValue;
  2         903  
  2         72  
90 2     2   525 use Bio::Annotation::Target;
  2         7045  
  2         70  
91 2     2   14 use Bio::FeatureIO;
  2         4  
  2         41  
92 2     2   9 use Bio::Ontology::OntologyStore;
  2         4  
  2         34  
93 2     2   8 use Bio::OntologyIO;
  2         3  
  2         43  
94 2     2   886 use Bio::SeqFeature::Annotated;
  2         6  
  2         62  
95 2     2   714 use Bio::SeqIO;
  2         17774  
  2         61  
96 2     2   15 use URI::Escape;
  2         4  
  2         103  
97              
98 2     2   11 use base qw(Bio::FeatureIO);
  2         4  
  2         121  
99              
100 2     2   11 use constant DEFAULT_VERSION => 3;
  2         4  
  2         6700  
101             my $RESERVED_TAGS = "ID|Name|Alias|Parent|Target|Gap|Derives_from|Note|Dbxref|dbxref|Ontology_term|Index|CRUD";
102              
103             sub _initialize {
104 7     7   25 my($self,%arg) = @_;
105              
106 7         39 $self->SUPER::_initialize(%arg);
107              
108 7   50     1328 $self->version( $arg{-version} || DEFAULT_VERSION);
109 7   50     49 $self->validate($arg{-validate_terms} || 0);
110              
111 7 50       27 if ($arg{-file} =~ /^>.*/ ) {
112 0         0 $self->_print("##gff-version " . $self->version() . "\n");
113             }
114             else {
115 7         13 my $directive;
116 7   66     25 while(($directive = $self->_readline()) && ($directive =~ /^##/) ){
117 12         555 $self->_handle_directive($directive);
118             }
119 7         192 $self->_pushback($directive);
120             }
121            
122             #need to validate against SOFA, no SO
123 7 50       83 if ($self->validate) {
124 0         0 $self->so(
125             Bio::Ontology::OntologyStore->get_ontology('Sequence Ontology Feature Annotation')
126             );
127             }
128             }
129              
130             =head2 next_feature()
131              
132             Usage : my $feature = $featureio->next_feature();
133             Function: reads a feature record from a GFF stream and returns it as an object.
134             Returns : a Bio::SeqFeature::Annotated object
135             Args : N/A
136              
137             =cut
138              
139             sub next_feature {
140 68     68 1 9756 my $self = shift;
141 68         107 my $gff_string;
142              
143 68         159 my($f) = $self->_buffer_feature();
144 68 100       147 if($f){
145 1         5 return $f;
146             }
147              
148 67 100       168 return if $self->fasta_mode();
149              
150             # be graceful about empty lines or comments, and make sure we return undef
151             # if the input is consumed
152 65   33     257 while(($gff_string = $self->_readline()) && defined($gff_string)) {
153 68 50       2146 next if $gff_string =~ /^\s*$/; #skip blank lines
154 68 100       203 next if $gff_string =~ /^\#[^#]/; #skip comments, but not directives
155 65         95 last;
156             }
157              
158 65 50       122 return unless $gff_string;
159              
160             # looks like we went into FASTA mode without a directive.
161 65 100       207 if($gff_string =~ /^>/){
    100          
162            
163 2         9 my $pos = tell($self->_fh);
164            
165             # TODO: SeqIO::fasta uses _fh directly, whereas _pushback uses an in-memory
166             # buffer; this completely breaks with piped data, so fix for files and punt
167             # on the rest for now
168            
169 2 50       16 if ($pos >= 0) {
170 2         6 seek($self->_fh, -length($gff_string), 1);
171             } else {
172 0         0 $self->warn("In-line parsing of FASTA not yet supported");
173 0         0 $self->_pushback($gff_string);
174             }
175 2         33 $self->fasta_mode(1);
176 2         5 return;
177             }
178              
179             # got a directive
180             elsif($gff_string =~ /^##/){
181 6         26 $self->_handle_directive($gff_string);
182 6         11 return $self->_buffer_feature();
183             }
184              
185             # got a feature
186             else {
187 57         167 return $self->_handle_feature($gff_string);
188             }
189             }
190              
191             =head2 next_feature_group
192              
193             Title : next_feature_group
194             Usage : @feature_group = $stream->next_feature_group
195             Function: Reads the next feature_group from $stream and returns it.
196              
197             Feature groups in GFF3 files are separated by '###' directives. The
198             features in a group might form a hierarchical structure. The
199             complete hierarchy of features is returned, i.e. the returned array
200             represents only the top-level features. Lower-level features can
201             be accessed using the 'get_SeqFeatures' method recursively.
202              
203             Example : # getting the complete hierarchy of features in a GFF3 file
204             my @toplevel_features;
205             while (my @fg = $stream->next_feature_group) {
206             push(@toplevel_features, @fg);
207             }
208             Returns : an array of Bio::SeqFeature::Annotated objects
209             Args : none
210              
211             =cut
212              
213             sub next_feature_group {
214 5     5 1 7015 my $self = shift;
215              
216 5         26 my $feat;
217             my %seen_ids;
218 5         0 my @all_feats;
219 5         0 my @toplevel_feats;
220              
221 5         13 $self->{group_not_done} = 1;
222              
223 5   66     31 while ($self->{group_not_done} && ($feat = $self->next_feature()) && defined($feat)) {
      66        
224             # we start by collecting all features in the group and
225             # memorizing those which have an ID attribute
226 30         133 my $anno_ID = $feat->get_Annotations('ID');
227 30 100       71 if(ref($anno_ID)) {
228 6         15 my $attr_ID = $anno_ID->value;
229             $self->throw("Oops! ID $attr_ID exists more than once in your file!")
230 6 50       44 if (exists($seen_ids{$attr_ID}));
231 6         17 $seen_ids{$attr_ID} = $feat;
232             }
233 30         127 push(@all_feats, $feat);
234             }
235              
236             # assemble the top-level features
237 5         15 foreach $feat (@all_feats) {
238 30         84 my @parents = $feat->get_Annotations('Parent');
239 30 100       59 if (@parents) {
240 24         71 foreach my $parent (@parents) {
241 24         53 my $parent_id = $parent->value;
242 24 50       168 $self->throw("Parent with ID $parent_id not found!") unless (exists($seen_ids{$parent_id}));
243 24         57 $seen_ids{$parent_id}->add_SeqFeature($feat);
244             }
245             } else {
246 6         14 push(@toplevel_feats, $feat);
247             }
248             }
249              
250 5         41 return @toplevel_feats;
251             }
252              
253             =head2 next_seq()
254              
255             access the FASTA section (if any) at the end of the GFF stream. note that this method
256             will return undef if not all features in the stream have been handled
257              
258             =cut
259              
260             sub next_seq() {
261 17     17 1 12080 my $self = shift;
262 17 100       48 return unless $self->fasta_mode();
263              
264             #first time next_seq has been called. initialize Bio::SeqIO instance
265 12 100       34 if(!$self->seqio){
266 6         28 $self->seqio( Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh()) );
267             }
268 12         31 return $self->seqio->next_seq();
269             }
270              
271             =head2 write_feature()
272              
273             Usage : $featureio->write_feature( Bio::SeqFeature::Annotated->new(...) );
274             Function: writes a feature in GFF format. the GFF version used is governed by the
275             '-version' argument passed to Bio::FeatureIO->new(), and defaults to GFF
276             version 3.
277             Returns : ###FIXME
278             Args : a Bio::SeqFeature::Annotated object.
279              
280             =cut
281              
282             sub write_feature {
283 0     0 1 0 my($self,$feature) = @_;
284 0 0       0 if (!$feature) {
285 0         0 $self->throw("gff.pm cannot write_feature unless you give a feature to write.\n");
286             }
287 0 0       0 $self->throw("only Bio::SeqFeature::Annotated objects are writeable") unless $feature->isa('Bio::SeqFeature::Annotated');
288              
289 0 0       0 if($self->version == 1){
    0          
    0          
    0          
290 0         0 return $self->_write_feature_1($feature);
291             } elsif($self->version == 2){
292 0         0 return $self->_write_feature_2($feature);
293             } elsif($self->version == 2.5){
294 0         0 return $self->_write_feature_25($feature);
295             } elsif($self->version == 3){
296 0         0 return $self->_write_feature_3($feature);
297             } else {
298 0         0 $self->throw(sprintf("don't know how to write GFF version %s",$self->version));
299             }
300             }
301              
302             ################################################################################
303              
304             =head1 ACCESSORS
305              
306             =cut
307              
308             =head2 fasta_mode()
309              
310             Usage : $obj->fasta_mode($newval)
311             Function:
312             Example :
313             Returns : value of fasta_mode (a scalar)
314             Args : on set, new value (a scalar or undef, optional)
315              
316             Side effect when setting: rewind the file handle a little bit to get the last
317             carriage return that was swallowed when the previous line was processed.
318              
319             =cut
320              
321             sub fasta_mode {
322 91     91 1 156 my($self,$val) = @_;
323              
324 91 100       211 $self->{'fasta_mode'} = $val if defined($val);
325              
326 91 100 66     202 if ($val && $val == 1) {
327             # seek $self->_fh(), -1, 1; #rewind 1 byte to get the previous line's \n
328 7         29 $self->_pushback("\n");
329             }
330              
331 91         295 return $self->{'fasta_mode'};
332             }
333              
334             =head2 seqio()
335              
336             Usage : $obj->seqio($newval)
337             Function: holds a Bio::SeqIO instance for handling the GFF3 ##FASTA section.
338             Returns : value of seqio (a scalar)
339             Args : on set, new value (a scalar or undef, optional)
340              
341              
342             =cut
343              
344             sub seqio {
345 30     30 1 7519 my($self,$val) = @_;
346 30 100       73 $self->{'seqio'} = $val if defined($val);
347 30         73 return $self->{'seqio'};
348             }
349              
350             =head2 sequence_region()
351              
352             Usage :
353             Function: ###FIXME
354             Returns :
355             Args :
356              
357              
358             =cut
359              
360             sub sequence_region {
361 1     1 1 9 my ($self,$k,$v) = @_;
362 1 50 33     6 if(defined($k) && defined($v)){
    0          
363 1         6 $self->{'sequence_region'}{$k} = $v;
364 1         2 return $v;
365             }
366             elsif(defined($k)){
367 0         0 return $self->{'sequence_region'}{$k};
368             }
369             else {
370 0         0 return;
371             }
372             }
373              
374              
375             =head2 so()
376              
377             Usage : $obj->so($newval)
378             Function: holds a Sequence Ontology instance
379             Returns : value of so (a scalar)
380             Args : on set, new value (a scalar or undef, optional)
381              
382             =cut
383              
384             sub so {
385 0     0 1 0 my $self = shift;
386 0         0 my $val = shift;
387             ###FIXME validate $val object's type
388 0 0       0 $self->{so} = $val if defined($val);
389 0         0 return $self->{so};
390             }
391              
392             =head2 validate()
393              
394             Usage : $obj->validate($newval)
395             Function: true if encountered ontology terms in next_feature()
396             mode should be validated.
397             Returns : value of validate (a scalar)
398             Args : on set, new value (a scalar or undef, optional)
399              
400              
401             =cut
402              
403             sub validate {
404 80     80 1 158 my($self,$val) = @_;
405 80 100       160 $self->{'validate'} = $val if defined($val);
406 80         194 return $self->{'validate'};
407             }
408              
409             =head2 version()
410              
411             Usage : $obj->version($newval)
412             Function: version of GFF to read/write. valid values are 1, 2, 2.5, and 3.
413             Returns : value of version (a scalar)
414             Args : on set, new value (a scalar or undef, optional)
415              
416             =cut
417              
418             sub version {
419 7     7 1 15 my $self = shift;
420 7         15 my $val = shift;
421 7         23 my %valid = map {$_=>1} (1, 2, 2.5, 3);
  28         141  
422 7 50 33     42 if(defined $val && $valid{$val}){
    0          
423 7         27 return $self->{'version'} = $val;
424             }
425             elsif(defined($val)){
426 0         0 $self->throw('invalid version. valid versions: '.join(' ', sort keys %valid));
427             }
428 0         0 return $self->{'version'};
429             }
430              
431             ################################################################################
432              
433             =head1 INTERNAL METHODS
434              
435             =cut
436              
437             =head2 _buffer_feature()
438              
439             Usage :
440             Function: ###FIXME
441             Returns :
442             Args :
443              
444             =cut
445              
446             sub _buffer_feature {
447 75     75   146 my ($self,$f) = @_;
448              
449 75 100       198 if ( $f ) {
    100          
450 1         3 push @{ $self->{'buffer'} }, $f;
  1         3  
451 1         4 return $f;
452             }
453             elsif ( $self->{'buffer'} ) {
454 2         5 return shift @{ $self->{'buffer'} };
  2         9  
455             }
456             else {
457 72         149 return;
458             }
459             }
460              
461              
462             =head1 _handle_directive()
463              
464             this method is called for lines beginning with '##'.
465              
466             =cut
467              
468             sub _handle_directive {
469 18     18   51 my($self,$directive_string) = @_;
470              
471 18         74 $directive_string =~ s/^##//; #remove escape
472 18         107 my($directive,@arg) = split /\s+/, $directive_string;
473              
474 18 100 66     136 if($directive eq 'gff-version'){
    100          
    100          
    100          
    100          
    100          
    50          
    0          
475 6         18 my $version = $arg[0];
476 6 50       30 if($version != 3){
477 0         0 $self->throw("this is not a gff version 3 document, it is version '$version'");
478             }
479             }
480              
481             elsif($directive eq 'sequence-region'){
482             # RAE: Sequence regions are in the format sequence-region seqid start end
483             # for these we want to store the seqid, start, and end. Then when we validate
484             # we want to make sure that the features are within the seqid/start/end
485              
486 1 50 33     6 $self->throw('Both start and end for sequence region should be defined')
487             unless $arg[1] && $arg[2];
488 1         8 my $fta = Bio::Annotation::OntologyTerm->new();
489 1         29 $fta->name( 'region');
490              
491 1         122 my $f = Bio::SeqFeature::Annotated->new();
492 1         8 $f->seq_id( $arg[0] );
493 1         14 $f->start( $arg[1] );
494 1         10 $f->end( $arg[2] );
495 1         28 $f->strand(1);
496              
497 1         15 $f->type( $fta );
498              
499             #cache this in sequence_region(), we may need it for validation later.
500 1         3 $self->sequence_region($f->seq_id => $f);
501              
502             #NOTE: is this the right thing to do -- treat this as a feature? -allenday
503             #buffer it to be returned by next_feature()
504 1         3 $self->_buffer_feature($f);
505             }
506              
507             elsif($directive eq 'feature-ontology'){
508 1         25 $self->warn("'##$directive' directive handling not yet implemented");
509             }
510              
511             elsif($directive eq 'attribute-ontology'){
512 1         7 $self->warn("'##$directive' directive handling not yet implemented");
513             }
514              
515             elsif($directive eq 'source-ontology'){
516 1         5 $self->warn("'##$directive' directive handling not yet implemented");
517             }
518              
519             elsif($directive eq 'FASTA' or $directive =~ /^>/){
520             #next_seq() will take care of this.
521 5         13 $self->fasta_mode(1);
522 5         10 return;
523             }
524              
525             elsif($directive eq '#'){
526             #all forward references resolved
527 3         10 $self->{group_not_done} = 0;
528             }
529              
530             elsif($directive eq 'organism') {
531 0         0 my $organism = $arg[0];
532 0         0 $self->organism($organism);
533             }
534              
535             else {
536 0         0 $self->throw("don't know what do do with directive: '##".$directive."'");
537             }
538             }
539              
540             =head1 _handle_feature()
541              
542             this method is called for each line not beginning with '#'. it parses the line and returns a
543             Bio::SeqFeature::Annotated object.
544              
545             =cut
546              
547             sub _handle_feature {
548 57     57   119 my($self,$feature_string) = @_;
549              
550 57         228 my $feat = Bio::SeqFeature::Annotated->new();
551              
552 57         340 my($seq,$source,$type,$start,$end,$score,$strand,$phase,$attribute_string) = split /\t/, $feature_string;
553              
554 57         220 $feat->seq_id($seq);
555 57         386 $feat->source_tag($source);
556 57 50       925 $feat->start($start) unless $start eq '.';
557 57 50       666 $feat->end($end) unless $end eq '.';
558 57 50       1553 $feat->strand($strand eq '+' ? 1 : $strand eq '-' ? -1 : 0);
    100          
559 57         808 $feat->score($score);
560 57         784 $feat->phase($phase);
561              
562 57         439 my $fta = Bio::Annotation::OntologyTerm->new();
563              
564 57 50       1693 if($self->validate()){
565             # RAE Added a couple of validations based on the GFF3 spec at http://song.sourceforge.net/gff3.shtml
566             # 1. Validate the id
567 0 0       0 if ($seq =~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) { # I just escaped everything.
568 0         0 $self->throw("Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped");
569             }
570              
571 0 0       0 if ($seq =~ /\s/) {
572 0         0 $self->throw("Validation Error: seqid ($seq) contains unescaped whitespace")
573             }
574              
575             # NOTE i think we're handling this in as a directive, and this test may be removed -allenday
576 0 0       0 if ($seq =~ /^>/) {
577 0         0 $self->throw("Validation Error: seqid ($seq) begins with a >")
578             }
579              
580             # 2. Validate the starts and stops.
581             # these need to be within the region's bounds, and
582             # also start <= end. bail out if either is not true.
583 0 0       0 if ($start > $end) {
584 0         0 $self->throw("Validation Error: start ($start) must be less than or equal to end in $seq");
585             }
586              
587 0         0 my $region = $self->sequence_region($seq);
588             # NOTE: we can only validate against sequence-region that are declared in the file.
589             # if i reference some region from elsewhere, can't validate. if we want to be really strict
590             # we should bail out here. -allenday
591 0 0 0     0 if ( defined($region) && $start < $region->start() || $end > $region->end() ) {
      0        
592 0         0 $self->throw("Validation Error: sequence location ($seq from $start to $end) does not appear to lie within a defined sequence-region")
593             }
594              
595             # 3. Validate the strand.
596             # In the unvalidated version +=1 and -=-1. Everything else is 0. We just need to warn when it is not [+-.?]
597 0 0       0 $self->throw("Validation Error: strand is not one of [+-.?] at $seq") if ($strand =~ /^[^\+\-\.\?]$/);
598              
599             # 4. Validate the phase to be one of [.012]
600 0 0       0 $self->throw("Validation Error: phase is not one of [.012] at $seq") if ($phase =~ /^[^\.012]$/);
601              
602 0         0 my $feature_type;
603 0 0       0 if($type =~ /^\D+:\d+$/){
604             #looks like an identifier
605 0         0 ($feature_type) = $self->so->find_terms(-identifier => $type);
606             } else {
607             #looks like a name
608 0         0 ($feature_type) = $self->so->find_terms(-name => $type);
609             }
610              
611 0 0       0 if(!$feature_type){
612 0         0 $self->throw("Validation Error: couldn't find ontology term for '$type'.");
613             }
614 0         0 $fta->term($feature_type);
615             } else {
616              
617 57 50       161 if($type =~ /^\D+:\d+$/){
618             #looks like an identifier
619 0         0 $fta->identifier($type)
620             } else {
621 57         145 $fta->name($type);
622             }
623             }
624              
625 57         7788 $feat->type($fta);
626              
627 57         100 my %attr = ();
628 57         117 chomp $attribute_string;
629              
630 57 50       137 unless ( $attribute_string eq '.' ) {
631 57         228 my @attributes = split ';', $attribute_string;
632 57         120 foreach my $attribute (@attributes){
633 93         247 my($key,$values) = split '=', $attribute;
634              
635             # remove leading and trailing quotes from values
636 93         222 $values =~ s/^["']//;
637 93         172 $values =~ s/["']$//; #' terminate the quote for emacs
638              
639 93         195 my @values = map{uri_unescape($_)} split ',', $values;
  159         580  
640              
641             #minor hack to allow for multiple instances of the same tag
642 93 100       862 if ($attr{$key}) {
643 3         5 my @tmparray = @{$attr{$key}};
  3         10  
644 3         9 push @tmparray, @values;
645 3         10 $attr{$key} = [@tmparray];
646             } else {
647 90         299 $attr{$key} = [@values];
648             }
649             }
650             }
651              
652             #Handle Dbxref attributes
653 57 100 66     234 if($attr{Dbxref} or $attr{dbxref}){
654 8         15 foreach my $value (@{ $attr{Dbxref} }, @{ $attr{dbxref} }){
  8         20  
  8         24  
655 71         2667 my $a = Bio::Annotation::DBLink->new();
656 71         2298 my($db,$accession) = $value =~ /^(.+?):(.+)$/;
657              
658 71 50 33     266 if(!$db or !$accession){ #dbxref malformed
659 0         0 $self->throw("Error in line:\n$feature_string\nDbxref value '$value' did not conform to GFF3 specification");
660 0         0 next;
661             }
662              
663 71         194 $a->database($db);
664 71         423 $a->primary_id($accession);
665 71         349 $feat->add_Annotation('Dbxref',$a);
666             }
667             }
668              
669             #Handle Ontology_term attributes
670 57 100       426 if($attr{Ontology_term}){
671 3         6 foreach my $id (@{ $attr{Ontology_term} }){
  3         8  
672 9         249 my $a = Bio::Annotation::OntologyTerm->new();
673              
674 9 50       252 if($self->validate()){
675 0         0 my $ont_name = Bio::Ontology::OntologyStore->guess_ontology($id);
676 0         0 my $ont = Bio::Ontology::OntologyStore->get_ontology($ont_name);
677 0         0 my($term) = $ont->find_terms(-identifier => $id);
678 0         0 $a->term($term);
679             } else {
680 9         24 $a->identifier($id);
681             }
682              
683 9         1017 $feat->add_Annotation('Ontology_term',$a);
684             }
685             }
686              
687             #Handle Gap attributes
688 57 100       239 if($attr{Gap}){
689 3         6 for my $value (@{ $attr{Gap} }) {
  3         8  
690 3         11 my $a = Bio::Annotation::SimpleValue->new();
691 3         69 $a->value($value);
692 3         25 $feat->add_Annotation('Gap',$a);
693             }
694             }
695              
696             #Handle Target attributes
697 57 100       198 if($attr{Target}){
698 3         10 my $target_collection = Bio::Annotation::Collection->new();
699              
700 3         137 foreach my $target_string (@{ $attr{Target} } ) {
  3         11  
701              
702             #only replace + for space if + has been used in place of it
703             #that is, + could also mean plus strand, and we don't want
704             #to accidentally remove it
705            
706             #presumably you can't use + for space and + for strand in the same string.
707 3 50       18 $target_string =~ s/\+/ /g unless $target_string =~ / /;
708              
709 3         26 my ($t_id,$tstart,$tend,$strand,$extra) = split /\s+/, $target_string;
710 3 50 33     19 if (!$tend || $extra) { # too much or too little stuff in the string
711 0         0 $self->throw("The value in the Target string, $target_string, does not conform to the GFF3 specification");
712             }
713              
714 3         40 my $a = Bio::Annotation::Target->new(
715             -target_id => $t_id,
716             -start => $tstart,
717             -end => $tend,
718             );
719              
720 3 50 33     634 if ($strand && $strand eq '+') {
    50 33        
721 0         0 $strand = 1;
722             } elsif ($strand && $strand eq '-') {
723 0         0 $strand = -1;
724             } else {
725 3         7 $strand = '';
726             }
727              
728 3 50       8 $a->strand($strand) if $strand;
729 3         9 $feat->add_Annotation('Target',$a);
730             }
731             }
732              
733             #Handle ID attribute. May only have one ID, throw error otherwise
734              
735 57 100       320 if($attr{ID}){
736 11 50       19 if(scalar( @{ $attr{ID} } ) > 1){
  11         42  
737 0         0 $self->throw("Error in line:\n$feature_string\nA feature may have at most one ID value");
738             }
739              
740             #ID's must be unique in the file
741 11 50 33     24 if ($self->{'allIDs'}->{${$attr{ID}}[0]} && $self->validate()) {
  11         48  
742 0         0 $self->throw("Validation Error: The ID ${$attr{ID}}[0] occurs more than once in the file, but should be unique");
  0         0  
743             }
744 11         24 $self->{'allIDs'}->{${$attr{ID}}[0]} = 1;
  11         34  
745              
746              
747 11         37 my $a = Bio::Annotation::SimpleValue->new();
748 11         256 $a->value( @{ $attr{ID} }[0] );
  11         55  
749 11         94 $feat->add_Annotation('ID',$a);
750             }
751              
752             #Handle Name attribute. May only have one Name, throw error otherwise
753 57 50       533 if($attr{Name}){
754 0 0       0 if(scalar( @{ $attr{Name} } ) > 1){
  0         0  
755 0         0 $self->throw("Error in line:\n$feature_string\nA feature may have at most one Name value");
756             }
757              
758 0         0 my $a = Bio::Annotation::SimpleValue->new();
759 0         0 $a->value( @{ $attr{Name} }[0] );
  0         0  
760 0         0 $feat->add_Annotation('Name',$a);
761             }
762              
763 57         152 foreach my $other_canonical (qw(Alias Parent Note Derives_from Index CRUD)){
764 342 100       2840 if($attr{$other_canonical}){
765 62         79 foreach my $value (@{ $attr{$other_canonical} }){
  62         112  
766 62         179 my $a = Bio::Annotation::SimpleValue->new();
767 62         1409 $a->value($value);
768 62         497 $feat->add_Annotation($other_canonical,$a);
769             }
770             }
771             }
772              
773 57         145 my @non_reserved_tags = grep {/^[a-z]/} keys %attr;
  98         299  
774 57         124 foreach my $non_reserved_tag (@non_reserved_tags) {
775 8 50       29 next if ($non_reserved_tag eq 'dbxref');
776 0         0 foreach my $value (@{ $attr{$non_reserved_tag} }){
  0         0  
777 0         0 $feat = $self->_handle_non_reserved_tag($feat,$non_reserved_tag,$value);
778             }
779             }
780              
781             my @illegal_tags = grep
782 90         791 {!/($RESERVED_TAGS)/}
783 57         111 grep {/^[A-Z]/} keys %attr;
  98         259  
784              
785 57 50       204 if (@illegal_tags > 0) {
786 0         0 my $tags = join(", ", @illegal_tags);
787 0         0 $self->throw("The following tag(s) are illegal and are causing this parser to die: $tags");
788             }
789              
790 57         399 return $feat;
791             }
792              
793             =head2 _handle_non_reserved_tag()
794              
795             Usage : $self->_handle_non_reserved_tag($feature,$tag,$value)
796             Function: Deal with non-reserved word tags in the ninth column
797             Returns : An updated Bio::SeqFeature::Annotated object
798             Args : A Bio::SeqFeature::Annotated and a tag/value pair
799              
800             Note that this method can be overridden in a subclass to provide
801             special handling of non-reserved word tags.
802              
803             =cut
804              
805             sub _handle_non_reserved_tag {
806 0     0     my $self = shift;
807 0           my ($feat,$tag,$value) = @_;
808              
809             # to customize through subclassing and overriding:
810             #if ($tag eq 'someTagOfInterest') {
811             # do something different
812             # else { do what is below
813              
814 0           my $a;
815 0 0         if ($tag eq 'comment') {
816 0           $a = Bio::Annotation::Comment->new();
817             }
818             else {
819 0           $a = Bio::Annotation::SimpleValue->new();
820             }
821 0           $a->value($value);
822 0           $feat->add_Annotation($tag,$a);
823            
824 0           return $feat;
825             }
826              
827             =head2 organism
828              
829             Gets/sets the organims from the organism directive
830              
831             =cut
832              
833             sub organism {
834 0     0 1   my $self = shift;
835 0 0         my $organism = shift if @_;
836 0 0         return $self->{'organism'} = $organism if defined($organism);
837 0           return $self->{'organism'};
838             }
839              
840              
841             =head1 _write_feature_1()
842              
843             write a feature in GFF v1 format. currently not implemented.
844              
845             =cut
846              
847             sub _write_feature_1 {
848 0     0     my($self,$feature) = @_;
849 0           $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
850             }
851              
852             =head1 _write_feature_2()
853              
854             write a feature in GFF v2 format. currently not implemented.
855              
856             =cut
857              
858             sub _write_feature_2 {
859 0     0     my($self,$feature) = @_;
860 0           $self->throw(sprintf("write_feature unimplemented for GFF version %s",$self->version));
861             }
862              
863             =head1 _write_feature_25()
864              
865             write a feature in GFF v2.5 (aka GTF) format.
866              
867             =cut
868              
869             sub _write_feature_25 {
870 0     0     my($self,$feature,$group) = @_;
871              
872             #the top-level feature is an aggregate of all subfeatures
873 0           my ($transcript_id, $gene_id) = (($feature->get_Annotations('transcript_id'))[0], ($feature->get_Annotations('gene_id'))[0]);
874 0 0         if(!defined($group)){
875 0           $group = ($feature->get_Annotations('ID'))[0];
876 0   0       $transcript_id ||= $group;
877 0   0       $gene_id ||= $group;
878             }
879            
880              
881 0 0         my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
882 0           my $source = $feature->source->value;
883 0           my $type = $feature->type->name;
884 0 0         $type = 'EXON' if $type eq 'exon'; #a GTF peculiarity, incosistent with the sequence ontology.
885 0   0       my $min = $feature->start || '.';
886 0   0       my $max = $feature->end || '.';
887 0 0         my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
    0          
888 0 0         my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : '.'; # score is optional
    0          
889 0 0         my $frame = defined($feature->frame) ? (ref($feature->frame) ? $feature->frame->value : $feature->frame) : (ref($feature->phase) ? $feature->phase->value : $feature->phase);
    0          
    0          
890              
891             #these are the only valid types in a GTF document
892 0 0 0       if($type eq 'EXON' or $type eq 'CDS' or $type eq 'start_codon' or $type eq 'stop_codon'){
      0        
      0        
893 0 0         my $attr = sprintf('gene_id "%s"; transcript_id "%s";',$gene_id ? $gene_id->value : '',$transcript_id ? $transcript_id->value : '');
    0          
894 0 0         my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
895             $seq,$source,$type,$min,$max,$score,$strand,$frame eq '.' ? 0 : $frame,$attr);
896              
897 0           $self->_print($outstring);
898             }
899              
900 0           foreach my $subfeat ($feature->get_SeqFeatures){
901 0           $self->_write_feature_25($subfeat,$group);
902             }
903             }
904              
905             =head1 _write_feature_3()
906              
907             write a feature in GFF v3 format.
908              
909             =cut
910              
911             sub _write_feature_3 {
912 0     0     my($self,$feature) = @_;
913 0 0         my $seq = ref($feature->seq_id) ? $feature->seq_id->value : $feature->seq_id;
914 0           my $source;
915 0 0         if ($feature->source()) {
916 0           $source = $feature->source->value;
917             }
918             else {
919 0   0       $source = $feature->source() || "unknownsource";
920             }
921 0           my $type;
922 0 0         if ($feature->type()) { $type = $feature->type->name; }
  0            
923 0           else { $type = "region"; }
924 0   0       my $min = $feature->start || '.';
925 0   0       my $max = $feature->end || '.';
926 0 0         my $strand = $feature->strand == 1 ? '+' : $feature->strand == -1 ? '-' : '.';
    0          
927 0 0         my $score = defined($feature->score) ? (ref($feature->score) ? $feature->score->value : $feature->score) : undef;
    0          
928 0 0         my $phase = defined($feature->phase) ? (ref($feature->phase) ? $feature->phase->value : $feature->phase) : undef;
    0          
929              
930 0           my @attr;
931 0 0         if(my @v = ($feature->get_Annotations('Name'))){
932 0           my $vstring = join ',', map {uri_escape($_->value)} @v;
  0            
933 0           push @attr, "Name=$vstring";
934             }
935 0 0         if(my @v = ($feature->get_Annotations('ID'))){
936 0           my $vstring = join ',', map {uri_escape($_->value)} @v;
  0            
937 0           push @attr, "ID=$vstring";
938 0 0         $self->throw('GFF3 features may have at most one ID, feature with these IDs is invalid:\n'.$vstring) if scalar(@v) > 1;
939             }
940 0 0         if(my @v = ($feature->get_Annotations('Parent'))){
941 0           my $vstring = join ',', map {uri_escape($_->value)} @v;
  0            
942 0           push @attr, "Parent=$vstring";
943             }
944 0 0         if(my @v = ($feature->get_Annotations('dblink'))){
945 0           my $vstring = join ',', map {uri_escape($_->database .':'. $_->primary_id)} @v;
  0            
946 0           push @attr, "Dbxref=$vstring";
947             }
948 0 0         if(my @v = ($feature->get_Annotations('ontology_term'))){
949 0           my $vstring = join ',', map {uri_escape($_->identifier)} @v;
  0            
950 0           push @attr, "Ontology_term=$vstring";
951             }
952 0 0         if(my @v = ($feature->get_Annotations('comment'))){
953 0           my $vstring = join ',', map {uri_escape($_->text)} @v;
  0            
954 0           push @attr, "Note=$vstring";
955             }
956 0 0         if(my @v = ($feature->get_Annotations('Target'))){
957 0           my %strand_map = ( 1=>'+', 0=>'', -1=>'-', '+' => '+', '-' => '-' );
958             my $vstring = join ',', map {
959 0 0         uri_escape($_->target_id).' '.$_->start.' '.$_->end.(defined $_->strand ? ' '.$strand_map{$_->strand} : '')
  0            
960             } @v;
961 0           push @attr, "Target=$vstring";
962             }
963              
964 0           my $attr = join ';', @attr;
965              
966 0           my $outstring = sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
967             $seq,$source,$type,$min,$max,$score,$strand,$phase,$attr);
968              
969 0           $self->_print($outstring);
970              
971 0           foreach my $subfeat ($feature->get_SeqFeatures){
972 0           $self->_write_feature_3($subfeat);
973             }
974             }
975              
976              
977              
978              
979             1;