File Coverage

Bio/Tools/GFF.pm
Criterion Covered Total %
statement 274 440 62.2
branch 116 264 43.9
condition 29 85 34.1
subroutine 24 35 68.5
pod 12 13 92.3
total 455 837 54.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::GFF
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by the Bioperl core team
7             #
8             # Copyright Matthew Pocock
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::GFF;
21              
22             # specify input via -fh or -file
23             my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
24             my $feature;
25             # loop over the input stream
26             while($feature = $gffio->next_feature()) {
27             # do something with feature
28             }
29             $gffio->close();
30              
31             # you can also obtain a GFF parser as a SeqAnalasisParserI in
32             # HT analysis pipelines (see Bio::SeqAnalysisParserI and
33             # Bio::Factory::SeqAnalysisParserFactory)
34             my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
35             my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
36             while($feature = $parser->next_feature()) {
37             # do something with feature
38             }
39              
40             =head1 DESCRIPTION
41              
42             This class provides a simple GFF parser and writer. In the sense of a
43             SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
44             objects, but is not in any way specific to a particular analysis
45             program and the output that program produces.
46              
47             That is, if you can get your analysis program spit out GFF, here is
48             your result parser.
49              
50             =head1 GFF3 AND SEQUENCE DATA
51              
52             GFF3 supports sequence data; see
53              
54             http://www.sequenceontology.org/gff3.shtml
55              
56             There are a number of ways to deal with this -
57              
58             If you call
59              
60             $gffio->ignore_sequence(1)
61              
62             prior to parsing the sequence data is ignored; this is useful if you
63             just want the features. It avoids the memory overhead in building and
64             caching sequences
65              
66             Alternatively, you can call either
67              
68             $gffio->get_seqs()
69              
70             Or
71              
72             $gffio->seq_id_by_h()
73              
74             At the B of parsing to get either a list or hashref of Bio::Seq
75             objects (see the documentation for each of these methods)
76              
77             Note that these objects will not have the features attached - you have
78             to do this yourself, OR call
79              
80             $gffio->features_attached_to_seqs(1)
81              
82             PRIOR to parsing; this will ensure that the Seqs have the features
83             attached; ie you will then be able to call
84              
85             $seq->get_SeqFeatures();
86              
87             And use Bio::SeqIO methods
88              
89             Note that auto-attaching the features to seqs will incur a higher
90             memory overhead as the features must be cached until the sequence data
91             is found
92              
93             =head1 TODO
94              
95             Make a Bio::SeqIO class specifically for GFF3 with sequence data
96              
97             =head1 FEEDBACK
98              
99             =head2 Mailing Lists
100              
101             User feedback is an integral part of the evolution of this and other
102             Bioperl modules. Send your comments and suggestions preferably to one
103             of the Bioperl mailing lists. Your participation is much appreciated.
104              
105             bioperl-l@bioperl.org - General discussion
106             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
107              
108             =head2 Support
109              
110             Please direct usage questions or support issues to the mailing list:
111              
112             I
113              
114             rather than to the module maintainer directly. Many experienced and
115             reponsive experts will be able look at the problem and quickly
116             address it. Please include a thorough description of the problem
117             with code and data examples if at all possible.
118              
119             =head2 Reporting Bugs
120              
121             Report bugs to the Bioperl bug tracking system to help us keep track
122             the bugs and their resolution. Bug reports can be submitted the web:
123              
124             https://github.com/bioperl/bioperl-live/issues
125              
126             =head1 AUTHOR - Matthew Pocock
127              
128             Email mrp-at-sanger.ac.uk
129              
130             =head1 CONTRIBUTORS
131              
132             Jason Stajich, jason-at-biperl-dot-org
133             Chris Mungall, cjm-at-fruitfly-dot-org
134             Steffen Grossmann [SG], grossman at molgen.mpg.de
135             Malcolm Cook, mec-at-stowers-institute.org
136              
137             =head1 APPENDIX
138              
139             The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
140              
141             =cut
142              
143             # Let the code begin...
144              
145             package Bio::Tools::GFF;
146              
147 146     146   1326 use vars qw($HAS_HTML_ENTITIES);
  146         182  
  146         5988  
148 146     146   533 use strict;
  146         173  
  146         2506  
149              
150 146     146   38664 use Bio::Seq::SeqFactory;
  146         232  
  146         3135  
151 146     146   32906 use Bio::LocatableSeq;
  146         229  
  146         3185  
152 146     146   1122 use Bio::SeqFeature::Generic;
  146         184  
  146         3770  
153              
154 146     146   467 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
  146         161  
  146         70489  
155              
156             my $i = 0;
157             my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
158              
159             # for skipping data that may be represented elsewhere; currently, this is
160             # only the score
161             my %SKIPPED_TAGS = map { $_ => 1 } qw(score);
162              
163              
164             =head2 new
165              
166             Title : new
167             Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
168             -file => "filename.gff");
169             or
170             my $writer = Bio::Tools::GFF->new(-gff_version => 3,
171             -file => ">filename.gff3");
172             Function: Creates a new instance. Recognized named parameters are -file, -fh,
173             and -gff_version.
174             Returns : a new object
175             Args : named parameters
176             -gff_version => [1,2,3]
177              
178             =cut
179              
180             { # make a class variable such that we can generate unique ID's over
181             # a session, no matter how many instances of GFF.pm we make
182             # since these have to be unique within the scope of a GFF file.
183            
184             my $gff3_featureID = 0;
185            
186             sub _incrementGFF3ID {
187 8     8   9 my ($self) = @_;
188 8         10 return ++ $gff3_featureID;
189             }
190             }
191              
192              
193             sub new {
194 16     16 1 632 my ($class, @args) = @_;
195 16         55 my $self = $class->SUPER::new(@args);
196            
197 16         54 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
198              
199             # initialize IO
200 16         65 $self->_initialize_io(@args);
201 16 50       67 $self->_parse_header() unless $noparse;
202              
203 16   50     27 $gff_version ||= 2;
204 16 50       40 if( ! $self->gff_version($gff_version) ) {
205 0         0 $self->throw("Can't build a GFF object with the unknown version ".
206             $gff_version);
207             }
208 16         28 $self->{'_first'} = 1;
209 16         38 return $self;
210             }
211              
212              
213             =head2 _parse_header
214              
215             Title : _parse_header
216             Usage : $gffio->_parse_header()
217             Function: used to turn parse GFF header lines. currently
218             produces Bio::LocatableSeq objects from ##sequence-region
219             lines
220             Returns : 1 on success
221             Args : none
222              
223              
224             =cut
225              
226             sub _parse_header{
227 16     16   21 my ($self) = @_;
228              
229 16         14 my @unhandled;
230 16         63 local $^W = 0; # hide warnings when we try and parse from a file opened
231             # for writing - there isn't really a better way to do
232             # AFAIK - cannot detech if a FH is read or write.
233 16         43 while(my $line = $self->_readline()){
234 46         31 my $handled = 0;
235 46 50       67 next if /^\s+$/;
236 46 100       143 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
    50          
    50          
    50          
    50          
    50          
237 42         86 my($seqid,$start,$end) = ($1,$2,$3);
238 42         29 push @{ $self->{'segments'} }, Bio::LocatableSeq->new(
  42         85  
239             -id => unescape($seqid),
240             -start => $start,
241             -end => $end,
242             -length => ($end - $start + 1), ## make the length explicit
243             );
244 42         51 $handled = 1;
245             } elsif($line =~ /^(\#\#feature-ontology)/) {
246             #to be implemented
247 0         0 $self->warn("$1 header tag parsing unimplemented");
248             } elsif($line =~ /^(\#\#attribute-ontology)/) {
249             #to be implemented
250 0         0 $self->warn("$1 header tag parsing unimplemented");
251             } elsif($line =~ /^(\#\#source-ontology)/) {
252             #to be implemented
253 0         0 $self->warn("$1 header tag parsing unimplemented");
254             } elsif($line =~ /^(\#\#\#)/) {
255             #to be implemented
256 0         0 $self->warn("$1 header tag parsing unimplemented");
257             } elsif($line =~ /^(\#\#FASTA)/) {
258             # initial ##FASTA is optional - artemis does not use it
259 0         0 $line = $self->_readline();
260 0 0       0 if ($line !~ /^\>(\S+)/) {
261 0         0 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
262             }
263             }
264            
265 46 50       79 if ($line =~ /^\>(.*)/) {
266             # seq data can be at header or footer
267 0         0 my $seq = $self->_parse_sequence($line);
268 0 0       0 if ($seq) {
269 0         0 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
270             }
271             }
272            
273              
274 46 100       63 if(!$handled){
275 4         4 push @unhandled, $line;
276             }
277              
278             #looks like the header is over!
279 46 100       146 last unless $line =~ /^\#/;
280             }
281              
282 16         30 foreach my $line (@unhandled){
283 4         15 $self->_pushback($line);
284             }
285              
286 16         33 return 1;
287             }
288              
289             sub _parse_sequence {
290 0     0   0 my ($self, $line) = @_;
291              
292 0 0       0 if ($line =~ /^\>(.*)/) {
293            
294 0         0 my $seqid = $1;
295 0         0 $seqid =~ s/\s+$//;
296 0         0 my $desc = '';
297 0 0       0 if ($seqid =~ /(\S+)\s+(.*)/) {
298 0         0 ($seqid, $desc) = ($1,$2);
299             }
300 0         0 my $res = '';
301 0         0 while (my $line = $self->_readline) {
302 0 0       0 if ($line =~ /^\#/) {
303 0         0 last;
304             }
305 0 0       0 if ($line =~ /^\>/) {
306 0         0 $self->_pushback($line);
307 0         0 last;
308             }
309 0         0 $line =~ s/\s//g;
310 0         0 $res .= $line;
311             }
312 0 0       0 return if $self->ignore_sequence;
313              
314 0         0 my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq');
315 0         0 my $seq = $seqfactory->create(-seq=>$res,
316             -id=>$seqid,
317             -desc=>$desc);
318 0         0 $seq->accession_number($seqid);
319 0 0       0 if ($self->features_attached_to_seqs) {
320             my @feats =
321 0         0 @{$self->_feature_idx_by_seq_id->{$seqid}};
  0         0  
322 0         0 $seq->add_SeqFeature($_) foreach @feats;
323 0         0 @{$self->_feature_idx_by_seq_id->{$seqid}} = ();
  0         0  
324             }
325 0         0 return $seq;
326             }
327             else {
328 0         0 $self->throw("expected fasta header, not: $line");
329             }
330             }
331              
332              
333             =head2 next_segment
334              
335             Title : next_segment
336             Usage : my $seq = $gffio->next_segment;
337             Function: Returns a Bio::LocatableSeq object corresponding to a
338             GFF "##sequence-region" header line.
339             Example :
340             Returns : A Bio::LocatableSeq object, or undef if
341             there are no more sequences.
342             Args : none
343              
344              
345             =cut
346              
347             sub next_segment{
348 1     1 1 2 my ($self,@args) = @_;
349 1 50       5 return shift @{ $self->{'segments'} } if defined $self->{'segments'};
  1         3  
350 0         0 return;
351             }
352              
353              
354             =head2 next_feature
355              
356             Title : next_feature
357             Usage : $seqfeature = $gffio->next_feature();
358             Function: Returns the next feature available in the input file or stream, or
359             undef if there are no more features.
360             Example :
361             Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
362             more features.
363             Args : none
364              
365             =cut
366              
367             sub next_feature {
368 4     4 1 184 my ($self) = @_;
369            
370 4         4 my $gff_string;
371            
372             # be graceful about empty lines or comments, and make sure we return undef
373             # if the input's consumed
374 4   66     11 while(($gff_string = $self->_readline()) && defined($gff_string)) {
375 3 50       8 if ($gff_string =~ /^\#\#\#/) {
376             # all forward refs have been seen; TODO
377             }
378 3 50 33     27 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
      33        
379             $gff_string =~ m{^//});
380              
381 3         7 while ($gff_string =~ /^\>(.+)/) {
382             # fasta can be in header or footer
383 0         0 my $seq = $self->_parse_sequence($gff_string);
384 0 0       0 if ($seq) {
385 0         0 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
386 0         0 $gff_string = $self->_readline;
387 0 0       0 last unless $gff_string;
388             }
389             }
390 3         3 last;
391             }
392 4 100       13 return unless $gff_string;
393              
394 3         13 my $feat = Bio::SeqFeature::Generic->new();
395 3         7 $self->from_gff_string($feat, $gff_string);
396              
397 3 50       9 if ($self->features_attached_to_seqs) {
398 0         0 push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
  0         0  
399             $feat);
400             }
401              
402 3         6 return $feat;
403             }
404              
405             sub _feature_idx_by_seq_id {
406 0     0   0 my $self = shift;
407 0 0       0 $self->{__feature_idx_by_seq_id} = shift if @_;
408             $self->{__feature_idx_by_seq_id} = {}
409 0 0       0 unless $self->{__feature_idx_by_seq_id};
410 0         0 return $self->{__feature_idx_by_seq_id};
411             }
412              
413              
414             =head2 from_gff_string
415              
416             Title : from_gff_string
417             Usage : $gff->from_gff_string($feature, $gff_string);
418             Function: Sets properties of a SeqFeatureI object from a GFF-formatted
419             string. Interpretation of the string depends on the version
420             that has been specified at initialization.
421              
422             This method is used by next_feature(). It actually dispatches to
423             one of the version-specific (private) methods.
424             Example :
425             Returns : void
426             Args : A Bio::SeqFeatureI implementing object to be initialized
427             The GFF-formatted string to initialize it from
428              
429             =cut
430              
431             sub from_gff_string {
432 3     3 1 4 my ($self, $feat, $gff_string) = @_;
433              
434 3 100       6 if($self->gff_version() == 1) {
    100          
435 1         3 return $self->_from_gff1_string($feat, $gff_string);
436             } elsif( $self->gff_version() == 3 ) {
437 1         3 return $self->_from_gff3_string($feat, $gff_string);
438             } else {
439 1         5 return $self->_from_gff2_string($feat, $gff_string);
440             }
441             }
442              
443              
444             =head2 _from_gff1_string
445              
446             Title : _from_gff1_string
447             Usage :
448             Function:
449             Example :
450             Returns : void
451             Args : A Bio::SeqFeatureI implementing object to be initialized
452             The GFF-formatted string to initialize it from
453              
454             =cut
455              
456             sub _from_gff1_string {
457 1     1   1 my ($gff, $feat, $string) = @_;
458 1         3 chomp $string;
459 1         4 my ($seqname, $source, $primary, $start, $end, $score,
460             $strand, $frame, @group) = split(/\t/, $string);
461              
462 1 50       3 if ( !defined $frame ) {
463 0         0 $feat->throw("[$string] does not look like GFF to me");
464             }
465 1 50       5 $frame = 0 unless( $frame =~ /^\d+$/);
466 1         3 $feat->seq_id($seqname);
467 1         2 $feat->source_tag($source);
468 1         2 $feat->primary_tag($primary);
469 1         2 $feat->start($start);
470 1         3 $feat->end($end);
471 1         5 $feat->frame($frame);
472 1 50       4 if ( $score eq '.' ) {
473             #$feat->score(undef);
474             } else {
475 1         3 $feat->score($score);
476             }
477 1 50       3 if ( $strand eq '-' ) { $feat->strand(-1); }
  1         3  
478 1 50       3 if ( $strand eq '+' ) { $feat->strand(1); }
  0         0  
479 1 50       3 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
480 1         4 foreach my $g ( @group ) {
481 0 0       0 if ( $g =~ /(\S+)=(\S+)/ ) {
482 0         0 my $tag = $1;
483 0         0 my $value = $2;
484 0         0 $feat->add_tag_value($1, $2);
485             } else {
486 0         0 $feat->add_tag_value('group', $g);
487             }
488             }
489             }
490              
491              
492             =head2 _from_gff2_string
493              
494             Title : _from_gff2_string
495             Usage :
496             Function:
497             Example :
498             Returns : void
499             Args : A Bio::SeqFeatureI implementing object to be initialized
500             The GFF2-formatted string to initialize it from
501              
502              
503             =cut
504              
505             sub _from_gff2_string {
506 1     1   2 my ($gff, $feat, $string) = @_;
507 1         2 chomp($string);
508              
509             # according to the Sanger website, GFF2 should be single-tab
510             # separated elements, and the free-text at the end should contain
511             # text-translated tab symbols but no "real" tabs, so splitting on
512             # \t is safe, and $attribs gets the entire attributes field to be
513             # parsed later
514            
515             # sendu: but the tag value pair can (should?) be separated by a tab. The
516             # 'no tabs' thing seems to apply only to the free text that is allowed for
517             # the value
518              
519 1         9 my ($seqname, $source, $primary, $start,
520             $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
521 1         3 my $attribs = join ' ', @attribs;
522            
523 1 50       3 if ( !defined $frame ) {
524 0         0 $feat->throw("[$string] does not look like GFF2 to me");
525             }
526 1         4 $feat->seq_id($seqname);
527 1         3 $feat->source_tag($source);
528 1         2 $feat->primary_tag($primary);
529 1         2 $feat->start($start);
530 1         3 $feat->end($end);
531 1         3 $feat->frame($frame);
532 1 50       3 if ( $score eq '.' ) {
533             # $feat->score(undef);
534             } else {
535 1         3 $feat->score($score);
536             }
537 1 50       3 if ( $strand eq '-' ) { $feat->strand(-1); }
  1         3  
538 1 50       4 if ( $strand eq '+' ) { $feat->strand(1); }
  0         0  
539 1 50       3 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
540              
541              
542             #
543             # this routine is necessay to allow the presence of semicolons in
544             # quoted text Semicolons are the delimiting character for new
545             # tag/value attributes. it is more or less a "state" machine, with
546             # the "quoted" flag going up and down as we pass thorugh quotes to
547             # distinguish free-text semicolon and hash symbols from GFF control
548             # characters
549            
550 1         2 my $flag = 0; # this could be changed to a bit and just be twiddled
551 1         1 my @parsed;
552              
553             # run through each character one at a time and check it
554             # NOTE: changed to foreach loop which is more efficient in perl
555             # --jasons
556 1         11 for my $a ( split //, $attribs ) {
557             # flag up on entering quoted text, down on leaving it
558 61 100 100     162 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
  2 100 33     5  
    100          
    50          
559 1         1 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
560 0         0 elsif( $a eq '#' && ! $flag ) { last }
561 61         56 push @parsed, $a;
562             }
563 1         6 $attribs = join "", @parsed; # rejoin into a single string
564              
565             #
566             # Please feel free to fix this and make it more "perlish"
567              
568 1         4 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
569              
570 1         2 foreach my $pair ( @key_vals ) {
571             # replace semicolons that were removed from free-text above.
572 3         6 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
573              
574             # separate the key from the value
575 3         11 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
576              
577 3 50       7 if( defined $values ) {
578 3         3 my @values;
579             # free text is quoted, so match each free-text block
580             # and remove it from the $values string
581 3         8 while ($values =~ s/"(.*?)"//){
582             # and push it on to the list of values (tags may have
583             # more than one value... and the value may be undef)
584 1         4 push @values, $1;
585             }
586              
587             # and what is left over should be space-separated
588             # non-free-text values
589              
590 3         7 my @othervals = split /\s+/, $values;
591 3         4 foreach my $othervalue(@othervals){
592             # get rid of any empty strings which might
593             # result from the split
594 2 50       5 if (CORE::length($othervalue) > 0) {push @values, $othervalue}
  2         4  
595             }
596              
597 3         2 foreach my $value(@values){
598 3         6 $feat->add_tag_value($key, $value);
599             }
600             }
601             }
602             }
603              
604              
605             sub _from_gff3_string {
606 1     1   3 my ($gff, $feat, $string) = @_;
607 1         3 chomp($string);
608              
609             # according to the now nearly final GFF3 spec, columns should
610             # be tab separated, allowing unescaped spaces to occur in
611             # column 9
612              
613 1         6 my ($seqname, $source, $primary, $start, $end,
614             $score, $strand, $frame, $groups) = split(/\t/, $string);
615            
616 1 50       3 if ( ! defined $frame ) {
617 0         0 $feat->throw("[$string] does not look like GFF3 to me");
618             }
619 1         2 $feat->seq_id($seqname);
620 1         2 $feat->source_tag($source);
621 1         3 $feat->primary_tag($primary);
622 1         3 $feat->start($start);
623 1         3 $feat->end($end);
624 1         3 $feat->frame($frame);
625 1 50       4 if ( $score eq '.' ) {
626             #$feat->score(undef);
627             } else {
628 0         0 $feat->score($score);
629             }
630 1 50       2 if ( $strand eq '-' ) { $feat->strand(-1); }
  0         0  
631 1 50       3 if ( $strand eq '+' ) { $feat->strand(1); }
  1         2  
632 1 50       3 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
633 1         8 my @groups = split(/\s*;\s*/, $groups);
634              
635 1         7 for my $group (@groups) {
636 3         5 my ($tag,$value) = split /=/,$group;
637 3         5 $tag = unescape($tag);
638 3         4 my @values = map {unescape($_)} split /,/,$value;
  3         4  
639 3         4 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
  3         4  
640             }
641             }
642              
643             # taken from Bio::DB::GFF
644             sub unescape {
645 48     48 0 40 my $v = shift;
646 48         41 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  2         8  
647 48         166 return $v;
648             }
649              
650              
651             =head2 write_feature
652              
653             Title : write_feature
654             Usage : $gffio->write_feature($feature);
655             Function: Writes the specified SeqFeatureI object in GFF format to the stream
656             associated with this instance.
657             Returns : none
658             Args : An array of Bio::SeqFeatureI implementing objects to be serialized
659              
660             =cut
661              
662             sub write_feature {
663 3     3 1 266 my ($self, @features) = @_;
664 3 50       9 return unless @features;
665 3 100 66     14 if( $self->{'_first'} && $self->gff_version() == 3 ) {
666 1         5 $self->_print("##gff-version 3\n");
667             }
668 3         4 $self->{'_first'} = 0;
669 3         7 foreach my $feature ( @features ) {
670 3         8 $self->_print($self->gff_string($feature)."\n");
671             }
672             }
673              
674              
675             =head2 gff_string
676              
677             Title : gff_string
678             Usage : $gffstr = $gffio->gff_string($feature);
679             Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
680             The formatting depends on the version specified at initialization.
681              
682             This method is used by write_feature(). It actually dispatches to
683             one of the version-specific (private) methods.
684             Example :
685             Returns : A GFF-formatted string representation of the SeqFeature
686             Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
687              
688             =cut
689              
690             sub gff_string{
691 11     11 1 11 my ($self, $feature) = @_;
692              
693 11 100       19 if($self->gff_version() == 1) {
    100          
    50          
694 1         4 return $self->_gff1_string($feature);
695             } elsif( $self->gff_version() == 3 ) {
696 8         17 return $self->_gff3_string($feature);
697             } elsif( $self->gff_version() == 2.5 ) {
698 0         0 return $self->_gff25_string($feature);
699             } else {
700 2         6 return $self->_gff2_string($feature);
701             }
702             }
703              
704              
705             =head2 _gff1_string
706              
707             Title : _gff1_string
708             Usage : $gffstr = $gffio->_gff1_string
709             Function:
710             Example :
711             Returns : A GFF1-formatted string representation of the SeqFeature
712             Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
713              
714             =cut
715              
716             sub _gff1_string{
717 1     1   3 my ($gff, $feat) = @_;
718 1         2 my ($str,$score,$frame,$name,$strand);
719              
720 1 50       21 if( $feat->can('score') ) {
721 1         5 $score = $feat->score();
722             }
723 1 50       5 $score = '.' unless defined $score;
724              
725 1 50       6 if( $feat->can('frame') ) {
726 1         5 $frame = $feat->frame();
727             }
728 1 50       4 $frame = '.' unless defined $frame;
729              
730 1         3 $strand = $feat->strand();
731 1 50       6 if(! $strand) {
    50          
    50          
732 0         0 $strand = ".";
733             } elsif( $strand == 1 ) {
734 0         0 $strand = '+';
735             } elsif ( $feat->strand == -1 ) {
736 1         2 $strand = '-';
737             }
738            
739 1 50       5 if( $feat->can('seqname') ) {
740 1         3 $name = $feat->seq_id();
741 1   50     5 $name ||= 'SEQ';
742             } else {
743 0         0 $name = 'SEQ';
744             }
745              
746 1         3 $str = join("\t",
747             $name,
748             $feat->source_tag,
749             $feat->primary_tag,
750             $feat->start,
751             $feat->end,
752             $score,
753             $strand,
754             $frame);
755              
756 1         4 foreach my $tag ( $feat->get_all_tags ) {
757 4 100       8 next if exists $SKIPPED_TAGS{$tag};
758 3         5 foreach my $value ( $feat->get_tag_values($tag) ) {
759 3 50       12 $str .= " $tag=$value" if $value;
760             }
761             }
762              
763 1         11 return $str;
764             }
765              
766              
767             =head2 _gff2_string
768              
769             Title : _gff2_string
770             Usage : $gffstr = $gffio->_gff2_string
771             Function:
772             Example :
773             Returns : A GFF2-formatted string representation of the SeqFeature
774             Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
775              
776             =cut
777              
778             sub _gff2_string{
779 2     2   4 my ($gff, $origfeat) = @_;
780 2         2 my $feat;
781 2 50       22 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
782 0         0 $feat = $origfeat->feature2;
783             } else {
784 2         5 $feat = $origfeat;
785             }
786 2         3 my ($str1, $str2,$score,$frame,$name,$strand);
787              
788 2 50       140 if( $feat->can('score') ) {
789 2         6 $score = $feat->score();
790             }
791 2 100       6 $score = '.' unless defined $score;
792              
793 2 50       9 if( $feat->can('frame') ) {
794 2         5 $frame = $feat->frame();
795             }
796 2 50       5 $frame = '.' unless defined $frame;
797              
798 2         4 $strand = $feat->strand();
799 2 50       20 if(! $strand) {
    100          
    50          
800 0         0 $strand = ".";
801             } elsif( $strand == 1 ) {
802 1         2 $strand = '+';
803             } elsif ( $feat->strand == -1 ) {
804 1         2 $strand = '-';
805             }
806              
807 2 50       9 if( $feat->can('seqname') ) {
808 2         5 $name = $feat->seq_id();
809             }
810 2   50     8 $name ||= 'SEQ';
811              
812 2         5 $str1 = join("\t",
813             $name,
814             $feat->source_tag(),
815             $feat->primary_tag(),
816             $feat->start(),
817             $feat->end(),
818             $score,
819             $strand,
820             $frame);
821             # the routine below is the only modification I made to the original
822             # ->gff_string routine (above) as on November 17th, 2000, the
823             # Sanger webpage describing GFF2 format reads: "From version 2
824             # onwards, the attribute field must have a tag value structure
825             # following the syntax used within objects in a .ace file,
826             # flattened onto one line by semicolon separators. Tags must be
827             # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
828             # must be quoted with double quotes".
829             # MW
830            
831 2         4 my @group;
832              
833 2         6 foreach my $tag ( $feat->get_all_tags ) {
834 7 100       13 next if exists $SKIPPED_TAGS{$tag};
835 6         6 my @v;
836 6         9 foreach my $value ( $feat->get_tag_values($tag) ) {
837 6 50 33     35 unless( defined $value && length($value) ) {
    100          
838             # quote anything other than valid tag/value characters
839 0         0 $value = '""';
840             } elsif ($value =~ /[^A-Za-z0-9_]/){
841             # substitute tab and newline chars by their UNIX equivalent
842 1         2 $value =~ s/\t/\\t/g;
843 1         3 $value =~ s/\n/\\n/g;
844 1         2 $value = '"' . $value . '" ';
845             }
846 6         9 push @v, $value;
847             # for this tag (allowed in GFF2 and .ace format)
848             }
849 6         15 push @group, "$tag ".join(" ", @v);
850             }
851              
852 2         6 $str2 .= join(' ; ', @group);
853             # Add Target information for Feature Pairs
854 2 50 33     4 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
      33        
855             ! $feat->has_tag('Group') &&
856             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
857 0 0       0 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
    0          
858             ( $origfeat->feature1->strand < 0 ?
859             ( $origfeat->feature1->end,
860             $origfeat->feature1->start) :
861             ( $origfeat->feature1->start,
862             $origfeat->feature1->end)
863             )) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
864             }
865 2         13 return $str1."\t".$str2;
866             }
867              
868              
869             =head2 _gff25_string
870              
871             Title : _gff25_string
872             Usage : $gffstr = $gffio->_gff2_string
873             Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
874             Example :
875             Returns : A GFF2.5-formatted string representation of the SeqFeature
876             Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
877              
878             =cut
879              
880             sub _gff25_string {
881 0     0   0 my ($gff, $origfeat) = @_;
882 0         0 my $feat;
883 0 0       0 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
884 0         0 $feat = $origfeat->feature2;
885             } else {
886 0         0 $feat = $origfeat;
887             }
888 0         0 my ($str1, $str2,$score,$frame,$name,$strand);
889              
890 0 0       0 if( $feat->can('score') ) {
891 0         0 $score = $feat->score();
892             }
893 0 0       0 $score = '.' unless defined $score;
894              
895 0 0       0 if( $feat->can('frame') ) {
896 0         0 $frame = $feat->frame();
897             }
898 0 0       0 $frame = '.' unless defined $frame;
899              
900 0         0 $strand = $feat->strand();
901 0 0       0 if(! $strand) {
    0          
    0          
902 0         0 $strand = ".";
903             } elsif( $strand == 1 ) {
904 0         0 $strand = '+';
905             } elsif ( $feat->strand == -1 ) {
906 0         0 $strand = '-';
907             }
908              
909 0 0       0 if( $feat->can('seqname') ) {
910 0         0 $name = $feat->seq_id();
911 0   0     0 $name ||= 'SEQ';
912             } else {
913 0         0 $name = 'SEQ';
914             }
915 0         0 $str1 = join("\t",
916             $name,
917             $feat->source_tag(),
918             $feat->primary_tag(),
919             $feat->start(),
920             $feat->end(),
921             $score,
922             $strand,
923             $frame);
924              
925 0         0 my @all_tags = $feat->all_tags;
926 0         0 my @group; my @firstgroup;
927 0 0       0 if (@all_tags) { # only play this game if it is worth playing...
928 0         0 foreach my $tag ( @all_tags ) {
929 0         0 my @v;
930 0         0 foreach my $value ( $feat->get_tag_values($tag) ) {
931 0 0       0 next if exists $SKIPPED_TAGS{$tag};
932 0 0 0     0 unless( defined $value && length($value) ) {
    0          
933 0         0 $value = '""';
934             } elsif ($value =~ /[^A-Za-z0-9_]/){
935 0         0 $value =~ s/\t/\\t/g; # substitute tab and newline
936             # characters
937 0         0 $value =~ s/\n/\\n/g; # to their UNIX equivalents
938 0         0 $value = '"' . $value . '" ';
939             } # if the value contains
940             # anything other than valid
941             # tag/value characters, then
942             # quote it
943 0         0 push @v, $value;
944             # for this tag (allowed in GFF2 and .ace format)
945             }
946 0 0 0     0 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
947 0         0 push @firstgroup, "$tag ".join(" ", @v);
948             } else {
949 0         0 push @group, "$tag ".join(" ", @v);
950             }
951             }
952             }
953 0         0 $str2 = join(' ; ', (@firstgroup, @group));
954             # Add Target information for Feature Pairs
955 0 0 0     0 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
      0        
956             ! $feat->has_tag('Group') &&
957             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
958 0 0       0 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
    0          
959             ( $origfeat->feature1->strand < 0 ?
960             ( $origfeat->feature1->end,
961             $origfeat->feature1->start) :
962             ( $origfeat->feature1->start,
963             $origfeat->feature1->end)
964             )) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
965             }
966 0         0 return $str1 . "\t". $str2;
967             }
968              
969              
970             =head2 _gff3_string
971              
972             Title : _gff3_string
973             Usage : $gffstr = $gffio->_gff3_string
974             Function:
975             Example :
976             Returns : A GFF3-formatted string representation of the SeqFeature
977             Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
978              
979             =cut
980              
981             sub _gff3_string {
982 8     8   8 my ($gff, $origfeat) = @_;
983 8         7 my $feat;
984 8 50       38 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
985 0         0 $feat = $origfeat->feature2;
986             } else {
987 8         9 $feat = $origfeat;
988             }
989              
990 8         12 my $ID = $gff->_incrementGFF3ID();
991              
992 8         8 my ($score,$frame,$name,$strand);
993              
994 8 50       24 if( $feat->can('score') ) {
995 8         14 $score = $feat->score();
996             }
997 8 100       16 $score = '.' unless defined $score;
998              
999 8 50       19 if( $feat->can('frame') ) {
1000 8         14 $frame = $feat->frame();
1001             }
1002 8 100       12 $frame = '1' unless defined $frame;
1003              
1004 8         16 $strand = $feat->strand();
1005              
1006 8 50       19 if(! $strand) {
    50          
    0          
1007 0         0 $strand = ".";
1008             } elsif( $strand == 1 ) {
1009 8         8 $strand = '+';
1010             } elsif ( $feat->strand == -1 ) {
1011 0         0 $strand = '-';
1012             }
1013              
1014 8 50       19 if( $feat->can('seqname') ) {
1015 8         14 $name = $feat->seq_id();
1016 8   50     18 $name ||= 'SEQ';
1017             } else {
1018 0         0 $name = 'SEQ';
1019             }
1020              
1021 8         7 my @groups;
1022              
1023             # force leading ID and Parent tags
1024 8         21 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
  36         46  
1025 8         25 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
  17         27  
1026             keys %GFF3_ID_Tags ) {
1027 24 100       32 unshift @all_tags, $t if $feat->has_tag($t);
1028             }
1029              
1030 8         12 for my $tag ( @all_tags ) {
1031 36 100       56 next if exists $SKIPPED_TAGS{$tag};
1032             # next if $tag eq 'Target';
1033 29 100 66     82 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1034 7         14 my @values = $feat->get_tag_values($tag);
1035 7 50       20 if(scalar(@values) > 1){ # How is it possible that Target is has a value list ??
1036             # simple Target,start,stop
1037 0         0 my ($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1038 0 0 0     0 next unless(defined($e) && defined($b) && $target_id);
      0        
1039 0 0 0     0 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1040             #if we have the strand we will print it
1041 0 0       0 if($strand){ push @groups, sprintf("Target=%s %d %d %s", $target_id,$b,$e,$strand); }
  0         0  
1042 0         0 else{ push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e); }
1043 0         0 next;
1044             }
1045             }
1046              
1047 29         30 my $valuestr;
1048             # a string which will hold one or more values
1049             # for this tag, with quoted free text and
1050             # space-separated individual values.
1051             my @v;
1052 29         41 for my $value ( $feat->get_tag_values($tag) ) {
1053 29 100 66     88 if( defined $value && length($value) ) {
1054             #$value =~ tr/ /+/; #spaces are allowed now
1055 27 50       35 if ( ref $value eq 'Bio::Annotation::Comment') {
1056 0         0 $value = $value->text;
1057             }
1058              
1059 27 100       57 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1060 8         12 $value =~ s/\t/\\t/g; # substitute tab and newline
1061             # characters
1062 8         11 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1063              
1064             # Unescaped quotes are not allowed in GFF3
1065             # $value = '"' . $value . '"';
1066             }
1067 27         33 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
  2         11  
1068             } else {
1069             # if it is completely empty, then just make empty double quotes
1070 2         2 $value = '""';
1071             }
1072 29         35 push @v, $value;
1073             }
1074             # can we figure out how to improve this?
1075 29 100       87 $tag = lcfirst($tag) unless ( $tag =~
1076             /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1077              
1078 29         65 push @groups, "$tag=".join(",",@v);
1079             }
1080             # Add Target information for Feature Pairs
1081 8 50 66     14 if( $feat->has_tag('Target') &&
      66        
1082             ! $feat->has_tag('Group') &&
1083             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1084              
1085 0         0 my $target_id = $origfeat->feature1->seq_id;
1086 0         0 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
  0         0  
1087              
1088 0 0       0 push @groups, sprintf("Target=%s %d %d",
1089             $target_id,
1090             ( $origfeat->feature1->strand < 0 ?
1091             ( $origfeat->feature1->end,
1092             $origfeat->feature1->start) :
1093             ( $origfeat->feature1->start,
1094             $origfeat->feature1->end)
1095             ));
1096             }
1097              
1098             # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1099 8 50 33     48 if ( $feat->can('name') && defined($feat->name) ) {
1100             # such as might be for Bio::DB::SeqFeature
1101 0         0 unshift @groups, 'Name=' . $feat->name;
1102             }
1103              
1104 8         9 my $gff_string = "";
1105 8 50       14 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1106 0         0 my @locs = $feat->location->each_Location;
1107 0         0 foreach my $loc (@locs) {
1108 0   0     0 $gff_string .= join("\t",
1109             $name,
1110             $feat->source_tag() || '.',
1111             $feat->primary_tag(),
1112             $loc->start(),
1113             $loc->end(),
1114             $score,
1115             $strand,
1116             $frame,
1117             join(';', @groups)) . "\n";
1118             }
1119 0         0 chop $gff_string;
1120 0         0 return $gff_string;
1121             } else {
1122 8   50     16 $gff_string = join("\t",
1123             $name,
1124             $feat->source_tag() || '.',
1125             $feat->primary_tag(),
1126             $feat->start(),
1127             $feat->end(),
1128             $score,
1129             $strand,
1130             $frame,
1131             join(';', @groups));
1132             }
1133 8         56 return $gff_string;
1134             }
1135              
1136              
1137             =head2 gff_version
1138              
1139             Title : _gff_version
1140             Usage : $gffversion = $gffio->gff_version
1141             Function:
1142             Example :
1143             Returns : The GFF version this parser will accept and emit.
1144             Args : none
1145              
1146             =cut
1147              
1148             sub gff_version {
1149 47     47 1 35 my ($self, $value) = @_;
1150 47 100 66     90 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
  64         106  
1151 16         23 $self->{'GFF_VERSION'} = $value;
1152             }
1153 47         111 return $self->{'GFF_VERSION'};
1154             }
1155              
1156              
1157             # Make filehandles
1158              
1159             =head2 newFh
1160              
1161             Title : newFh
1162             Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1163             Function: does a new() followed by an fh()
1164             Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1165             $feature = <$fh>; # read a feature object
1166             print $fh $feature; # write a feature object
1167             Returns : filehandle tied to the Bio::Tools::GFF class
1168             Args :
1169              
1170             =cut
1171              
1172             sub newFh {
1173 0     0 1 0 my $class = shift;
1174 0 0       0 return unless my $self = $class->new(@_);
1175 0         0 return $self->fh;
1176             }
1177              
1178              
1179             =head2 fh
1180              
1181             Title : fh
1182             Usage : $obj->fh
1183             Function:
1184             Example : $fh = $obj->fh; # make a tied filehandle
1185             $feature = <$fh>; # read a feature object
1186             print $fh $feature; # write a feature object
1187             Returns : filehandle tied to Bio::Tools::GFF class
1188             Args : none
1189              
1190             =cut
1191              
1192              
1193             sub fh {
1194 0     0 1 0 my $self = shift;
1195 0   0     0 my $class = ref($self) || $self;
1196 0         0 my $s = Symbol::gensym;
1197 0         0 tie $$s,$class,$self;
1198 0         0 return $s;
1199             }
1200              
1201             # This accessor is used for accessing the Bio::Seq objects from a GFF3
1202             # file; if the file you are using has no sequence data you can ignore
1203             # this accessor
1204              
1205             # This accessor returns a hash reference containing Bio::Seq objects,
1206             # indexed by Bio::Seq->primary_id
1207              
1208             sub _seq_by_id_h {
1209 0     0   0 my $self = shift;
1210              
1211 0 0       0 return $self->{'_seq_by_id_h'} = shift if @_;
1212             $self->{'_seq_by_id_h'} = {}
1213 0 0       0 unless $self->{'_seq_by_id_h'};
1214 0         0 return $self->{'_seq_by_id_h'};
1215             }
1216              
1217              
1218             =head2 get_seqs
1219              
1220             Title : get_seqs
1221             Usage :
1222             Function: Returns all Bio::Seq objects populated by GFF3 file
1223             Example :
1224             Returns :
1225             Args :
1226              
1227             =cut
1228              
1229             sub get_seqs {
1230 0     0 1 0 my ($self,@args) = @_;
1231 0         0 return values %{$self->_seq_by_id_h};
  0         0  
1232             }
1233              
1234              
1235             =head2 features_attached_to_seqs
1236              
1237             Title : features_attached_to_seqs
1238             Usage : $obj->features_attached_to_seqs(1);
1239             Function: For use with GFF3 containg sequence only
1240              
1241             Setting this B parsing ensures that all Bio::Seq object
1242             created will have the appropriate features added to them
1243              
1244             defaults to false (off)
1245              
1246             Note that this mode will incur higher memory usage because features
1247             will have to be cached until the relevant feature comes along
1248              
1249             Example :
1250             Returns : value of features_attached_to_seqs (a boolean)
1251             Args : on set, new value (a boolean, optional)
1252              
1253              
1254             =cut
1255              
1256             sub features_attached_to_seqs{
1257 3     3 1 4 my $self = shift;
1258              
1259 3 50       6 return $self->{'_features_attached_to_seqs'} = shift if @_;
1260 3         7 return $self->{'_features_attached_to_seqs'};
1261             }
1262              
1263              
1264             =head2 ignore_sequence
1265              
1266             Title : ignore_sequence
1267             Usage : $obj->ignore_sequence(1);
1268             Function: For use with GFF3 containg sequence only
1269              
1270             Setting this B parsing means that all sequence data will be
1271             ignored
1272              
1273             Example :
1274             Returns : value of ignore_sequence (a boolean)
1275             Args : on set, new value (a boolean, optional)
1276              
1277             =cut
1278              
1279             sub ignore_sequence{
1280 0     0 1 0 my $self = shift;
1281              
1282 0 0       0 return $self->{'_ignore_sequence'} = shift if @_;
1283 0         0 return $self->{'_ignore_sequence'};
1284             }
1285              
1286              
1287             sub DESTROY {
1288 15     15   22 my $self = shift;
1289 15         33 $self->close();
1290             }
1291              
1292             sub TIEHANDLE {
1293 0     0     my ($class,$val) = @_;
1294 0           return bless {'gffio' => $val}, $class;
1295             }
1296              
1297             sub READLINE {
1298 0     0     my $self = shift;
1299 0 0 0       return $self->{'gffio'}->next_feature() || undef unless wantarray;
1300 0           my (@list, $obj);
1301 0           push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1302 0           return @list;
1303             }
1304              
1305             sub PRINT {
1306 0     0     my $self = shift;
1307 0           $self->{'gffio'}->write_feature(@_);
1308             }
1309              
1310             1;
1311