File Coverage

Bio/Tools/GFF.pm
Criterion Covered Total %
statement 275 439 62.6
branch 115 260 44.2
condition 31 85 36.4
subroutine 24 35 68.5
pod 12 13 92.3
total 457 832 54.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::GFF
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by the Bioperl core team
7             #
8             # Copyright Matthew Pocock
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::GFF;
21              
22             # specify input via -fh or -file
23             my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
24             my $feature;
25             # loop over the input stream
26             while($feature = $gffio->next_feature()) {
27             # do something with feature
28             }
29             $gffio->close();
30              
31             # you can also obtain a GFF parser as a SeqAnalasisParserI in
32             # HT analysis pipelines (see Bio::SeqAnalysisParserI and
33             # Bio::Factory::SeqAnalysisParserFactory)
34             my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
35             my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
36             while($feature = $parser->next_feature()) {
37             # do something with feature
38             }
39              
40             =head1 DESCRIPTION
41              
42             This class provides a simple GFF parser and writer. In the sense of a
43             SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
44             objects, but is not in any way specific to a particular analysis
45             program and the output that program produces.
46              
47             That is, if you can get your analysis program spit out GFF, here is
48             your result parser.
49              
50             =head1 GFF3 AND SEQUENCE DATA
51              
52             GFF3 supports sequence data; see
53              
54             http://www.sequenceontology.org/gff3.shtml
55              
56             There are a number of ways to deal with this -
57              
58             If you call
59              
60             $gffio->ignore_sequence(1)
61              
62             prior to parsing the sequence data is ignored; this is useful if you
63             just want the features. It avoids the memory overhead in building and
64             caching sequences
65              
66             Alternatively, you can call either
67              
68             $gffio->get_seqs()
69              
70             Or
71              
72             $gffio->seq_id_by_h()
73              
74             At the B of parsing to get either a list or hashref of Bio::Seq
75             objects (see the documentation for each of these methods)
76              
77             Note that these objects will not have the features attached - you have
78             to do this yourself, OR call
79              
80             $gffio->features_attached_to_seqs(1)
81              
82             PRIOR to parsing; this will ensure that the Seqs have the features
83             attached; ie you will then be able to call
84              
85             $seq->get_SeqFeatures();
86              
87             And use Bio::SeqIO methods
88              
89             Note that auto-attaching the features to seqs will incur a higher
90             memory overhead as the features must be cached until the sequence data
91             is found
92              
93             =head1 TODO
94              
95             Make a Bio::SeqIO class specifically for GFF3 with sequence data
96              
97             =head1 FEEDBACK
98              
99             =head2 Mailing Lists
100              
101             User feedback is an integral part of the evolution of this and other
102             Bioperl modules. Send your comments and suggestions preferably to one
103             of the Bioperl mailing lists. Your participation is much appreciated.
104              
105             bioperl-l@bioperl.org - General discussion
106             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
107              
108             =head2 Support
109              
110             Please direct usage questions or support issues to the mailing list:
111              
112             I
113              
114             rather than to the module maintainer directly. Many experienced and
115             reponsive experts will be able look at the problem and quickly
116             address it. Please include a thorough description of the problem
117             with code and data examples if at all possible.
118              
119             =head2 Reporting Bugs
120              
121             Report bugs to the Bioperl bug tracking system to help us keep track
122             the bugs and their resolution. Bug reports can be submitted the web:
123              
124             https://github.com/bioperl/bioperl-live/issues
125              
126             =head1 AUTHOR - Matthew Pocock
127              
128             Email mrp-at-sanger.ac.uk
129              
130             =head1 CONTRIBUTORS
131              
132             Jason Stajich, jason-at-biperl-dot-org
133             Chris Mungall, cjm-at-fruitfly-dot-org
134             Steffen Grossmann [SG], grossman at molgen.mpg.de
135             Malcolm Cook, mec-at-stowers-institute.org
136              
137             =head1 APPENDIX
138              
139             The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
140              
141             =cut
142              
143             # Let the code begin...
144              
145             package Bio::Tools::GFF;
146              
147 146     146   998 use vars qw($HAS_HTML_ENTITIES);
  146         174  
  146         5833  
148 146     146   503 use strict;
  146         159  
  146         2354  
149              
150 146     146   38297 use Bio::Seq::SeqFactory;
  146         207  
  146         3083  
151 146     146   32111 use Bio::LocatableSeq;
  146         225  
  146         3183  
152 146     146   959 use Bio::SeqFeature::Generic;
  146         181  
  146         3034  
153              
154 146     146   469 use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
  146         162  
  146         68642  
155              
156             my $i = 0;
157             my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target);
158              
159             # for skipping data that may be represented elsewhere; currently, this is
160             # only the score
161             my %SKIPPED_TAGS = map { $_ => 1 } qw(score);
162              
163              
164             =head2 new
165              
166             Title : new
167             Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2,
168             -file => "filename.gff");
169             or
170             my $writer = Bio::Tools::GFF->new(-gff_version => 3,
171             -file => ">filename.gff3");
172             Function: Creates a new instance. Recognized named parameters are -file, -fh,
173             and -gff_version.
174             Returns : a new object
175             Args : named parameters
176             -gff_version => [1,2,3]
177              
178             =cut
179              
180             { # make a class variable such that we can generate unique ID's over
181             # a session, no matter how many instances of GFF.pm we make
182             # since these have to be unique within the scope of a GFF file.
183            
184             my $gff3_featureID = 0;
185            
186             sub _incrementGFF3ID {
187 8     8   7 my ($self) = @_;
188 8         11 return ++ $gff3_featureID;
189             }
190             }
191              
192              
193             sub new {
194 16     16 1 514 my ($class, @args) = @_;
195 16         52 my $self = $class->SUPER::new(@args);
196            
197 16         50 my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args);
198              
199             # initialize IO
200 16         54 $self->_initialize_io(@args);
201 16 50       48 $self->_parse_header() unless $noparse;
202              
203 16   50     27 $gff_version ||= 2;
204 16 50       33 if( ! $self->gff_version($gff_version) ) {
205 0         0 $self->throw("Can't build a GFF object with the unknown version ".
206             $gff_version);
207             }
208 16         20 $self->{'_first'} = 1;
209 16         36 return $self;
210             }
211              
212              
213             =head2 _parse_header
214              
215             Title : _parse_header
216             Usage : $gffio->_parse_header()
217             Function: used to turn parse GFF header lines. currently
218             produces Bio::LocatableSeq objects from ##sequence-region
219             lines
220             Returns : 1 on success
221             Args : none
222              
223              
224             =cut
225              
226             sub _parse_header{
227 16     16   16 my ($self) = @_;
228              
229 16         15 my @unhandled;
230 16         55 local $^W = 0; # hide warnings when we try and parse from a file opened
231             # for writing - there isn't really a better way to do
232             # AFAIK - cannot detech if a FH is read or write.
233 16         41 while(my $line = $self->_readline()){
234 46         35 my $handled = 0;
235 46 50       63 next if /^\s+$/;
236 46 100       131 if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){
    50          
    50          
    50          
    50          
    50          
237 42         93 my($seqid,$start,$end) = ($1,$2,$3);
238 42         23 push @{ $self->{'segments'} }, Bio::LocatableSeq->new(
  42         83  
239             -id => unescape($seqid),
240             -start => $start,
241             -end => $end,
242             -length => ($end - $start + 1), ## make the length explicit
243             );
244 42         48 $handled = 1;
245             } elsif($line =~ /^(\#\#feature-ontology)/) {
246             #to be implemented
247 0         0 $self->warn("$1 header tag parsing unimplemented");
248             } elsif($line =~ /^(\#\#attribute-ontology)/) {
249             #to be implemented
250 0         0 $self->warn("$1 header tag parsing unimplemented");
251             } elsif($line =~ /^(\#\#source-ontology)/) {
252             #to be implemented
253 0         0 $self->warn("$1 header tag parsing unimplemented");
254             } elsif($line =~ /^(\#\#\#)/) {
255             #to be implemented
256 0         0 $self->warn("$1 header tag parsing unimplemented");
257             } elsif($line =~ /^(\#\#FASTA)/) {
258             # initial ##FASTA is optional - artemis does not use it
259 0         0 $line = $self->_readline();
260 0 0       0 if ($line !~ /^\>(\S+)/) {
261 0         0 $self->throw("##FASTA directive must be followed by fasta header, not: $line");
262             }
263             }
264            
265 46 50       77 if ($line =~ /^\>(.*)/) {
266             # seq data can be at header or footer
267 0         0 my $seq = $self->_parse_sequence($line);
268 0 0       0 if ($seq) {
269 0         0 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
270             }
271             }
272            
273              
274 46 100       55 if(!$handled){
275 4         5 push @unhandled, $line;
276             }
277              
278             #looks like the header is over!
279 46 100       148 last unless $line =~ /^\#/;
280             }
281              
282 16         30 foreach my $line (@unhandled){
283 4         13 $self->_pushback($line);
284             }
285              
286 16         29 return 1;
287             }
288              
289             sub _parse_sequence {
290 0     0   0 my ($self, $line) = @_;
291              
292 0 0       0 if ($line =~ /^\>(.*)/) {
293            
294 0         0 my $seqid = $1;
295 0         0 $seqid =~ s/\s+$//;
296 0         0 my $desc = '';
297 0 0       0 if ($seqid =~ /(\S+)\s+(.*)/) {
298 0         0 ($seqid, $desc) = ($1,$2);
299             }
300 0         0 my $res = '';
301 0         0 while (my $line = $self->_readline) {
302 0 0       0 if ($line =~ /^\#/) {
303 0         0 last;
304             }
305 0 0       0 if ($line =~ /^\>/) {
306 0         0 $self->_pushback($line);
307 0         0 last;
308             }
309 0         0 $line =~ s/\s//g;
310 0         0 $res .= $line;
311             }
312 0 0       0 return if $self->ignore_sequence;
313              
314 0         0 my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq');
315 0         0 my $seq = $seqfactory->create(-seq=>$res,
316             -id=>$seqid,
317             -desc=>$desc);
318 0         0 $seq->accession_number($seqid);
319 0 0       0 if ($self->features_attached_to_seqs) {
320             my @feats =
321 0         0 @{$self->_feature_idx_by_seq_id->{$seqid}};
  0         0  
322 0         0 $seq->add_SeqFeature($_) foreach @feats;
323 0         0 @{$self->_feature_idx_by_seq_id->{$seqid}} = ();
  0         0  
324             }
325 0         0 return $seq;
326             }
327             else {
328 0         0 $self->throw("expected fasta header, not: $line");
329             }
330             }
331              
332              
333             =head2 next_segment
334              
335             Title : next_segment
336             Usage : my $seq = $gffio->next_segment;
337             Function: Returns a Bio::LocatableSeq object corresponding to a
338             GFF "##sequence-region" header line.
339             Example :
340             Returns : A Bio::LocatableSeq object, or undef if
341             there are no more sequences.
342             Args : none
343              
344              
345             =cut
346              
347             sub next_segment{
348 1     1 1 3 my ($self,@args) = @_;
349 1 50       4 return shift @{ $self->{'segments'} } if defined $self->{'segments'};
  1         2  
350 0         0 return;
351             }
352              
353              
354             =head2 next_feature
355              
356             Title : next_feature
357             Usage : $seqfeature = $gffio->next_feature();
358             Function: Returns the next feature available in the input file or stream, or
359             undef if there are no more features.
360             Example :
361             Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
362             more features.
363             Args : none
364              
365             =cut
366              
367             sub next_feature {
368 4     4 1 183 my ($self) = @_;
369            
370 4         4 my $gff_string;
371            
372             # be graceful about empty lines or comments, and make sure we return undef
373             # if the input's consumed
374 4   66     29 while(($gff_string = $self->_readline()) && defined($gff_string)) {
375 3 50       9 if ($gff_string =~ /^\#\#\#/) {
376             # all forward refs have been seen; TODO
377             }
378 3 50 33     22 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
      33        
379             $gff_string =~ m{^//});
380              
381 3         8 while ($gff_string =~ /^\>(.+)/) {
382             # fasta can be in header or footer
383 0         0 my $seq = $self->_parse_sequence($gff_string);
384 0 0       0 if ($seq) {
385 0         0 $self->_seq_by_id_h->{$seq->primary_id} = $seq;
386 0         0 $gff_string = $self->_readline;
387 0 0       0 last unless $gff_string;
388             }
389             }
390 3         3 last;
391             }
392 4 100       12 return unless $gff_string;
393              
394 3         9 my $feat = Bio::SeqFeature::Generic->new();
395 3         8 $self->from_gff_string($feat, $gff_string);
396              
397 3 50       9 if ($self->features_attached_to_seqs) {
398 0         0 push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}},
  0         0  
399             $feat);
400             }
401              
402 3         7 return $feat;
403             }
404              
405             sub _feature_idx_by_seq_id {
406 0     0   0 my $self = shift;
407 0 0       0 $self->{__feature_idx_by_seq_id} = shift if @_;
408             $self->{__feature_idx_by_seq_id} = {}
409 0 0       0 unless $self->{__feature_idx_by_seq_id};
410 0         0 return $self->{__feature_idx_by_seq_id};
411             }
412              
413              
414             =head2 from_gff_string
415              
416             Title : from_gff_string
417             Usage : $gff->from_gff_string($feature, $gff_string);
418             Function: Sets properties of a SeqFeatureI object from a GFF-formatted
419             string. Interpretation of the string depends on the version
420             that has been specified at initialization.
421              
422             This method is used by next_feature(). It actually dispatches to
423             one of the version-specific (private) methods.
424             Example :
425             Returns : void
426             Args : A Bio::SeqFeatureI implementing object to be initialized
427             The GFF-formatted string to initialize it from
428              
429             =cut
430              
431             sub from_gff_string {
432 3     3 1 4 my ($self, $feat, $gff_string) = @_;
433              
434 3 100       5 if($self->gff_version() == 1) {
    100          
435 1         2 return $self->_from_gff1_string($feat, $gff_string);
436             } elsif( $self->gff_version() == 3 ) {
437 1         5 return $self->_from_gff3_string($feat, $gff_string);
438             } else {
439 1         4 return $self->_from_gff2_string($feat, $gff_string);
440             }
441             }
442              
443              
444             =head2 _from_gff1_string
445              
446             Title : _from_gff1_string
447             Usage :
448             Function:
449             Example :
450             Returns : void
451             Args : A Bio::SeqFeatureI implementing object to be initialized
452             The GFF-formatted string to initialize it from
453              
454             =cut
455              
456             sub _from_gff1_string {
457 1     1   2 my ($gff, $feat, $string) = @_;
458 1         2 chomp $string;
459 1         5 my ($seqname, $source, $primary, $start, $end, $score,
460             $strand, $frame, @group) = split(/\t/, $string);
461              
462 1 50       3 if ( !defined $frame ) {
463 0         0 $feat->throw("[$string] does not look like GFF to me");
464             }
465 1 50       6 $frame = 0 unless( $frame =~ /^\d+$/);
466 1         3 $feat->seq_id($seqname);
467 1         3 $feat->source_tag($source);
468 1         2 $feat->primary_tag($primary);
469 1         2 $feat->start($start);
470 1         2 $feat->end($end);
471 1         2 $feat->frame($frame);
472 1 50       2 if ( $score eq '.' ) {
473             #$feat->score(undef);
474             } else {
475 1         2 $feat->score($score);
476             }
477 1 50       3 if ( $strand eq '-' ) { $feat->strand(-1); }
  1         2  
478 1 50       3 if ( $strand eq '+' ) { $feat->strand(1); }
  0         0  
479 1 50       3 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
480 1         3 foreach my $g ( @group ) {
481 0 0       0 if ( $g =~ /(\S+)=(\S+)/ ) {
482 0         0 my $tag = $1;
483 0         0 my $value = $2;
484 0         0 $feat->add_tag_value($1, $2);
485             } else {
486 0         0 $feat->add_tag_value('group', $g);
487             }
488             }
489             }
490              
491              
492             =head2 _from_gff2_string
493              
494             Title : _from_gff2_string
495             Usage :
496             Function:
497             Example :
498             Returns : void
499             Args : A Bio::SeqFeatureI implementing object to be initialized
500             The GFF2-formatted string to initialize it from
501              
502              
503             =cut
504              
505             sub _from_gff2_string {
506 1     1   2 my ($gff, $feat, $string) = @_;
507 1         2 chomp($string);
508              
509             # according to the Sanger website, GFF2 should be single-tab
510             # separated elements, and the free-text at the end should contain
511             # text-translated tab symbols but no "real" tabs, so splitting on
512             # \t is safe, and $attribs gets the entire attributes field to be
513             # parsed later
514            
515             # sendu: but the tag value pair can (should?) be separated by a tab. The
516             # 'no tabs' thing seems to apply only to the free text that is allowed for
517             # the value
518              
519 1         8 my ($seqname, $source, $primary, $start,
520             $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
521 1         3 my $attribs = join ' ', @attribs;
522            
523 1 50       3 if ( !defined $frame ) {
524 0         0 $feat->throw("[$string] does not look like GFF2 to me");
525             }
526 1         3 $feat->seq_id($seqname);
527 1         2 $feat->source_tag($source);
528 1         3 $feat->primary_tag($primary);
529 1         2 $feat->start($start);
530 1         3 $feat->end($end);
531 1         2 $feat->frame($frame);
532 1 50       3 if ( $score eq '.' ) {
533             # $feat->score(undef);
534             } else {
535 1         3 $feat->score($score);
536             }
537 1 50       4 if ( $strand eq '-' ) { $feat->strand(-1); }
  1         2  
538 1 50       4 if ( $strand eq '+' ) { $feat->strand(1); }
  0         0  
539 1 50       3 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
540              
541              
542             #
543             # this routine is necessay to allow the presence of semicolons in
544             # quoted text Semicolons are the delimiting character for new
545             # tag/value attributes. it is more or less a "state" machine, with
546             # the "quoted" flag going up and down as we pass thorugh quotes to
547             # distinguish free-text semicolon and hash symbols from GFF control
548             # characters
549            
550 1         1 my $flag = 0; # this could be changed to a bit and just be twiddled
551 1         1 my @parsed;
552              
553             # run through each character one at a time and check it
554             # NOTE: changed to foreach loop which is more efficient in perl
555             # --jasons
556 1         9 for my $a ( split //, $attribs ) {
557             # flag up on entering quoted text, down on leaving it
558 61 100 100     168 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
  2 100 33     4  
    100          
    50          
559 1         1 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
560 0         0 elsif( $a eq '#' && ! $flag ) { last }
561 61         48 push @parsed, $a;
562             }
563 1         6 $attribs = join "", @parsed; # rejoin into a single string
564              
565             #
566             # Please feel free to fix this and make it more "perlish"
567              
568 1         3 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
569              
570 1         2 foreach my $pair ( @key_vals ) {
571             # replace semicolons that were removed from free-text above.
572 3         5 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
573              
574             # separate the key from the value
575 3         12 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
576              
577 3 50       5 if( defined $values ) {
578 3         2 my @values;
579             # free text is quoted, so match each free-text block
580             # and remove it from the $values string
581 3         9 while ($values =~ s/"(.*?)"//){
582             # and push it on to the list of values (tags may have
583             # more than one value... and the value may be undef)
584 1         3 push @values, $1;
585             }
586              
587             # and what is left over should be space-separated
588             # non-free-text values
589              
590 3         7 my @othervals = split /\s+/, $values;
591 3         4 foreach my $othervalue(@othervals){
592             # get rid of any empty strings which might
593             # result from the split
594 2 50       3 if (CORE::length($othervalue) > 0) {push @values, $othervalue}
  2         3  
595             }
596              
597 3         3 foreach my $value(@values){
598 3         6 $feat->add_tag_value($key, $value);
599             }
600             }
601             }
602             }
603              
604              
605             sub _from_gff3_string {
606 1     1   3 my ($gff, $feat, $string) = @_;
607 1         3 chomp($string);
608              
609             # according to the now nearly final GFF3 spec, columns should
610             # be tab separated, allowing unescaped spaces to occur in
611             # column 9
612              
613 1         5 my ($seqname, $source, $primary, $start, $end,
614             $score, $strand, $frame, $groups) = split(/\t/, $string);
615            
616 1 50       4 if ( ! defined $frame ) {
617 0         0 $feat->throw("[$string] does not look like GFF3 to me");
618             }
619 1         3 $feat->seq_id($seqname);
620 1         2 $feat->source_tag($source);
621 1         3 $feat->primary_tag($primary);
622 1         2 $feat->start($start);
623 1         2 $feat->end($end);
624 1         6 $feat->frame($frame);
625 1 50       3 if ( $score eq '.' ) {
626             #$feat->score(undef);
627             } else {
628 0         0 $feat->score($score);
629             }
630 1 50       4 if ( $strand eq '-' ) { $feat->strand(-1); }
  0         0  
631 1 50       2 if ( $strand eq '+' ) { $feat->strand(1); }
  1         2  
632 1 50       4 if ( $strand eq '.' ) { $feat->strand(0); }
  0         0  
633 1         9 my @groups = split(/\s*;\s*/, $groups);
634              
635 1         2 for my $group (@groups) {
636 3         7 my ($tag,$value) = split /=/,$group;
637 3         5 $tag = unescape($tag);
638 3         5 my @values = map {unescape($_)} split /,/,$value;
  3         4  
639 3         4 for my $v ( @values ) { $feat->add_tag_value($tag,$v); }
  3         5  
640             }
641             }
642              
643             # taken from Bio::DB::GFF
644             sub unescape {
645 48     48 0 40 my $v = shift;
646 48         39 $v =~ tr/+/ /;
647 48         52 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  2         7  
648 48         157 return $v;
649             }
650              
651              
652             =head2 write_feature
653              
654             Title : write_feature
655             Usage : $gffio->write_feature($feature);
656             Function: Writes the specified SeqFeatureI object in GFF format to the stream
657             associated with this instance.
658             Returns : none
659             Args : An array of Bio::SeqFeatureI implementing objects to be serialized
660              
661             =cut
662              
663             sub write_feature {
664 3     3 1 186 my ($self, @features) = @_;
665 3 50       7 return unless @features;
666 3 100 66     11 if( $self->{'_first'} && $self->gff_version() == 3 ) {
667 1         4 $self->_print("##gff-version 3\n");
668             }
669 3         5 $self->{'_first'} = 0;
670 3         4 foreach my $feature ( @features ) {
671 3         8 $self->_print($self->gff_string($feature)."\n");
672             }
673             }
674              
675              
676             =head2 gff_string
677              
678             Title : gff_string
679             Usage : $gffstr = $gffio->gff_string($feature);
680             Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
681             The formatting depends on the version specified at initialization.
682              
683             This method is used by write_feature(). It actually dispatches to
684             one of the version-specific (private) methods.
685             Example :
686             Returns : A GFF-formatted string representation of the SeqFeature
687             Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
688              
689             =cut
690              
691             sub gff_string{
692 11     11 1 11 my ($self, $feature) = @_;
693              
694 11 100       22 if($self->gff_version() == 1) {
    100          
    50          
695 1         3 return $self->_gff1_string($feature);
696             } elsif( $self->gff_version() == 3 ) {
697 8         19 return $self->_gff3_string($feature);
698             } elsif( $self->gff_version() == 2.5 ) {
699 0         0 return $self->_gff25_string($feature);
700             } else {
701 2         9 return $self->_gff2_string($feature);
702             }
703             }
704              
705              
706             =head2 _gff1_string
707              
708             Title : _gff1_string
709             Usage : $gffstr = $gffio->_gff1_string
710             Function:
711             Example :
712             Returns : A GFF1-formatted string representation of the SeqFeature
713             Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
714              
715             =cut
716              
717             sub _gff1_string{
718 1     1   2 my ($gff, $feat) = @_;
719 1         1 my ($str,$score,$frame,$name,$strand);
720              
721 1 50       12 if( $feat->can('score') ) {
722 1         3 $score = $feat->score();
723             }
724 1 50       3 $score = '.' unless defined $score;
725              
726 1 50       4 if( $feat->can('frame') ) {
727 1         3 $frame = $feat->frame();
728             }
729 1 50       5 $frame = '.' unless defined $frame;
730              
731 1         2 $strand = $feat->strand();
732 1 50       6 if(! $strand) {
    50          
    50          
733 0         0 $strand = ".";
734             } elsif( $strand == 1 ) {
735 0         0 $strand = '+';
736             } elsif ( $feat->strand == -1 ) {
737 1         1 $strand = '-';
738             }
739            
740 1 50       5 if( $feat->can('seqname') ) {
741 1         4 $name = $feat->seq_id();
742 1   50     4 $name ||= 'SEQ';
743             } else {
744 0         0 $name = 'SEQ';
745             }
746              
747 1         4 $str = join("\t",
748             $name,
749             $feat->source_tag,
750             $feat->primary_tag,
751             $feat->start,
752             $feat->end,
753             $score,
754             $strand,
755             $frame);
756              
757 1         4 foreach my $tag ( $feat->get_all_tags ) {
758 4 100       7 next if exists $SKIPPED_TAGS{$tag};
759 3         4 foreach my $value ( $feat->get_tag_values($tag) ) {
760 3 50       12 $str .= " $tag=$value" if $value;
761             }
762             }
763              
764 1         10 return $str;
765             }
766              
767              
768             =head2 _gff2_string
769              
770             Title : _gff2_string
771             Usage : $gffstr = $gffio->_gff2_string
772             Function:
773             Example :
774             Returns : A GFF2-formatted string representation of the SeqFeature
775             Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
776              
777             =cut
778              
779             sub _gff2_string{
780 2     2   3 my ($gff, $origfeat) = @_;
781 2         3 my $feat;
782 2 50       17 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
783 0         0 $feat = $origfeat->feature2;
784             } else {
785 2         3 $feat = $origfeat;
786             }
787 2         4 my ($str1, $str2,$score,$frame,$name,$strand);
788              
789 2 50       108 if( $feat->can('score') ) {
790 2         7 $score = $feat->score();
791             }
792 2 100       6 $score = '.' unless defined $score;
793              
794 2 50       8 if( $feat->can('frame') ) {
795 2         4 $frame = $feat->frame();
796             }
797 2 50       6 $frame = '.' unless defined $frame;
798              
799 2         3 $strand = $feat->strand();
800 2 50       18 if(! $strand) {
    100          
    50          
801 0         0 $strand = ".";
802             } elsif( $strand == 1 ) {
803 1         2 $strand = '+';
804             } elsif ( $feat->strand == -1 ) {
805 1         2 $strand = '-';
806             }
807              
808 2 50       9 if( $feat->can('seqname') ) {
809 2         5 $name = $feat->seq_id();
810             }
811 2   50     8 $name ||= 'SEQ';
812              
813 2         5 $str1 = join("\t",
814             $name,
815             $feat->source_tag(),
816             $feat->primary_tag(),
817             $feat->start(),
818             $feat->end(),
819             $score,
820             $strand,
821             $frame);
822             # the routine below is the only modification I made to the original
823             # ->gff_string routine (above) as on November 17th, 2000, the
824             # Sanger webpage describing GFF2 format reads: "From version 2
825             # onwards, the attribute field must have a tag value structure
826             # following the syntax used within objects in a .ace file,
827             # flattened onto one line by semicolon separators. Tags must be
828             # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
829             # must be quoted with double quotes".
830             # MW
831            
832 2         3 my @group;
833              
834 2         5 foreach my $tag ( $feat->get_all_tags ) {
835 7 100       14 next if exists $SKIPPED_TAGS{$tag};
836 6         5 my @v;
837 6         10 foreach my $value ( $feat->get_tag_values($tag) ) {
838 6 50 33     29 unless( defined $value && length($value) ) {
    100          
839             # quote anything other than valid tag/value characters
840 0         0 $value = '""';
841             } elsif ($value =~ /[^A-Za-z0-9_]/){
842             # substitute tab and newline chars by their UNIX equivalent
843 1         2 $value =~ s/\t/\\t/g;
844 1         1 $value =~ s/\n/\\n/g;
845 1         3 $value = '"' . $value . '" ';
846             }
847 6         8 push @v, $value;
848             # for this tag (allowed in GFF2 and .ace format)
849             }
850 6         14 push @group, "$tag ".join(" ", @v);
851             }
852              
853 2         6 $str2 .= join(' ; ', @group);
854             # Add Target information for Feature Pairs
855 2 50 33     4 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
      33        
856             ! $feat->has_tag('Group') &&
857             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
858 0 0       0 $str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id,
    0          
859             ( $origfeat->feature1->strand < 0 ?
860             ( $origfeat->feature1->end,
861             $origfeat->feature1->start) :
862             ( $origfeat->feature1->start,
863             $origfeat->feature1->end)
864             )) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw
865             }
866 2         12 return $str1."\t".$str2;
867             }
868              
869              
870             =head2 _gff25_string
871              
872             Title : _gff25_string
873             Usage : $gffstr = $gffio->_gff2_string
874             Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF
875             Example :
876             Returns : A GFF2.5-formatted string representation of the SeqFeature
877             Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified
878              
879             =cut
880              
881             sub _gff25_string {
882 0     0   0 my ($gff, $origfeat) = @_;
883 0         0 my $feat;
884 0 0       0 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
885 0         0 $feat = $origfeat->feature2;
886             } else {
887 0         0 $feat = $origfeat;
888             }
889 0         0 my ($str1, $str2,$score,$frame,$name,$strand);
890              
891 0 0       0 if( $feat->can('score') ) {
892 0         0 $score = $feat->score();
893             }
894 0 0       0 $score = '.' unless defined $score;
895              
896 0 0       0 if( $feat->can('frame') ) {
897 0         0 $frame = $feat->frame();
898             }
899 0 0       0 $frame = '.' unless defined $frame;
900              
901 0         0 $strand = $feat->strand();
902 0 0       0 if(! $strand) {
    0          
    0          
903 0         0 $strand = ".";
904             } elsif( $strand == 1 ) {
905 0         0 $strand = '+';
906             } elsif ( $feat->strand == -1 ) {
907 0         0 $strand = '-';
908             }
909              
910 0 0       0 if( $feat->can('seqname') ) {
911 0         0 $name = $feat->seq_id();
912 0   0     0 $name ||= 'SEQ';
913             } else {
914 0         0 $name = 'SEQ';
915             }
916 0         0 $str1 = join("\t",
917             $name,
918             $feat->source_tag(),
919             $feat->primary_tag(),
920             $feat->start(),
921             $feat->end(),
922             $score,
923             $strand,
924             $frame);
925              
926 0         0 my @all_tags = $feat->all_tags;
927 0         0 my @group; my @firstgroup;
928 0 0       0 if (@all_tags) { # only play this game if it is worth playing...
929 0         0 foreach my $tag ( @all_tags ) {
930 0         0 my @v;
931 0         0 foreach my $value ( $feat->get_tag_values($tag) ) {
932 0 0       0 next if exists $SKIPPED_TAGS{$tag};
933 0 0 0     0 unless( defined $value && length($value) ) {
    0          
934 0         0 $value = '""';
935             } elsif ($value =~ /[^A-Za-z0-9_]/){
936 0         0 $value =~ s/\t/\\t/g; # substitute tab and newline
937             # characters
938 0         0 $value =~ s/\n/\\n/g; # to their UNIX equivalents
939 0         0 $value = '"' . $value . '" ';
940             } # if the value contains
941             # anything other than valid
942             # tag/value characters, then
943             # quote it
944 0         0 push @v, $value;
945             # for this tag (allowed in GFF2 and .ace format)
946             }
947 0 0 0     0 if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both...
948 0         0 push @firstgroup, "$tag ".join(" ", @v);
949             } else {
950 0         0 push @group, "$tag ".join(" ", @v);
951             }
952             }
953             }
954 0         0 $str2 = join(' ; ', (@firstgroup, @group));
955             # Add Target information for Feature Pairs
956 0 0 0     0 if( ! $feat->has_tag('Target') && # This is a bad hack IMHO
      0        
957             ! $feat->has_tag('Group') &&
958             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
959 0 0       0 $str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id,
    0          
960             ( $origfeat->feature1->strand < 0 ?
961             ( $origfeat->feature1->end,
962             $origfeat->feature1->start) :
963             ( $origfeat->feature1->start,
964             $origfeat->feature1->end)
965             )) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw
966             }
967 0         0 return $str1 . "\t". $str2;
968             }
969              
970              
971             =head2 _gff3_string
972              
973             Title : _gff3_string
974             Usage : $gffstr = $gffio->_gff3_string
975             Function:
976             Example :
977             Returns : A GFF3-formatted string representation of the SeqFeature
978             Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified
979              
980             =cut
981              
982             sub _gff3_string {
983 8     8   8 my ($gff, $origfeat) = @_;
984 8         6 my $feat;
985 8 50       39 if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){
986 0         0 $feat = $origfeat->feature2;
987             } else {
988 8         8 $feat = $origfeat;
989             }
990              
991 8         15 my $ID = $gff->_incrementGFF3ID();
992              
993 8         8 my ($score,$frame,$name,$strand);
994              
995 8 50       30 if( $feat->can('score') ) {
996 8         13 $score = $feat->score();
997             }
998 8 100       15 $score = '.' unless defined $score;
999              
1000 8 50       22 if( $feat->can('frame') ) {
1001 8         15 $frame = $feat->frame();
1002             }
1003 8 100       13 $frame = '1' unless defined $frame;
1004              
1005 8         16 $strand = $feat->strand();
1006              
1007 8 50       20 if(! $strand) {
    50          
    0          
1008 0         0 $strand = ".";
1009             } elsif( $strand == 1 ) {
1010 8         10 $strand = '+';
1011             } elsif ( $feat->strand == -1 ) {
1012 0         0 $strand = '-';
1013             }
1014              
1015 8 50       24 if( $feat->can('seqname') ) {
1016 8         18 $name = $feat->seq_id();
1017 8   50     17 $name ||= 'SEQ';
1018             } else {
1019 0         0 $name = 'SEQ';
1020             }
1021              
1022 8         8 my @groups;
1023              
1024             # force leading ID and Parent tags
1025 8         17 my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags;
  36         57  
1026 8         30 for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} }
  23         37  
1027             keys %GFF3_ID_Tags ) {
1028 24 100       40 unshift @all_tags, $t if $feat->has_tag($t);
1029             }
1030              
1031 8         13 for my $tag ( @all_tags ) {
1032 36 100       63 next if exists $SKIPPED_TAGS{$tag};
1033             # next if $tag eq 'Target';
1034 29 100 66     84 if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){
1035             # simple Target,start,stop
1036 7         17 my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag);
1037 7 0 33     24 next unless(defined($e) && defined($b) && $target_id);
      33        
1038 0 0 0     0 ($b,$e)= ($e,$b) if(defined $strand && $strand<0);
1039 0         0 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
  0         0  
1040 0         0 push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e);
1041 0         0 next;
1042             }
1043              
1044 22         14 my $valuestr;
1045             # a string which will hold one or more values
1046             # for this tag, with quoted free text and
1047             # space-separated individual values.
1048             my @v;
1049 22         39 for my $value ( $feat->get_tag_values($tag) ) {
1050 22 100 66     67 if( defined $value && length($value) ) {
1051             #$value =~ tr/ /+/; #spaces are allowed now
1052 20 50       32 if ( ref $value eq 'Bio::Annotation::Comment') {
1053 0         0 $value = $value->text;
1054             }
1055              
1056 20 100       46 if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) {
1057 1         3 $value =~ s/\t/\\t/g; # substitute tab and newline
1058             # characters
1059 1         1 $value =~ s/\n/\\n/g; # to their UNIX equivalents
1060              
1061             # Unescaped quotes are not allowed in GFF3
1062             # $value = '"' . $value . '"';
1063             }
1064 20         35 $value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
  2         15  
1065             } else {
1066             # if it is completely empty, then just make empty double quotes
1067 2         4 $value = '""';
1068             }
1069 22         27 push @v, $value;
1070             }
1071             # can we figure out how to improve this?
1072 22 100       77 $tag = lcfirst($tag) unless ( $tag =~
1073             /^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/);
1074              
1075 22         60 push @groups, "$tag=".join(",",@v);
1076             }
1077             # Add Target information for Feature Pairs
1078 8 50 66     15 if( $feat->has_tag('Target') &&
      66        
1079             ! $feat->has_tag('Group') &&
1080             $origfeat->isa('Bio::SeqFeature::FeaturePair') ) {
1081              
1082 0         0 my $target_id = $origfeat->feature1->seq_id;
1083 0         0 $target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge;
  0         0  
1084              
1085 0 0       0 push @groups, sprintf("Target=%s %d %d",
1086             $target_id,
1087             ( $origfeat->feature1->strand < 0 ?
1088             ( $origfeat->feature1->end,
1089             $origfeat->feature1->start) :
1090             ( $origfeat->feature1->start,
1091             $origfeat->feature1->end)
1092             ));
1093             }
1094              
1095             # unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID'));
1096 8 50 33     53 if ( $feat->can('name') && defined($feat->name) ) {
1097             # such as might be for Bio::DB::SeqFeature
1098 0         0 unshift @groups, 'Name=' . $feat->name;
1099             }
1100              
1101 8         11 my $gff_string = "";
1102 8 50       16 if ($feat->location->isa("Bio::Location::SplitLocationI")) {
1103 0         0 my @locs = $feat->location->each_Location;
1104 0         0 foreach my $loc (@locs) {
1105 0   0     0 $gff_string .= join("\t",
1106             $name,
1107             $feat->source_tag() || '.',
1108             $feat->primary_tag(),
1109             $loc->start(),
1110             $loc->end(),
1111             $score,
1112             $strand,
1113             $frame,
1114             join(';', @groups)) . "\n";
1115             }
1116 0         0 chop $gff_string;
1117 0         0 return $gff_string;
1118             } else {
1119 8   50     15 $gff_string = join("\t",
1120             $name,
1121             $feat->source_tag() || '.',
1122             $feat->primary_tag(),
1123             $feat->start(),
1124             $feat->end(),
1125             $score,
1126             $strand,
1127             $frame,
1128             join(';', @groups));
1129             }
1130 8         70 return $gff_string;
1131             }
1132              
1133              
1134             =head2 gff_version
1135              
1136             Title : _gff_version
1137             Usage : $gffversion = $gffio->gff_version
1138             Function:
1139             Example :
1140             Returns : The GFF version this parser will accept and emit.
1141             Args : none
1142              
1143             =cut
1144              
1145             sub gff_version {
1146 47     47 1 40 my ($self, $value) = @_;
1147 47 100 66     84 if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) {
  64         117  
1148 16         23 $self->{'GFF_VERSION'} = $value;
1149             }
1150 47         383 return $self->{'GFF_VERSION'};
1151             }
1152              
1153              
1154             # Make filehandles
1155              
1156             =head2 newFh
1157              
1158             Title : newFh
1159             Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1160             Function: does a new() followed by an fh()
1161             Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
1162             $feature = <$fh>; # read a feature object
1163             print $fh $feature; # write a feature object
1164             Returns : filehandle tied to the Bio::Tools::GFF class
1165             Args :
1166              
1167             =cut
1168              
1169             sub newFh {
1170 0     0 1 0 my $class = shift;
1171 0 0       0 return unless my $self = $class->new(@_);
1172 0         0 return $self->fh;
1173             }
1174              
1175              
1176             =head2 fh
1177              
1178             Title : fh
1179             Usage : $obj->fh
1180             Function:
1181             Example : $fh = $obj->fh; # make a tied filehandle
1182             $feature = <$fh>; # read a feature object
1183             print $fh $feature; # write a feature object
1184             Returns : filehandle tied to Bio::Tools::GFF class
1185             Args : none
1186              
1187             =cut
1188              
1189              
1190             sub fh {
1191 0     0 1 0 my $self = shift;
1192 0   0     0 my $class = ref($self) || $self;
1193 0         0 my $s = Symbol::gensym;
1194 0         0 tie $$s,$class,$self;
1195 0         0 return $s;
1196             }
1197              
1198             # This accessor is used for accessing the Bio::Seq objects from a GFF3
1199             # file; if the file you are using has no sequence data you can ignore
1200             # this accessor
1201              
1202             # This accessor returns a hash reference containing Bio::Seq objects,
1203             # indexed by Bio::Seq->primary_id
1204              
1205             sub _seq_by_id_h {
1206 0     0   0 my $self = shift;
1207              
1208 0 0       0 return $self->{'_seq_by_id_h'} = shift if @_;
1209             $self->{'_seq_by_id_h'} = {}
1210 0 0       0 unless $self->{'_seq_by_id_h'};
1211 0         0 return $self->{'_seq_by_id_h'};
1212             }
1213              
1214              
1215             =head2 get_seqs
1216              
1217             Title : get_seqs
1218             Usage :
1219             Function: Returns all Bio::Seq objects populated by GFF3 file
1220             Example :
1221             Returns :
1222             Args :
1223              
1224             =cut
1225              
1226             sub get_seqs {
1227 0     0 1 0 my ($self,@args) = @_;
1228 0         0 return values %{$self->_seq_by_id_h};
  0         0  
1229             }
1230              
1231              
1232             =head2 features_attached_to_seqs
1233              
1234             Title : features_attached_to_seqs
1235             Usage : $obj->features_attached_to_seqs(1);
1236             Function: For use with GFF3 containg sequence only
1237              
1238             Setting this B parsing ensures that all Bio::Seq object
1239             created will have the appropriate features added to them
1240              
1241             defaults to false (off)
1242              
1243             Note that this mode will incur higher memory usage because features
1244             will have to be cached until the relevant feature comes along
1245              
1246             Example :
1247             Returns : value of features_attached_to_seqs (a boolean)
1248             Args : on set, new value (a boolean, optional)
1249              
1250              
1251             =cut
1252              
1253             sub features_attached_to_seqs{
1254 3     3 1 3 my $self = shift;
1255              
1256 3 50       7 return $self->{'_features_attached_to_seqs'} = shift if @_;
1257 3         5 return $self->{'_features_attached_to_seqs'};
1258             }
1259              
1260              
1261             =head2 ignore_sequence
1262              
1263             Title : ignore_sequence
1264             Usage : $obj->ignore_sequence(1);
1265             Function: For use with GFF3 containg sequence only
1266              
1267             Setting this B parsing means that all sequence data will be
1268             ignored
1269              
1270             Example :
1271             Returns : value of ignore_sequence (a boolean)
1272             Args : on set, new value (a boolean, optional)
1273              
1274             =cut
1275              
1276             sub ignore_sequence{
1277 0     0 1 0 my $self = shift;
1278              
1279 0 0       0 return $self->{'_ignore_sequence'} = shift if @_;
1280 0         0 return $self->{'_ignore_sequence'};
1281             }
1282              
1283              
1284             sub DESTROY {
1285 15     15   57 my $self = shift;
1286 15         41 $self->close();
1287             }
1288              
1289             sub TIEHANDLE {
1290 0     0     my ($class,$val) = @_;
1291 0           return bless {'gffio' => $val}, $class;
1292             }
1293              
1294             sub READLINE {
1295 0     0     my $self = shift;
1296 0 0 0       return $self->{'gffio'}->next_feature() || undef unless wantarray;
1297 0           my (@list, $obj);
1298 0           push @list, $obj while $obj = $self->{'gffio'}->next_feature();
1299 0           return @list;
1300             }
1301              
1302             sub PRINT {
1303 0     0     my $self = shift;
1304 0           $self->{'gffio'}->write_feature(@_);
1305             }
1306              
1307             1;
1308