File Coverage

blib/lib/Bio/ToolBox/SeqFeature.pm
Criterion Covered Total %
statement 191 288 66.3
branch 107 210 50.9
condition 43 105 40.9
subroutine 28 37 75.6
pod 31 32 96.8
total 400 672 59.5


line stmt bran cond sub pod time code
1             package Bio::ToolBox::SeqFeature;
2             our $VERSION = '1.67';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::SeqFeature - Fast, simple SeqFeature implementation
7              
8             =head1 SYNOPSIS
9              
10             # create a transcript
11             my $transcript = Bio::ToolBox::SeqFeature->new(
12             -seq_id => chr1,
13             -start => 1001,
14             -stop => 1500,
15             -strand => '+',
16             );
17             $seqf->primary_tag('mRNA'); # set parameters individually
18            
19             # create an exon
20             my $exon = Bio::ToolBox::SeqFeature->new(
21             -start => 1001,
22             -end => 1200,
23             -type => 'exon',
24             );
25            
26             # associate exon with transcript
27             $transcript->add_SeqFeature($exon);
28             my $exon_strand = $exon->strand; # inherits from parent
29            
30             # walk through subfeatures
31             foreach my $f ($transcript->get_all_SeqFeatures) {
32             printf "%s is a %s\n", $f->display_name, $f->type;
33             }
34            
35             # add attribute
36             $transcript->add_tag_value('Status', $status);
37            
38             # get attribute
39             my $value = $transcript->get_tag_values($key);
40              
41             =head1 DESCRIPTION
42              
43             SeqFeature objects represent functional elements on a genomic or chromosomal
44             sequence, such as genes, transcripts, exons, etc. In many cases, especially
45             genes, they have a hierarchical structure, typically in this order
46              
47             gene
48             mRNA or transcript
49             exon
50             CDS
51              
52             SeqFeature objects have at a minimum coordinate information, including
53             chromosome, start, stop, and strand, and a name or unique identifier. They
54             often also have type or source information, which usually follows the
55             Sequence Ontology key words.
56              
57             This is a fast, efficient, simplified SeqFeature implementation that mostly
58             implements the L API, and could be substituted for other
59             implementations, such L and L.
60             Unlike the others, however, it inherits no classes or methods and uses an
61             unorthodox blessed array to store feature attributes, decreasing memory
62             requirements and overhead complexity.
63              
64             =head1 METHODS
65              
66             Refer to the L documentation for general implementation and
67             ideas, which this module tries to implement.
68              
69             =head2 Creating new SeqFeature objects
70              
71             =over 4
72              
73             =item new
74              
75             Generate a new SeqFeature object. Pass an array of key =E value pairs
76             with the feature parameters. At a minimum, location information (C,
77             C, and C) should be provided, with name and ID recommended.
78             Most of the accession methods may be used as key tags to the C method.
79             The following attribute keys are accepted.
80              
81             =over 4
82              
83             =item -seq_id
84              
85             =item -start
86              
87             =item -end
88              
89             =item -stop
90              
91             =item -strand
92              
93             =item -name
94              
95             =item -display_name
96              
97             =item -id
98              
99             =item -primary_id
100              
101             =item -type
102              
103             =item -source
104              
105             =item -source_tag
106              
107             =item -primary_tag
108              
109             =item -score
110              
111             =item -phase
112              
113             =item -attributes
114              
115             Provide an anonymous array of key value pairs representing attribute
116             keys and their values.
117              
118             =item -segments
119              
120             Provide an anonymous array of SeqFeature objects to add as child
121             objects.
122              
123             =back
124              
125             =item duplicate
126              
127             This method will duplicate a SeqFeature object as a new object, with
128             limitations. The C tag, attribute key/values, and subfeatures
129             are B duplicated. This basically limits the object to coordinates,
130             C, C, C, C, and C.
131             Remaining attributes should be explicitly set by the user.
132              
133             =back
134              
135             =head2 Accession methods
136              
137             These are methods to set and/or retrieve attribute values. Pass a
138             single value to set the attribute. The attribute value is always
139             returned.
140              
141             =over 4
142              
143             =item seq_id
144              
145             The name of the chromosome or reference sequence. If you are generating
146             a new child object, e.g. exon, then seq_id does not need to be provided.
147             In these cases, the parent's seq_id is inherited.
148              
149             =item start
150              
151             The start coordinate of the feature. SeqFeature objects use the 1-base
152             numbering system, following BioPerl convention. Start coordinates are
153             always less than the stop coordinate.
154              
155             =item end
156              
157             =item stop
158              
159             The end coordinate of the feature. Stop coordinates are always greater
160             than the start coordinate.
161              
162             =item strand
163              
164             The strand that the feature is on. The default value is always unstranded,
165             or 0. Any of the following may be supplied: "C<1 0 -1 + . ->". Numeric
166             integers 1, 0, and -1 are always returned.
167              
168             =item source
169              
170             =item source_tag
171              
172             A text string representing the source of the feature. This corresponds to
173             the second field in a GFF file. The source tag is optional. If not supplied,
174             the source tag can be inherited from a parent feature if present.
175              
176             =item primary_tag
177              
178             The type of feature. These usually follow Sequence Ontology terms, but are
179             not required. The default value is the generic term "region". Examples
180             include gene, mRNA, transcript, exon, CDS, etc. This corresponds to the
181             third field in a GFF file.
182              
183             =item type
184              
185             A shortcut method which can represent either "primary_tag:source_tag" or,
186             if no source_tag is defined, simply "primary_tag".
187              
188             =item name
189              
190             =item display_name
191              
192             A text string representing the name of the feature. The name is not
193             required to be a unique value, but generally is. The default name, if
194             none is provided, is the C.
195              
196             =item id
197              
198             =item primary_id
199              
200             A text string representing a unique identifier of the feature. If not
201             explicitly defined, a unique ID is automatically generated.
202              
203             =item score
204              
205             A numeric (integer or floating) value representing the feature.
206              
207             =item phase
208              
209             An integer (0,1,2) representing the coding frame. Only required for
210             CDS features.
211              
212             =back
213              
214             =head2 Special Attributes
215              
216             Special attributes are key value pairs that do not fall under the above
217             conventions. It is possible to have more than one value assigned to a
218             given key. In a GFF file, this corresponds to the attributes in the 9th
219             field, with the exception of special reserved attributes such as C,
220             C, and C.
221              
222             =over 4
223              
224             =item add_tag_value
225              
226             $seqf->add_tag_value($key, $value);
227              
228             Sets the special attribute C<$key> to C<$value>. If you have more than one
229             value, C<$value> should be an anonymous array of text values. Following
230             GFF convention, C<$key> should not comprise of special characters, including
231             ";,= ".
232              
233             =item all_tags
234              
235             =item get_all_tags
236              
237             Returns an array of all attribute keys.
238              
239             =item has_tag($key)
240              
241             Boolean method whether the SeqFeature object contains the attribute.
242              
243             =item get_tag_values($key)
244              
245             =item each_tag_value($key)
246              
247             Returns the value for attribute C<$key>. If multiple values are present,
248             it may return an array or array reference.
249              
250             =item attributes
251              
252             Returns an array or reference to the key value hash;
253              
254             =item remove_tag($key)
255              
256             Deletes the indicated attribute.
257              
258             =back
259              
260             =head2 Subfeature Hierarchy
261              
262             Following Sequence Ontology and GFF conventions, SeqFeatures can have
263             subfeatures (children) representing a hierarchical structure, for example
264             genes beget transcripts which beget exons.
265              
266             Child SeqFeature objects may have more than one parent, for example, shared
267             exons between alternate transcripts. In which case, only one exon
268             SeqFeature object is generated, but is added to both transcript objects.
269              
270             =over 4
271              
272             =item add_SeqFeature
273              
274             =item add_segment
275              
276             Pass one or more SeqFeature objects to be associated as children.
277              
278             =item get_SeqFeatures
279              
280             =item get_all_SeqFeatures
281              
282             =item segments
283              
284             Returns an array of all sub SeqFeature objects.
285              
286             =item delete_SeqFeature($id)
287              
288             This method will delete a specific child SeqFeature. Pass the method the
289             C of the SeqFeature. All SeqFeatures will have a C.
290              
291             =back
292              
293             =head2 Range Methods
294              
295             These are range methods for comparing one SeqFeature object to another.
296             They are analogous to L methods.
297              
298             They currently do not support strand checks or strand options.
299              
300             =over 4
301              
302             =item length
303              
304             Returns the length of the SeqFeature object.
305              
306             =item overlaps
307              
308             my $overlap_check = $seqf1->overlaps($other_seqf);
309              
310             Returns a boolean value whether a second SeqFeature object overlaps
311             with the self object.
312              
313             =item contains
314              
315             my $check = $seqf1->contains($seqf2);
316              
317             Returns a boolean value whether the self object completely
318             contains a second SeqFeature object.
319              
320             =item equals
321              
322             my $check = $seqf1->equals($seqf2);
323              
324             Returns a boolean value whether the self object coordinates are
325             equivalent to a second SeqFeature object.
326              
327             =item intersection
328              
329             my $check = $seqf1->intersection($seqf2);
330              
331             Returns a new SeqFeature object representing the intersection or
332             overlap area between the self object and a second SeqFeature
333             object.
334              
335             =item union($other)
336              
337             my $new_seqf = $seqf1->union($seqf2);
338              
339             Returns a new SeqFeature object representing the merged interval
340             between the self and a second SeqFeature object.
341              
342             =item subtract
343              
344             my $new_seqf = $seqf1->subtract($seqf2);
345              
346             Returns a new SeqFeature object representing the interval of the
347             self object after subtracting a second SeqFeature object.
348              
349             =back
350              
351             =head2 Export Strings
352              
353             These methods export the SeqFeature object as a text string in the
354             specified format. New line characters are included.
355              
356             =over 4
357              
358             =item gff_string($recurse)
359              
360             Exports the SeqFeature object as a GFF3 formatted string. Pass a
361             boolean value if you wish to recurse through the hierarchy and
362             print subfeatures as a multi-line string. Child-Parent ID attributes
363             are smartly handled, including multiple parentage.
364              
365             See L for methods for printing GTF transcript models.
366              
367             =item bed_string
368              
369             Exports the SeqFeature object as a simple BED6 formatted string.
370             See L for methods for printing BED12 transcript models.
371              
372             =back
373              
374             =head1 LIMITATIONS
375              
376             Because of their underlying array structure, Bio::ToolBox::SeqFeature objects
377             should generally not be used as a base class (unless you know the ramifications
378             of doing so). The following Bio classes and Interfaces are similar and their API
379             was used as a model. However, in most cases they are not likely to work with this
380             module because of object structure incompatibility, although this has not been
381             explicitly tested.
382              
383             =over 4
384              
385             =item Bio::AnnotationI
386              
387             =item Bio::LocationI
388              
389             =item Bio::RangeI
390              
391             =item Bio::SeqI
392              
393             =item Bio::Tools::GFF
394              
395             =item Bio::DB::SeqFeature::Store
396              
397             =item Bio::Graphics
398              
399             =back
400              
401             =head1 SEE ALSO
402              
403             L, L, L,
404             L, L, L
405              
406             =cut
407              
408 2     2   11 use strict;
  2         4  
  2         58  
409 2     2   9 use Carp qw(carp cluck croak confess);
  2         22  
  2         164  
410             use constant {
411 2         6153 SEQID => 0,
412             START => 1,
413             STOP => 2,
414             STRND => 3,
415             NAME => 4,
416             ID => 5,
417             TYPE => 6,
418             SRC => 7,
419             SCORE => 8,
420             PHASE => 9,
421             ATTRB => 10,
422             SUBF => 11,
423 2     2   11 };
  2         3  
424             our $IDCOUNT = 0;
425              
426             1;
427              
428             #### Aliases ####
429             # to maintain compatibility with Bio::SeqFeature::Lite and Bio::SeqFeatureI we
430             # put in plenty of aliases to some of the methods
431             *stop = \&end;
432             *name = \&display_name;
433             *id = \&primary_id;
434             *method = \&primary_tag;
435             *source = \&source_tag;
436             *add_segment = \&add_SeqFeature;
437             *get_all_SeqFeatures = *segments = \&get_SeqFeatures;
438             *each_tag_value = \&get_tag_values;
439             *get_all_tags = \&all_tags;
440             *gff3_string = \&gff_string;
441              
442              
443             #### METHODS ####
444             sub new {
445 1658     1658 1 1989 my $class = shift;
446 1658 100       2446 if (ref($class)) {
447 723         751 $class = ref($class);
448             }
449 1658         5011 my %args = @_;
450 1658         2765 my $self = bless [], $class;
451             # bless ourselves early to take advantage of some methods
452             # but otherwise do what we can simply and quickly
453            
454             # primary options
455             $self->[SEQID] = $args{-seq_id} || $args{-seqid} || $args{'-ref'} ||
456 1658   0     3343 $args{chrom} || undef;
457 1658   50     2605 $self->[START] = $args{-start} || undef;
458 1658   0     2549 $self->[STOP] = $args{-end} || $args{-stop} || undef;
459 1658 50       3498 $self->strand($args{-strand}) if exists $args{-strand};
460 1658   100     4053 $self->[NAME] = $args{-display_name} || $args{-name} || undef;
461 1658   100     4278 $self->[ID] = $args{-primary_id} || $args{-id} || undef;
462            
463             # check orientation
464 1658 50 33     5563 if (defined $self->[START] and defined $self->[STOP] and
      33        
465             $self->[START] > $self->[STOP]
466             ) {
467             # flip the coordinates around
468 0         0 ($self->[START], $self->[STOP]) = ($self->[STOP], $self->[START]);
469 0         0 $self->[STRND] *= -1;
470             }
471            
472             # additional options
473 1658   50     4350 $args{-type} ||= $args{-primary_tag} || undef;
      66        
474 1658 50       2382 if (defined $args{-type}) {
475 1658         2460 $self->type($args{-type});
476             }
477 1658   100     2523 $args{-source} ||= $args{-source_tag} || undef;
      100        
478 1658 100       2243 if (defined $args{-source}) {
479 1638         1851 $self->[SRC] = $args{-source};
480             }
481 1658 100       2151 if (exists $args{-score}) {
482 32         54 $self->[SCORE] = $args{-score};
483             }
484 1658 100       2293 if (exists $args{-phase}) {
485 240         426 $self->phase($args{-phase});
486             }
487 1658 100 66     3681 if (exists $args{-attributes} or exists $args{-tags}) {
488 1   33     5 $args{-attributes} ||= $args{-tags};
489 1 50       6 if (ref($args{-attributes}) eq 'HASH') {
490 1         2 $self->[ATTRB] = $args{-attributes};
491             }
492             }
493 1658 100       2126 if (exists $args{-segments}) {
494 2         8 $self->[SUBF] = [];
495 2         5 foreach my $s (@{ $args{-segments} }) {
  2         7  
496 16 50       21 unless (ref($s) eq $class) {
497 0         0 croak "segments should be passed as $class objects!";
498             }
499 16         16 push @{$self->[SUBF]}, $s;
  16         23  
500             }
501             }
502            
503 1658         3774 return $self;
504             }
505              
506              
507             sub seq_id {
508 1671     1671 1 6981 my $self = shift;
509 1671 50       2162 if (@_) {
510 0         0 $self->[SEQID] = $_[0];
511             }
512 1671 50       3602 return defined $self->[SEQID] ? $self->[SEQID] : undef;
513             }
514              
515             sub start {
516 6484     6484 1 20360 my $self = shift;
517 6484 50       7667 if (@_) {
518 0         0 $self->[START] = $_[0];
519             }
520 6484 50       14956 return defined $self->[START] ? $self->[START] : undef;
521             }
522              
523             sub end {
524 2362     2362 1 2373 my $self = shift;
525 2362 100       2904 if (@_) {
526 9         12 $self->[STOP] = $_[0];
527             }
528 2362 50       5069 return defined $self->[STOP] ? $self->[STOP] : undef;
529             }
530              
531             sub strand {
532 4244     4244 1 4138 my $self = shift;
533 4244 100       5490 if (@_) {
534 1699         1660 my $str = $_[0];
535 1699 100       5129 if ($str eq '+') {
    100          
    100          
    100          
    50          
536 262         334 $self->[STRND] = 1;
537             }
538             elsif ($str eq '-') {
539 116         160 $self->[STRND] = -1;
540             }
541             elsif ($str eq '.') {
542 16         28 $self->[STRND] = 0;
543             }
544             elsif ($str =~ /^[\+\-]?1$/) {
545 1207         1702 $self->[STRND] = $str;
546             }
547             elsif ($str eq '0') {
548 98         115 $self->[STRND] = 0;
549             }
550             }
551 4244 50       6849 return defined $self->[STRND] ? $self->[STRND] : 0;
552             }
553              
554             sub display_name {
555 2189     2189 1 5904 my $self = shift;
556 2189 100       2627 if (@_) {
557 223         318 $self->[NAME] = $_[0];
558             }
559 2189 100       5442 return defined $self->[NAME] ? $self->[NAME] : $self->primary_id;
560             }
561              
562             sub primary_id {
563 2011     2011 1 2148 my $self = shift;
564 2011 100       3544 if (@_) {
    100          
565 235         326 $self->[ID] = $_[0];
566             }
567             elsif (not defined $self->[ID]) {
568             # automatically assign a new ID
569 205         335 $self->[ID] = sprintf("%s_%09d", $self->primary_tag, $IDCOUNT++);
570             }
571 2011         4237 return $self->[ID];
572             }
573              
574             sub primary_tag {
575 9333     9333 1 9138 my $self = shift;
576 9333 100       11140 if (@_) {
577 16         22 $self->[TYPE] = $_[0];
578             }
579 9333   50     11265 $self->[TYPE] ||= 'region';
580 9333         16337 return $self->[TYPE];
581             }
582              
583             sub source_tag {
584 1925     1925 1 1912 my $self = shift;
585 1925 50       2451 if (@_) {
586 0         0 $self->[SRC] = $_[0];
587             }
588 1925 50       3360 return defined $self->[SRC] ? $self->[SRC] : undef;
589             }
590              
591             sub type {
592 1901     1901 1 3448 my $self = shift;
593 1901 100       2519 if (@_) {
594 1658         1626 my $type = $_[0];
595 1658 50       2523 if ($type =~ /:/) {
596 0         0 my ($t, $s) = split /:/, $type;
597 0         0 $self->[TYPE] = $t;
598 0         0 $self->[SRC] = $s;
599             }
600             else {
601 1658         2665 $self->[TYPE] = $type;
602             }
603             }
604 1901   50     2651 $self->[TYPE] ||= undef;
605 1901   100     4399 $self->[SRC] ||= undef;
606 1901 100       2425 if (defined $self->[SRC]) {
607 243         763 return join(':', $self->[TYPE], $self->[SRC]);
608             }
609 1658         1721 return $self->[TYPE];
610             }
611              
612             sub score {
613 119     119 1 138 my $self = shift;
614 119 100       176 if (@_) {
615 68         101 $self->[SCORE] = $_[0];
616             }
617 119 100       225 return defined $self->[SCORE] ? $self->[SCORE] : undef;
618             }
619              
620             sub phase {
621 420     420 1 484 my $self = shift;
622 420 100       639 if (@_) {
623 356         401 my $p = $_[0];
624 356 50       825 unless ($p =~ /^[012\.]$/) {
625 0         0 cluck "phase must be 0, 1, 2 or .!";
626             }
627 356         630 $self->[PHASE] = $p;
628             }
629 420 100       783 return defined $self->[PHASE] ? $self->[PHASE] : undef;
630             }
631              
632             sub duplicate {
633 471     471 1 501 my $self = shift;
634 471         654 my $n = $self->new(
635             -seq_id => $self->[SEQID],
636             -start => $self->[START],
637             -end => $self->[STOP],
638             -strand => $self->strand,
639             -type => $self->primary_tag,
640             -source => $self->source_tag,
641             -display_name => $self->name
642             );
643 471 50       843 $n->score( $self->[SCORE] ) if defined $self->[SCORE];
644 471 50       564 $n->phase( $self->[PHASE] ) if defined $self->[PHASE];
645 471         650 return $n;
646             }
647              
648             sub add_SeqFeature {
649 773     773 1 786 my $self = shift;
650 773   100     1509 $self->[SUBF] ||= [];
651 773         810 my $count = 0;
652 773         955 foreach my $s (@_) {
653 773 50       1155 if (ref($s) eq ref($self)) {
654 773 50       997 $s->seq_id($self->seq_id) unless ($s->seq_id);
655 773 100       1222 $s->strand($self->strand) unless ($s->strand);
656 773 50       1126 $s->source_tag($self->source_tag) unless ($s->source_tag);
657 773         786 push @{ $self->[SUBF] }, $s;
  773         1083  
658 773         979 $count++;
659             }
660             else {
661 0         0 cluck "please use SeqFeature objects when adding sub features!";
662             }
663             }
664 773         1236 return $count;
665             }
666              
667             sub get_SeqFeatures {
668 1215     1215 1 1219 my $self = shift;
669 1215 100       1780 return unless $self->[SUBF];
670 1067         1041 my @children;
671 1067         1010 foreach (@{ $self->[SUBF] }) {
  1067         1425  
672 10867         11123 push @children, $_;
673             }
674 1067 50       2506 return wantarray ? @children : \@children;
675             }
676              
677             sub delete_SeqFeature {
678 0     0 1 0 my $self = shift;
679 0 0       0 return unless $self->[SUBF];
680 0         0 my $id = shift;
681 0         0 my $d;
682 0         0 my $i = 0;
683 0         0 foreach (@{ $self->[SUBF] }) {
  0         0  
684 0 0       0 if ($_->[ID] eq $id) {
685 0         0 $d = $i;
686 0         0 last;
687             }
688 0         0 $i++;
689             }
690 0 0       0 if (defined $d) {
691 0         0 splice(@{$self->[SUBF]}, $d, 1);
  0         0  
692 0         0 return 1;
693             }
694 0         0 return;
695             }
696              
697             sub add_tag_value {
698 518     518 1 816 my ($self, $key, $value) = @_;
699 518 100 66     1253 return unless ($key and $value);
700 450   100     1124 $self->[ATTRB] ||= {};
701 450 50       630 if (exists $self->[ATTRB]->{$key}) {
702 0 0       0 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
703 0         0 push @{ $self->[ATTRB]->{$key} }, $value;
  0         0  
704             }
705             else {
706 0         0 my $current = $self->[ATTRB]->{$key};
707 0         0 $self->[ATTRB]->{$key} = [$current, $value];
708             }
709             }
710             else {
711 450         881 $self->[ATTRB]->{$key} = $value;
712             }
713             }
714              
715             sub has_tag {
716 515     515 1 677 my ($self, $key) = @_;
717 515 100       905 return unless $self->[ATTRB];
718 366         727 return exists $self->[ATTRB]->{$key};
719             }
720              
721             sub get_tag_values {
722 424     424 1 509 my ($self, $key) = @_;
723 424 100       567 return unless $self->[ATTRB];
724 423 100       542 if (exists $self->[ATTRB]->{$key}) {
725 410 50       567 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
726 0 0       0 return wantarray ? @{ $self->[ATTRB]->{$key} } : $self->[ATTRB]->{$key};
  0         0  
727             }
728             else {
729 410         771 return $self->[ATTRB]->{$key};
730             }
731             }
732             else {
733 13         29 return;
734             }
735             }
736              
737             sub attributes {
738 1     1 1 2 my $self = shift;
739 1 50       5 return unless $self->[ATTRB];
740 0 0       0 return wantarray ? %{ $self->[ATTRB] } : $self->[ATTRB];
  0         0  
741             }
742              
743             sub all_tags {
744 49     49 1 53 my $self = shift;
745 49 100       88 return unless $self->[ATTRB];
746 15         14 my @k = keys %{ $self->[ATTRB] };
  15         40  
747 15 50       45 return wantarray ? @k : \@k;
748             }
749              
750             sub remove_tag {
751 0     0 1 0 my ($self, $key) = @_;
752 0 0       0 return unless $self->[ATTRB];
753 0 0       0 if (exists $self->[ATTRB]->{$key}) {
754 0         0 delete $self->[ATTRB]->{$key};
755             }
756             }
757              
758              
759              
760             ### Range Methods
761             # Borrowed from Bio::RangeI, but does not do strong/weak strand checks
762              
763             sub length {
764 232     232 1 222 my $self = shift;
765 232         292 return $self->end - $self->start + 1;
766             }
767              
768             sub overlaps {
769 0     0 1 0 my ($self, $other) = @_;
770 0 0 0     0 return unless ($other and ref($other));
771 0 0       0 return unless ($self->seq_id eq $other->seq_id);
772             return not (
773 0   0     0 $self->start > $other->end or
774             $self->end < $other->start
775             );
776             }
777              
778             sub contains {
779 0     0 1 0 my ($self, $other) = @_;
780 0 0 0     0 return unless ($other and ref($other));
781 0 0       0 return unless ($self->seq_id eq $other->seq_id);
782             return (
783 0   0     0 $other->start >= $self->start and
784             $other->end <= $self->end
785             );
786             }
787              
788             sub equals {
789 0     0 1 0 my ($self, $other) = @_;
790 0 0 0     0 return unless ($other and ref($other));
791 0 0       0 return unless ($self->seq_id eq $other->seq_id);
792             return (
793 0   0     0 $other->start == $self->start and
794             $other->end == $self->end
795             );
796             }
797              
798             sub intersection {
799 0     0 1 0 my ($self, $other) = @_;
800 0 0 0     0 return unless ($other and ref($other));
801 0 0       0 return unless ($self->seq_id eq $other->seq_id);
802 0         0 my ($start, $stop);
803 0 0       0 if ($self->start >= $other->start) {
804 0         0 $start = $self->start;
805             }
806             else {
807 0         0 $start = $other->start;
808             }
809 0 0       0 if ($self->end <= $other->end) {
810 0         0 $stop = $self->end;
811             }
812             else {
813 0         0 $stop = $other->end;
814             }
815 0 0       0 return if $start > $stop;
816 0         0 return $self->new(
817             -seq_id => $self->seq_id,
818             -start => $start,
819             -end => $stop,
820             );
821             }
822              
823             sub union {
824 0     0 1 0 my ($self, $other) = @_;
825 0 0 0     0 return unless ($other and ref($other));
826 0 0       0 return unless ($self->seq_id eq $other->seq_id);
827 0         0 my ($start, $stop);
828 0 0       0 if ($self->start <= $other->start) {
829 0         0 $start = $self->start;
830             }
831             else {
832 0         0 $start = $other->start;
833             }
834 0 0       0 if ($self->end >= $other->end) {
835 0         0 $stop = $self->end;
836             }
837             else {
838 0         0 $stop = $other->end;
839             }
840 0 0       0 return if $start > $stop;
841 0         0 return $self->new(
842             -seq_id => $self->seq_id,
843             -start => $start,
844             -end => $stop,
845             );
846             }
847              
848             sub subtract {
849 0     0 1 0 my ($self, $other) = @_;
850 0 0 0     0 return unless ($other and ref($other));
851 0 0       0 return unless ($self->seq_id eq $other->seq_id);
852 0 0       0 return if $self->equals($other);
853 0         0 my @pieces;
854 0 0       0 if ($self->overlaps($other)) {
855 0         0 my $int = $self->intersection($other);
856 0 0       0 if ($self->start < $int->start) {
857 0         0 push @pieces, $self->new(
858             -seq_id => $self->seq_id,
859             -start => $self->start,
860             -end => $int->start - 1,
861             );
862             }
863 0 0       0 if ($self->end > $int->end) {
864 0         0 push @pieces, $self->new(
865             -seq_id => $self->seq_id,
866             -start => $int->end + 1,
867             -end => $self->end,
868             );
869             }
870             }
871             else {
872 0         0 push @pieces, $self->new(
873             -seq_id => $self->seq_id,
874             -start => $self->start,
875             -end => $self->end,
876             );
877             }
878 0 0       0 return wantarray ? @pieces : \@pieces;
879             }
880              
881              
882             ### Export methods
883              
884             sub version {
885 2     2 0 4 my $self = shift;
886 2 50 33     27 if (@_ and $_[0] ne '3') {
887 0         0 carp "sorry, only GFF version 3 is currently supported!";
888             }
889 2         5 return 3;
890             }
891              
892             sub gff_string {
893             # exports version 3 GFF, sorry, no mechanism for others....
894 101     101 1 98 my $self = shift;
895 101   50     127 my $recurse = shift || 0;
896 101   100     126 my $childIDs = shift || undef;
897            
898             # skip myself if we've already printed myself
899             return if ($recurse and $childIDs and
900 101 100 66     233 not exists $childIDs->{ $self->primary_id } );
      100        
901            
902             # map all parent-child relationships
903 49 100 66     121 if ($recurse and not defined $childIDs) {
904             # we have a top level SeqFeature and we need to go spelunking for IDs
905 2         3 $childIDs = {};
906 2         6 foreach my $f ($self->get_SeqFeatures) {
907 14         20 $f->_spelunk($self->primary_id, $childIDs);
908             }
909             }
910            
911             # basics
912 49 50 50     78 my $string = join("\t", (
    0 50        
    50 50        
    100 50        
913             $self->seq_id || '.',
914             $self->source_tag || '.',
915             $self->primary_tag,
916             $self->start || 1,
917             $self->end || 1,
918             defined $self->score ? $self->score : '.',
919             $self->strand > 0 ? '+' : $self->strand < 0 ? '-' : '.',
920             defined $self->phase ? $self->phase : '.',
921             ) );
922            
923             # group attributes
924 49         70 my $attributes;
925 49         65 my $name = $self->display_name;
926 49 50       60 if ($name) {
927             # names are optional and may not exist
928 49         66 $attributes = sprintf "Name=%s;ID=%s", $self->_encode($name),
929             $self->_encode( $self->primary_id );
930             }
931             else {
932             # IDs always exist
933 0         0 $attributes = sprintf "ID=%s", $self->_encode( $self->primary_id );
934             }
935 49 50       72 if ($childIDs) {
936 49 100       63 if (exists $childIDs->{$self->primary_id}) {
937             $attributes .= ';Parent=' . join(',',
938 100         113 sort {$a cmp $b}
939 99         111 map { $self->_encode($_) }
940 47         54 keys %{ $childIDs->{$self->primary_id} });
  47         53  
941 47         74 delete $childIDs->{$self->primary_id};
942             }
943             }
944 49         67 foreach my $tag ($self->all_tags) {
945 41 50       54 next if $tag eq 'Name';
946 41 50       56 next if $tag eq 'ID';
947 41 100       53 next if $tag eq 'Parent';
948 27         76 my $value = $self->get_tag_values($tag);
949 27 50       37 if (ref($value) eq 'ARRAY') {
950 0         0 $value = join(",", map { $self->_encode($_) } @$value);
  0         0  
951             }
952             else {
953 27         39 $value = $self->_encode($value);
954             }
955 27         48 $attributes .= ";$tag=$value";
956             }
957 49         93 $string .= "\t$attributes\n";
958            
959             # recurse
960 49 50       72 if ($recurse) {
961 49         62 foreach my $f ($self->get_SeqFeatures) {
962 99         137 my $s = $f->gff_string(1, $childIDs);
963 99 100       187 $string .= $s if $s;
964             }
965             }
966 49         85 return $string;
967             }
968              
969             sub _encode {
970 224     224   254 my ($self, $value) = @_;
971 224         259 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
972 224         399 return $value;
973             }
974              
975             sub _spelunk {
976 99     99   107 my ($self, $parentID, $childIDs) = @_;
977 99         103 $childIDs->{$self->primary_id}{$parentID} += 1;
978 99         116 foreach my $f ($self->get_SeqFeatures) {
979 85         93 $f->_spelunk($self->primary_id, $childIDs);
980             }
981             }
982              
983             sub bed_string {
984 0     0 1   my $self = shift;
985 0 0 0       my $string = join("\t", (
    0 0        
      0        
      0        
986             $self->seq_id || '.',
987             ($self->start || 1) - 1,
988             $self->end || 1,
989             $self->display_name || $self->primary_id,
990             defined $self->score ? $self->score : 0,
991             $self->strand >= 0 ? '+' : '-',
992             ) );
993 0           return "$string\n";
994             }
995              
996              
997             __END__