File Coverage

blib/lib/IntervalTree.pm
Criterion Covered Total %
statement 44 217 20.2
branch 1 118 0.8
condition 0 24 0.0
subroutine 10 38 26.3
pod 10 11 90.9
total 65 408 15.9


line stmt bran cond sub pod time code
1             package IntervalTree;
2              
3 1     1   30229 use 5.006;
  1         5  
  1         45  
4 1     1   613 use POSIX qw(ceil);
  1         9060  
  1         11  
5 1     1   1722 use List::Util qw(max min);
  1         10  
  1         142  
6 1     1   8 use strict;
  1         2  
  1         53  
7 1     1   5 use warnings;
  1         2  
  1         50  
8 1     1   7 no warnings 'once';
  1         1  
  1         1069  
9              
10             our $VERSION = '0.01';
11              
12             =head1 NAME
13              
14             IntervalTree.pm
15              
16             =head1 VERSION
17              
18             Version 0.01
19              
20             =head1 DESCRIPTION
21              
22             Data structure for performing intersect queries on a set of intervals which
23             preserves all information about the intervals (unlike bitset projection methods).
24              
25             =cut
26              
27             # Historical note:
28             # This module original contained an implementation based on sorted endpoints
29             # and a binary search, using an idea from Scott Schwartz and Piotr Berman.
30             # Later an interval tree implementation was implemented by Ian for Galaxy's
31             # join tool (see `bx.intervals.operations.quicksect.py`). This was then
32             # converted to Cython by Brent, who also added support for
33             # upstream/downstream/neighbor queries. This was modified by James to
34             # handle half-open intervals strictly, to maintain sort order, and to
35             # implement the same interface as the original Intersecter.
36              
37             =head1 SYNOPSIS
38              
39              
40             Data structure for performing window intersect queries on a set of
41             of possibly overlapping 1d intervals.
42              
43             Usage
44             =====
45              
46             Create an empty IntervalTree
47              
48             >>> use IntervalTree;
49             >>> my $intersecter = IntervalTree->new();
50              
51             An interval is a start and end position and a value (possibly None).
52             You can add any object as an interval:
53              
54             >>> $intersecter->insert( 0, 10, "food" );
55             >>> $intersecter->insert( 3, 7, {foo=>'bar'} );
56              
57             >>> $intersecter->find( 2, 5 );
58             ['food', {'foo'=>'bar'}]
59              
60             If the object has start and end attributes (like the Interval class) there
61             is are some shortcuts:
62              
63             >>> my $intersecter = IntervalTree->new();
64             >>> $intersecter->insert_interval( Interval->new( 0, 10 ) );
65             >>> $intersecter->insert_interval( Interval->new( 3, 7 ) );
66             >>> $intersecter->insert_interval( Interval->new( 3, 40 ) );
67             >>> $intersecter->insert_interval( Interval->new( 13, 50 ) );
68              
69             >>> $intersecter->find( 30, 50 );
70             [Interval(3, 40), Interval(13, 50)]
71             >>> $intersecter->find( 100, 200 );
72             []
73              
74             Before/after for intervals
75              
76             >>> $intersecter->before_interval( Interval->new( 10, 20 ) );
77             [Interval(3, 7)]
78             >>> $intersecter->before_interval( Interval->new( 5, 20 ) );
79             []
80              
81             Upstream/downstream
82              
83             >>> $intersecter->upstream_of_interval(Interval->new(11, 12));
84             [Interval(0, 10)]
85             >>> $intersecter->upstream_of_interval(Interval->new(11, 12, undef, undef, "-"));
86             [Interval(13, 50)]
87              
88             >>> intersecter.upstream_of_interval(Interval(1, 2, undef, undef, "-"), 3);
89             [Interval(3, 7), Interval(3, 40), Interval(13, 50)]
90              
91             =cut
92            
93             sub new {
94 0     0 0 0 my ( $class ) = @_;
95 0         0 my $self = {};
96 0         0 $self->{root} = undef;
97 0         0 return bless $self, $class;
98             }
99            
100             # ---- Position based interfaces -----------------------------------------
101              
102             =head2 insert
103              
104             Insert the interval [start,end) associated with value `value`.
105              
106             =cut
107              
108             sub insert {
109 0     0 1 0 my ( $self, $start, $end, $value) = @_;
110 0 0       0 if (!defined $self->{root}) {
111 0         0 $self->{root} = IntervalNode->new( $start, $end, $value );
112             }
113             else {
114 0         0 $self->{root} = $self->{root}->insert( $start, $end, $value );
115             }
116             }
117              
118             *add = \&insert;
119              
120             =head2 find
121              
122             Return a sorted list of all intervals overlapping [start,end).
123              
124             =cut
125              
126             sub find {
127 0     0 1 0 my ( $self, $start, $end ) = @_;
128 0 0       0 if (!defined $self->{root}) {
129 0         0 return [];
130             }
131 0         0 return $self->{root}->find( $start, $end );
132             }
133              
134             =head2 before
135              
136             Find `num_intervals` intervals that lie before `position` and are no
137             further than `max_dist` positions away
138              
139             =cut
140            
141             sub before {
142 0     0 1 0 my ( $self, $position, $num_intervals, $max_dist ) = @_;
143 0 0       0 $num_intervals = 1 if !defined $num_intervals;
144 0 0       0 $max_dist = 2500 if !defined $max_dist;
145            
146 0 0       0 if (!defined $self->{root}) {
147 0         0 return [];
148             }
149 0         0 return $self->{root}->left( $position, $num_intervals, $max_dist );
150             }
151              
152             =head2 after
153              
154             Find `num_intervals` intervals that lie after `position` and are no
155             further than `max_dist` positions away
156              
157             =cut
158              
159             sub after {
160 0     0 1 0 my ( $self, $position, $num_intervals, $max_dist) = @_;
161 0 0       0 $num_intervals = 1 if !defined $num_intervals;
162 0 0       0 $max_dist = 2500 if !defined $max_dist;
163              
164 0 0       0 if (!defined $self->{root}) {
165 0         0 return [];
166             }
167 0         0 return $self->{root}->right( $position, $num_intervals, $max_dist );
168             }
169              
170             # ---- Interval-like object based interfaces -----------------------------
171              
172             =head2 insert_interval
173              
174             Insert an "interval" like object (one with at least start and end
175             attributes)
176              
177             =cut
178              
179             sub insert_interval {
180 0     0 1 0 my ( $self, $interval ) = @_;
181 0         0 $self->insert( $interval->{start}, $interval->{end}, $interval );
182             }
183              
184             *add_interval = \&insert_interval;
185              
186             =head2 before_interval
187              
188             Find `num_intervals` intervals that lie completely before `interval`
189             and are no further than `max_dist` positions away
190              
191             =cut
192              
193             sub before_interval {
194 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
195 0 0       0 $num_intervals = 1 if !defined $num_intervals;
196 0 0       0 $max_dist = 2500 if !defined $max_dist;
197            
198 0 0       0 if (!defined $self->{root}) {
199 0         0 return [];
200             }
201 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
202             }
203              
204             =head2 after_interval
205              
206             Find `num_intervals` intervals that lie completely after `interval` and
207             are no further than `max_dist` positions away
208              
209             =cut
210              
211             sub after_interval {
212 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
213 0 0       0 $num_intervals = 1 if !defined $num_intervals;
214 0 0       0 $max_dist = 2500 if !defined $max_dist;
215              
216 0 0       0 if (!defined $self->{root}) {
217 0         0 return [];
218             }
219 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
220             }
221              
222             =head2 upstream_of_interval
223              
224             Find `num_intervals` intervals that lie completely upstream of
225             `interval` and are no further than `max_dist` positions away
226              
227             =cut
228              
229             sub upstream_of_interval {
230 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
231 0 0       0 $num_intervals = 1 if !defined $num_intervals;
232 0 0       0 $max_dist = 2500 if !defined $max_dist;
233              
234 0 0       0 if (!defined $self->{root}) {
235 0         0 return [];
236             }
237 0 0 0     0 if ($interval->{strand} == -1 || $interval->{strand} eq "-") {
238 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
239             }
240             else {
241 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
242             }
243             }
244              
245             =head2 downstream_of_interval
246              
247             Find `num_intervals` intervals that lie completely downstream of
248             `interval` and are no further than `max_dist` positions away
249              
250             =cut
251              
252              
253             sub downstream_of_interval {
254 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
255 0 0       0 $num_intervals = 1 if !defined $num_intervals;
256 0 0       0 $max_dist = 2500 if !defined $max_dist;
257              
258 0 0       0 if (!defined $self->{root}) {
259 0         0 return [];
260             }
261 0 0 0     0 if ($interval->{strand} == -1 || $interval->{strand} eq "-") {
262 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
263             }
264             else {
265 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
266             }
267             }
268              
269             =head2 traverse
270            
271             call fn for each element in the tree
272              
273             =cut
274              
275             sub traverse {
276 0     0 1 0 my ($self, $fn) = @_;
277 0 0       0 if (!defined $self->{root}) {
278 0         0 return undef;
279             }
280 0         0 return $self->{root}->traverse($fn);
281             }
282              
283             =head2 IntervalNode
284              
285             A single node of an `IntervalTree`.
286              
287             NOTE: Unless you really know what you are doing, you probably should us
288             `IntervalTree` rather than using this directly.
289              
290             =cut
291              
292             package IntervalNode;
293 1     1   14 use List::Util qw(min max);
  1         2  
  1         2596  
294              
295             our $EmptyNode = IntervalNode->new( 0, 0, Interval->new(0, 0));
296              
297             sub nlog {
298 1     1   98 return -1.0 / log(0.5);
299             }
300              
301             sub left_node {
302 0     0   0 my ($self) = @_;
303 0 0       0 return $self->{cleft} != $EmptyNode ? $self->{cleft} : undef;
304             }
305              
306             sub right_node {
307 0     0   0 my ($self) = @_;
308 0 0       0 return $self->{cright} != $EmptyNode ? $self->{cright} : undef;
309             }
310              
311             sub root_node {
312 0     0   0 my ($self) = @_;
313 0 0       0 return $self->{croot} != $EmptyNode ? $self->{croot} : undef;
314             }
315            
316             sub str {
317 0     0   0 my ($self) = @_;
318 0         0 return "IntervalNode($self->{start}, $self->{end})";
319             }
320              
321             sub new {
322 1     1   4 my ($class, $start, $end, $interval) = @_;
323             # Perl lacks the binomial distribution, so we convert a
324             # uniform into a binomial because it naturally scales with
325             # tree size. Also, perl's uniform is perfect since the
326             # upper limit is not inclusive, which gives us undefined here.
327 1         3 my $self = {};
328 1         3 $self->{priority} = POSIX::ceil(nlog() * log(-1.0/(1.0 * rand() - 1)));
329 1         4 $self->{start} = $start;
330 1         16 $self->{end} = $end;
331 1         2 $self->{interval} = $interval;
332 1         3 $self->{maxend} = $end;
333 1         2 $self->{minstart} = $start;
334 1         2 $self->{minend} = $end;
335 1         2 $self->{cleft} = $EmptyNode;
336 1         2 $self->{cright} = $EmptyNode;
337 1         2 $self->{croot} = $EmptyNode;
338 1         4 return bless $self, $class;
339             }
340              
341             =head2 insert
342            
343             Insert a new IntervalNode into the tree of which this node is
344             currently the root. The return value is the new root of the tree (which
345             may or may not be this node!)
346              
347             =cut
348              
349             sub insert {
350 0     0   0 my ($self, $start, $end, $interval) = @_;
351 0         0 my $croot = $self;
352             # If starts are the same, decide which to add interval to based on
353             # end, thus maintaining sortedness relative to start/end
354 0         0 my $decision_endpoint = $start;
355 0 0       0 if ($start == $self->{start}) {
356 0         0 $decision_endpoint = $end;
357             }
358              
359 0 0       0 if ($decision_endpoint > $self->{start}) {
360             # insert to cright tree
361 0 0       0 if ($self->{cright} != $EmptyNode) {
362 0         0 $self->{cright} = $self->{cright}->insert( $start, $end, $interval );
363             }
364             else {
365 0         0 $self->{cright} = IntervalNode->new( $start, $end, $interval );
366             }
367             # rebalance tree
368 0 0       0 if ($self->{priority} < $self->{cright}{priority}) {
369 0         0 $croot = $self->rotate_left();
370             }
371             }
372             else {
373             # insert to cleft tree
374 0 0       0 if ($self->{cleft} != $EmptyNode) {
375 0         0 $self->{cleft} = $self->{cleft}->insert( $start, $end, $interval);
376             }
377             else {
378 0         0 $self->{cleft} = IntervalNode->new( $start, $end, $interval);
379             }
380             # rebalance tree
381 0 0       0 if ($self->{priority} < $self->{cleft}{priority}) {
382 0         0 $croot = $self->rotate_right();
383             }
384             }
385              
386 0         0 $croot->set_ends();
387 0         0 $self->{cleft}{croot} = $croot;
388 0         0 $self->{cright}{croot} = $croot;
389 0         0 return $croot;
390             }
391              
392             sub rotate_right {
393 0     0   0 my ($self) = @_;
394 0         0 my $croot = $self->{cleft};
395 0         0 $self->{cleft} = $self->{cleft}{cright};
396 0         0 $croot->{cright} = $self;
397 0         0 $self->set_ends();
398 0         0 return $croot;
399             }
400              
401             sub rotate_left {
402 0     0   0 my ($self) = @_;
403 0         0 my $croot = $self->{cright};
404 0         0 $self->{cright} = $self->{cright}{cleft};
405 0         0 $croot->{cleft} = $self;
406 0         0 $self->set_ends();
407 0         0 return $croot;
408             }
409              
410             sub set_ends {
411 0     0   0 my ($self) = @_;
412 0 0 0     0 if ($self->{cright} != $EmptyNode && $self->{cleft} != $EmptyNode) {
    0          
    0          
413 0         0 $self->{maxend} = max($self->{end}, $self->{cright}{maxend}, $self->{cleft}{maxend});
414 0         0 $self->{minend} = min($self->{end}, $self->{cright}{minend}, $self->{cleft}{minend});
415 0         0 $self->{minstart} = min($self->{start}, $self->{cright}{minstartz}, $self->{cleft}{minstart});
416             }
417             elsif ( $self->{cright} != $EmptyNode) {
418 0         0 $self->{maxend} = max($self->{end}, $self->{cright}{maxend});
419 0         0 $self->{minend} = min($self->{end}, $self->{cright}{minend});
420 0         0 $self->{minstart} = min($self->{start}, $self->{cright}{minstart});
421             }
422             elsif ( $self->{cleft} != $EmptyNode) {
423 0         0 $self->{maxend} = max($self->{end}, $self->{cleft}{maxend});
424 0         0 $self->{minend} = min($self->{end}, $self->{cleft}{minend});
425 0         0 $self->{minstart} = min($self->{start}, $self->{cleft}{minstart});
426             }
427             }
428              
429              
430             =head2 intersect
431              
432             given a start and a end, return a list of features
433             falling within that range
434              
435             =cut
436              
437             sub intersect {
438 0     0   0 my ( $self, $start, $end, $sort ) = @_;
439 0 0       0 $sort = 1 if !defined $sort;
440 0         0 my $results = [];
441 0         0 $self->_intersect( $start, $end, $results );
442 0         0 return $results;
443             }
444              
445             *find = \&intersect;
446              
447             sub _intersect {
448 0     0   0 my ( $self, $start, $end, $results) = @_;
449             # Left subtree
450 0 0 0     0 if ($self->{cleft} != $EmptyNode && $self->{cleft}{maxend} > $start) {
451 0         0 $self->{cleft}->_intersect( $start, $end, $results );
452             }
453             # This interval
454 0 0 0     0 if (( $self->{end} > $start ) && ( $self->{start} < $end )) {
455 0         0 push @$results, $self->{interval};
456             }
457             # Right subtree
458 0 0 0     0 if ($self->{cright} != $EmptyNode && $self->{start} < $end) {
459 0         0 $self->{cright}->_intersect( $start, $end, $results );
460             }
461             }
462            
463              
464             sub _seek_left {
465 0     0   0 my ($self, $position, $results, $n, $max_dist) = @_;
466             # we know we can bail in these 2 cases.
467 0 0       0 if ($self->{maxend} + $max_dist < $position) {
468 0         0 return;
469             }
470 0 0       0 if ($self->{minstart} > $position) {
471 0         0 return;
472             }
473              
474             # the ordering of these 3 blocks makes it so the results are
475             # ordered nearest to farest from the query position
476 0 0       0 if ($self->{cright} != $EmptyNode) {
477 0         0 $self->{cright}->_seek_left($position, $results, $n, $max_dist);
478             }
479              
480 0 0 0     0 if (-1 < $position - $self->{end} && $position - $self->{end} < $max_dist) {
481 0         0 push @$results, $self->{interval};
482             }
483              
484             # TODO: can these conditionals be more stringent?
485 0 0       0 if ($self->{cleft} != $EmptyNode) {
486 0         0 $self->{cleft}->_seek_left($position, $results, $n, $max_dist);
487             }
488             }
489              
490              
491            
492             sub _seek_right {
493 0     0   0 my ($self, $position, $results, $n, $max_dist) = @_;
494             # we know we can bail in these 2 cases.
495 0 0       0 return if $self->{maxend} < $position;
496 0 0       0 return if $self->{minstart} - $max_dist > $position;
497              
498             #print "SEEK_RIGHT:",self, self.cleft, self.maxend, self.minstart, position
499              
500             # the ordering of these 3 blocks makes it so the results are
501             # ordered nearest to farest from the query position
502 0 0       0 if ($self->{cleft} != $EmptyNode) {
503 0         0 $self->{cleft}->_seek_right($position, $results, $n, $max_dist);
504             }
505              
506 0 0 0     0 if (-1 < $self->{start} - $position && $self->{start} - $position < $max_dist) {
507 0         0 push @$results, $self->{interval};
508             }
509              
510 0 0       0 if ($self->{cright} != $EmptyNode) {
511 0         0 $self->{cright}->_seek_right($position, $results, $n, $max_dist);
512             }
513             }
514              
515             =head2 left
516              
517             find n features with a start > than `position`
518             f: a Interval object (or anything with an `end` attribute)
519             n: the number of features to return
520             max_dist: the maximum distance to look before giving up.
521              
522             =cut
523            
524             sub left {
525 0     0   0 my ($self, $position, $n, $max_dist) = @_;
526 0 0       0 $n = 1 if !defined $n;
527 0 0       0 $max_dist = 2500 if !defined $max_dist;
528              
529 0         0 my $results = [];
530             # use start - 1 becuase .left() assumes strictly left-of
531 0         0 $self->_seek_left( $position - 1, $results, $n, $max_dist );
532 0 0       0 return $results if length(@$results) == $n;
533              
534 0         0 my $r = $results;
535 0         0 @$r = sort {$b->{end} <=> $a->{end}} @$r;
  0         0  
536 0         0 return @$r[0..$n-1];
537             }
538              
539             =head2 right
540              
541             find n features with a end < than position
542             f: a Interval object (or anything with a `start` attribute)
543             n: the number of features to return
544             max_dist: the maximum distance to look before giving up.
545              
546             =cut
547              
548             sub right {
549 0     0   0 my ($self, $position, $n, $max_dist) = @_;
550 0 0       0 $n = 1 if !defined $n;
551 0 0       0 $max_dist = 2500 if !defined $max_dist;
552              
553 0         0 my $results = [];
554             # use end + 1 because .right() assumes strictly right-of
555 0         0 $self->_seek_right($position + 1, $results, $n, $max_dist);
556 0 0       0 return $results if length(@$results) == $n;
557              
558 0         0 my $r = $results;
559 0         0 @$r = sort {$a->{start} <=> $b->{start}} @$r;
  0         0  
560 0         0 return @$r[0..$n-1];
561             }
562              
563             sub traverse {
564 0     0   0 my ($self, $func) = @_;
565 0         0 $self->_traverse($func);
566             }
567              
568             sub _traverse {
569 0     0   0 my ($self, $func) = @_;
570 0 0       0 $self->{cleft}->_traverse($func) if $self->{cleft} != $EmptyNode;
571 0         0 $func->($self);
572 0 0       0 $self->{cright}->_traverse($func) if $self->{cright} != $EmptyNode;
573             }
574              
575             ## ---- Wrappers that retain the old interface -------------------------------
576              
577             =head2 Interval
578              
579             Basic feature, with required integer start and end properties.
580             Also accepts optional strand as +1 or -1 (used for up/downstream queries),
581             a name, and any arbitrary data is sent in on the info keyword argument
582              
583             >>> from bx.intervals.intersection import Interval
584              
585             >>> f1 = Interval->new(23, 36);
586             >>> f2 = Interval->new(34, 48, {'chr':12, 'anno':'transposon'});
587             >>> f2
588             Interval(34, 48, {'anno': 'transposon', 'chr': 12})
589              
590             =cut
591              
592             package Interval;
593              
594             sub new {
595 1     1   5 my ($class, $start, $end, $value, $chrom, $strand) = @_;
596 1 50       6 die "start must be less than end" unless $start <= $end;
597 1         3 my $self = {};
598 1         3 $self->{start} = $start;
599 1         2 $self->{end} = $end;
600 1         2 $self->{value} = $value;
601 1         2 $self->{chrom} = $chrom;
602 1         1 $self->{strand} = $strand;
603 1         9 return bless $self, $class;
604             }
605              
606             sub str {
607 0     0     my ($self) = @_;
608 0           my $fstr = "Interval($self->{start}, $self->{end}";
609 0 0         if (defined($self->{value})) {
610 0           $fstr .= ", value=$self->{value}";
611             }
612 0           $fstr .= ")";
613 0           return $fstr;
614             }
615              
616             =head2 AUTHOR
617              
618             Ben Booth, C<< >>
619              
620             Original Authors: James Taylor (james@jamestaylor.org),
621             Ian Schenk (ian.schenck@gmail.com),
622             Brent Pedersen (bpederse@gmail.com)
623              
624             =head2 BUGS
625              
626             Please report any bugs or feature requests to C, or through
627             the web interface at L. I will be notified, and then you'll
628             automatically be notified of progress on your bug as I make changes.
629              
630             =head2 SUPPORT
631              
632             You can find documentation for this module with the perldoc command.
633              
634             perldoc IntervalTree
635              
636              
637             You can also look for information at:
638              
639             =over 4
640              
641             =item * RT: CPAN's request tracker (report bugs here)
642              
643             L
644              
645             =item * AnnoCPAN: Annotated CPAN documentation
646              
647             L
648              
649             =item * CPAN Ratings
650              
651             L
652              
653             =item * Search CPAN
654              
655             L
656              
657             =back
658              
659             =head2 ACKNOWLEDGEMENTS
660              
661             This code was directly ported from the bx-python project:
662              
663             https://bitbucket.org/james_taylor/bx-python/src/tip/lib/bx/intervals/intersection.pyx
664              
665             Original Authors: James Taylor (james@jamestaylor.org),
666             Ian Schenk (ian.schenck@gmail.com),
667             Brent Pedersen (bpederse@gmail.com)
668              
669             =head2 LICENSE AND COPYRIGHT
670              
671             Copyright 2012 Ben Booth.
672              
673             This program is free software; you can redistribute it and/or modify it
674             under the terms of either: the GNU General Public License as published
675             by the Free Software Foundation; or the Artistic License.
676              
677             See http://dev.perl.org/licenses/ for more information.
678              
679              
680             =cut
681              
682             1; # End of IntervalTree