File Coverage

blib/lib/IntervalTree.pm
Criterion Covered Total %
statement 44 217 20.2
branch 1 118 0.8
condition 0 30 0.0
subroutine 10 38 26.3
pod 10 11 90.9
total 65 414 15.7


line stmt bran cond sub pod time code
1             package IntervalTree;
2              
3 1     1   23016 use 5.006;
  1         5  
  1         50  
4 1     1   959 use POSIX qw(ceil);
  1         9253  
  1         8  
5 1     1   1254 use List::Util qw(max min);
  1         7  
  1         89  
6 1     1   5 use strict;
  1         2  
  1         38  
7 1     1   5 use warnings;
  1         1  
  1         36  
8 1     1   5 no warnings 'once';
  1         1  
  1         1016  
9              
10             our $VERSION = '0.05';
11              
12             =head1 NAME
13              
14             IntervalTree.pm
15              
16             =head1 VERSION
17              
18             Version 0.01
19              
20             =head1 DESCRIPTION
21              
22             Data structure for performing intersect queries on a set of intervals which
23             preserves all information about the intervals (unlike bitset projection methods).
24              
25             =cut
26              
27             # Historical note:
28             # This module original contained an implementation based on sorted endpoints
29             # and a binary search, using an idea from Scott Schwartz and Piotr Berman.
30             # Later an interval tree implementation was implemented by Ian for Galaxy's
31             # join tool (see `bx.intervals.operations.quicksect.py`). This was then
32             # converted to Cython by Brent, who also added support for
33             # upstream/downstream/neighbor queries. This was modified by James to
34             # handle half-open intervals strictly, to maintain sort order, and to
35             # implement the same interface as the original Intersecter.
36              
37             =head1 SYNOPSIS
38              
39              
40             Data structure for performing window intersect queries on a set of
41             of possibly overlapping 1d intervals.
42              
43             Usage
44             =====
45              
46             Create an empty IntervalTree
47              
48             use IntervalTree;
49             my $intersecter = IntervalTree->new();
50              
51             An interval is a start and end position and a value (possibly None).
52             You can add any object as an interval:
53              
54             $intersecter->insert( 0, 10, "food" );
55             $intersecter->insert( 3, 7, {foo=>'bar'} );
56              
57             $intersecter->find( 2, 5 );
58             # output: ['food', {'foo'=>'bar'}]
59              
60             If the object has start and end attributes (like the IntervalTree::Interval class) there
61             is are some shortcuts:
62              
63             my $intersecter = IntervalTree->new();
64             $intersecter->insert_interval( IntervalTree::Interval->new( 0, 10 ) );
65             $intersecter->insert_interval( IntervalTree::Interval->new( 3, 7 ) );
66             $intersecter->insert_interval( IntervalTree::Interval->new( 3, 40 ) );
67             $intersecter->insert_interval( IntervalTree::Interval->new( 13, 50 ) );
68              
69             $intersecter->find( 30, 50 );
70             # output: [IntervalTree::Interval(3, 40), IntervalTree::Interval(13, 50)]
71              
72             $intersecter->find( 100, 200 );
73             # output: []
74              
75             Before/after for intervals
76              
77             $intersecter->before_interval( IntervalTree::Interval->new( 10, 20 ) );
78             # output: [IntervalTree::Interval(3, 7)]
79              
80             $intersecter->before_interval( IntervalTree::Interval->new( 5, 20 ) );
81             # output: []
82              
83             Upstream/downstream
84              
85             $intersecter->upstream_of_interval(IntervalTree::Interval->new(11, 12));
86             # output: [IntervalTree::Interval(0, 10)]
87             $intersecter->upstream_of_interval(IntervalTree::Interval->new(11, 12, undef, undef, "-"));
88             # output: [IntervalTree::Interval(13, 50)]
89              
90             $intersecter.upstream_of_interval(IntervalTree::Interval->new(1, 2, undef, undef, "-"), 3);
91             # output: [IntervalTree::Interval(3, 7), IntervalTree::Interval(3, 40), IntervalTree::Interval(13, 50)]
92              
93             =cut
94            
95             sub new {
96 0     0 0 0 my ( $class ) = @_;
97 0         0 my $self = {};
98 0         0 $self->{root} = undef;
99 0         0 return bless $self, $class;
100             }
101            
102             # ---- Position based interfaces -----------------------------------------
103              
104             =head2 insert
105              
106             Insert the interval [start,end) associated with value `value`.
107              
108             =cut
109              
110             sub insert {
111 0     0 1 0 my ( $self, $start, $end, $value) = @_;
112 0 0       0 if (!defined $self->{root}) {
113 0         0 $self->{root} = IntervalTree::Node->new( $start, $end, $value );
114             }
115             else {
116 0         0 $self->{root} = $self->{root}->insert( $start, $end, $value );
117             }
118             }
119              
120             *add = \&insert;
121              
122             =head2 find
123              
124             Return a sorted list of all intervals overlapping [start,end).
125              
126             =cut
127              
128             sub find {
129 0     0 1 0 my ( $self, $start, $end ) = @_;
130 0 0       0 if (!defined $self->{root}) {
131 0         0 return [];
132             }
133 0         0 return $self->{root}->find( $start, $end );
134             }
135              
136             =head2 before
137              
138             Find `num_intervals` intervals that lie before `position` and are no
139             further than `max_dist` positions away
140              
141             =cut
142            
143             sub before {
144 0     0 1 0 my ( $self, $position, $num_intervals, $max_dist ) = @_;
145 0 0       0 $num_intervals = 1 if !defined $num_intervals;
146 0 0       0 $max_dist = 2500 if !defined $max_dist;
147            
148 0 0       0 if (!defined $self->{root}) {
149 0         0 return [];
150             }
151 0         0 return $self->{root}->left( $position, $num_intervals, $max_dist );
152             }
153              
154             =head2 after
155              
156             Find `num_intervals` intervals that lie after `position` and are no
157             further than `max_dist` positions away
158              
159             =cut
160              
161             sub after {
162 0     0 1 0 my ( $self, $position, $num_intervals, $max_dist) = @_;
163 0 0       0 $num_intervals = 1 if !defined $num_intervals;
164 0 0       0 $max_dist = 2500 if !defined $max_dist;
165              
166 0 0       0 if (!defined $self->{root}) {
167 0         0 return [];
168             }
169 0         0 return $self->{root}->right( $position, $num_intervals, $max_dist );
170             }
171              
172             # ---- Interval-like object based interfaces -----------------------------
173              
174             =head2 insert_interval
175              
176             Insert an "interval" like object (one with at least start and end
177             attributes)
178              
179             =cut
180              
181             sub insert_interval {
182 0     0 1 0 my ( $self, $interval ) = @_;
183 0         0 $self->insert( $interval->{start}, $interval->{end}, $interval );
184             }
185              
186             *add_interval = \&insert_interval;
187              
188             =head2 before_interval
189              
190             Find `num_intervals` intervals that lie completely before `interval`
191             and are no further than `max_dist` positions away
192              
193             =cut
194              
195             sub before_interval {
196 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
197 0 0       0 $num_intervals = 1 if !defined $num_intervals;
198 0 0       0 $max_dist = 2500 if !defined $max_dist;
199            
200 0 0       0 if (!defined $self->{root}) {
201 0         0 return [];
202             }
203 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
204             }
205              
206             =head2 after_interval
207              
208             Find `num_intervals` intervals that lie completely after `interval` and
209             are no further than `max_dist` positions away
210              
211             =cut
212              
213             sub after_interval {
214 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
215 0 0       0 $num_intervals = 1 if !defined $num_intervals;
216 0 0       0 $max_dist = 2500 if !defined $max_dist;
217              
218 0 0       0 if (!defined $self->{root}) {
219 0         0 return [];
220             }
221 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
222             }
223              
224             =head2 upstream_of_interval
225              
226             Find `num_intervals` intervals that lie completely upstream of
227             `interval` and are no further than `max_dist` positions away
228              
229             =cut
230              
231             sub upstream_of_interval {
232 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
233 0 0       0 $num_intervals = 1 if !defined $num_intervals;
234 0 0       0 $max_dist = 2500 if !defined $max_dist;
235              
236 0 0       0 if (!defined $self->{root}) {
237 0         0 return [];
238             }
239 0 0 0     0 if ($interval->{strand} && ($interval->{strand} eq "-" || $interval->{strand} eq "-1")) {
      0        
240 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
241             }
242             else {
243 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
244             }
245             }
246              
247             =head2 downstream_of_interval
248              
249             Find `num_intervals` intervals that lie completely downstream of
250             `interval` and are no further than `max_dist` positions away
251              
252             =cut
253              
254              
255             sub downstream_of_interval {
256 0     0 1 0 my ( $self, $interval, $num_intervals, $max_dist ) = @_;
257 0 0       0 $num_intervals = 1 if !defined $num_intervals;
258 0 0       0 $max_dist = 2500 if !defined $max_dist;
259              
260 0 0       0 if (!defined $self->{root}) {
261 0         0 return [];
262             }
263 0 0 0     0 if ($interval->{strand} && ($interval->{strand} eq "-" || $interval->{strand} eq "-1")) {
      0        
264 0         0 return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist );
265             }
266             else {
267 0         0 return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist );
268             }
269             }
270              
271             =head2 traverse
272            
273             call fn for each element in the tree
274              
275             =cut
276              
277             sub traverse {
278 0     0 1 0 my ($self, $fn) = @_;
279 0 0       0 if (!defined $self->{root}) {
280 0         0 return undef;
281             }
282 0         0 return $self->{root}->traverse($fn);
283             }
284              
285             =head2 IntervalTree::Node
286              
287             A single node of an `IntervalTree`.
288              
289             NOTE: Unless you really know what you are doing, you probably should us
290             `IntervalTree` rather than using this directly.
291              
292             =cut
293              
294             package IntervalTree::Node;
295 1     1   7 use List::Util qw(min max);
  1         9  
  1         2637  
296              
297             our $EmptyNode = IntervalTree::Node->new( 0, 0, IntervalTree::Interval->new(0, 0));
298              
299             sub nlog {
300 1     1   86 return -1.0 / log(0.5);
301             }
302              
303             sub left_node {
304 0     0   0 my ($self) = @_;
305 0 0       0 return $self->{cleft} != $EmptyNode ? $self->{cleft} : undef;
306             }
307              
308             sub right_node {
309 0     0   0 my ($self) = @_;
310 0 0       0 return $self->{cright} != $EmptyNode ? $self->{cright} : undef;
311             }
312              
313             sub root_node {
314 0     0   0 my ($self) = @_;
315 0 0       0 return $self->{croot} != $EmptyNode ? $self->{croot} : undef;
316             }
317            
318             sub str {
319 0     0   0 my ($self) = @_;
320 0         0 return "Node($self->{start}, $self->{end})";
321             }
322              
323             sub new {
324 1     1   2 my ($class, $start, $end, $interval) = @_;
325             # Perl lacks the binomial distribution, so we convert a
326             # uniform into a binomial because it naturally scales with
327             # tree size. Also, perl's uniform is perfect since the
328             # upper limit is not inclusive, which gives us undefined here.
329 1         2 my $self = {};
330 1         3 $self->{priority} = POSIX::ceil(nlog() * log(-1.0/(1.0 * rand() - 1)));
331 1         3 $self->{start} = $start;
332 1         12 $self->{end} = $end;
333 1         2 $self->{interval} = $interval;
334 1         2 $self->{maxend} = $end;
335 1         2 $self->{minstart} = $start;
336 1         1 $self->{minend} = $end;
337 1         3 $self->{cleft} = $EmptyNode;
338 1         2 $self->{cright} = $EmptyNode;
339 1         2 $self->{croot} = $EmptyNode;
340 1         4 return bless $self, $class;
341             }
342              
343             =head2 insert
344            
345             Insert a new IntervalTree::Node into the tree of which this node is
346             currently the root. The return value is the new root of the tree (which
347             may or may not be this node!)
348              
349             =cut
350              
351             sub insert {
352 0     0   0 my ($self, $start, $end, $interval) = @_;
353 0         0 my $croot = $self;
354             # If starts are the same, decide which to add interval to based on
355             # end, thus maintaining sortedness relative to start/end
356 0         0 my $decision_endpoint = $start;
357 0 0       0 if ($start == $self->{start}) {
358 0         0 $decision_endpoint = $end;
359             }
360              
361 0 0       0 if ($decision_endpoint > $self->{start}) {
362             # insert to cright tree
363 0 0       0 if ($self->{cright} != $EmptyNode) {
364 0         0 $self->{cright} = $self->{cright}->insert( $start, $end, $interval );
365             }
366             else {
367 0         0 $self->{cright} = IntervalTree::Node->new( $start, $end, $interval );
368             }
369             # rebalance tree
370 0 0       0 if ($self->{priority} < $self->{cright}{priority}) {
371 0         0 $croot = $self->rotate_left();
372             }
373             }
374             else {
375             # insert to cleft tree
376 0 0       0 if ($self->{cleft} != $EmptyNode) {
377 0         0 $self->{cleft} = $self->{cleft}->insert( $start, $end, $interval);
378             }
379             else {
380 0         0 $self->{cleft} = IntervalTree::Node->new( $start, $end, $interval);
381             }
382             # rebalance tree
383 0 0       0 if ($self->{priority} < $self->{cleft}{priority}) {
384 0         0 $croot = $self->rotate_right();
385             }
386             }
387              
388 0         0 $croot->set_ends();
389 0         0 $self->{cleft}{croot} = $croot;
390 0         0 $self->{cright}{croot} = $croot;
391 0         0 return $croot;
392             }
393              
394             sub rotate_right {
395 0     0   0 my ($self) = @_;
396 0         0 my $croot = $self->{cleft};
397 0         0 $self->{cleft} = $self->{cleft}{cright};
398 0         0 $croot->{cright} = $self;
399 0         0 $self->set_ends();
400 0         0 return $croot;
401             }
402              
403             sub rotate_left {
404 0     0   0 my ($self) = @_;
405 0         0 my $croot = $self->{cright};
406 0         0 $self->{cright} = $self->{cright}{cleft};
407 0         0 $croot->{cleft} = $self;
408 0         0 $self->set_ends();
409 0         0 return $croot;
410             }
411              
412             sub set_ends {
413 0     0   0 my ($self) = @_;
414 0 0 0     0 if ($self->{cright} != $EmptyNode && $self->{cleft} != $EmptyNode) {
    0          
    0          
415 0         0 $self->{maxend} = max($self->{end}, $self->{cright}{maxend}, $self->{cleft}{maxend});
416 0         0 $self->{minend} = min($self->{end}, $self->{cright}{minend}, $self->{cleft}{minend});
417 0         0 $self->{minstart} = min($self->{start}, $self->{cright}{minstart}, $self->{cleft}{minstart});
418             }
419             elsif ( $self->{cright} != $EmptyNode) {
420 0         0 $self->{maxend} = max($self->{end}, $self->{cright}{maxend});
421 0         0 $self->{minend} = min($self->{end}, $self->{cright}{minend});
422 0         0 $self->{minstart} = min($self->{start}, $self->{cright}{minstart});
423             }
424             elsif ( $self->{cleft} != $EmptyNode) {
425 0         0 $self->{maxend} = max($self->{end}, $self->{cleft}{maxend});
426 0         0 $self->{minend} = min($self->{end}, $self->{cleft}{minend});
427 0         0 $self->{minstart} = min($self->{start}, $self->{cleft}{minstart});
428             }
429             }
430              
431              
432             =head2 intersect
433              
434             given a start and a end, return a list of features
435             falling within that range
436              
437             =cut
438              
439             sub intersect {
440 0     0   0 my ( $self, $start, $end, $sort ) = @_;
441 0 0       0 $sort = 1 if !defined $sort;
442 0         0 my $results = [];
443 0         0 $self->_intersect( $start, $end, $results );
444 0         0 return $results;
445             }
446              
447             *find = \&intersect;
448              
449             sub _intersect {
450 0     0   0 my ( $self, $start, $end, $results) = @_;
451             # Left subtree
452 0 0 0     0 if ($self->{cleft} != $EmptyNode && $self->{cleft}{maxend} > $start) {
453 0         0 $self->{cleft}->_intersect( $start, $end, $results );
454             }
455             # This interval
456 0 0 0     0 if (( $self->{end} > $start ) && ( $self->{start} < $end )) {
457 0         0 push @$results, $self->{interval};
458             }
459             # Right subtree
460 0 0 0     0 if ($self->{cright} != $EmptyNode && $self->{start} < $end) {
461 0         0 $self->{cright}->_intersect( $start, $end, $results );
462             }
463             }
464            
465              
466             sub _seek_left {
467 0     0   0 my ($self, $position, $results, $n, $max_dist) = @_;
468             # we know we can bail in these 2 cases.
469 0 0       0 if ($self->{maxend} + $max_dist < $position) {
470 0         0 return;
471             }
472 0 0       0 if ($self->{minstart} > $position) {
473 0         0 return;
474             }
475              
476             # the ordering of these 3 blocks makes it so the results are
477             # ordered nearest to farest from the query position
478 0 0       0 if ($self->{cright} != $EmptyNode) {
479 0         0 $self->{cright}->_seek_left($position, $results, $n, $max_dist);
480             }
481              
482 0 0 0     0 if (-1 < $position - $self->{end} && $position - $self->{end} < $max_dist) {
483 0         0 push @$results, $self->{interval};
484             }
485              
486             # TODO: can these conditionals be more stringent?
487 0 0       0 if ($self->{cleft} != $EmptyNode) {
488 0         0 $self->{cleft}->_seek_left($position, $results, $n, $max_dist);
489             }
490             }
491              
492              
493            
494             sub _seek_right {
495 0     0   0 my ($self, $position, $results, $n, $max_dist) = @_;
496             # we know we can bail in these 2 cases.
497 0 0       0 return if $self->{maxend} < $position;
498 0 0       0 return if $self->{minstart} - $max_dist > $position;
499              
500             #print "SEEK_RIGHT:",self, self.cleft, self.maxend, self.minstart, position
501              
502             # the ordering of these 3 blocks makes it so the results are
503             # ordered nearest to farest from the query position
504 0 0       0 if ($self->{cleft} != $EmptyNode) {
505 0         0 $self->{cleft}->_seek_right($position, $results, $n, $max_dist);
506             }
507              
508 0 0 0     0 if (-1 < $self->{start} - $position && $self->{start} - $position < $max_dist) {
509 0         0 push @$results, $self->{interval};
510             }
511              
512 0 0       0 if ($self->{cright} != $EmptyNode) {
513 0         0 $self->{cright}->_seek_right($position, $results, $n, $max_dist);
514             }
515             }
516              
517             =head2 left
518              
519             find n features with a start > than `position`
520             f: a IntervalTree::Interval object (or anything with an `end` attribute)
521             n: the number of features to return
522             max_dist: the maximum distance to look before giving up.
523              
524             =cut
525            
526             sub left {
527 0     0   0 my ($self, $position, $n, $max_dist) = @_;
528 0 0       0 $n = 1 if !defined $n;
529 0 0       0 $max_dist = 2500 if !defined $max_dist;
530              
531 0         0 my $results = [];
532             # use start - 1 becuase .left() assumes strictly left-of
533 0         0 $self->_seek_left( $position - 1, $results, $n, $max_dist );
534 0 0       0 return $results if scalar(@$results) == $n;
535              
536 0         0 my $r = $results;
537 0         0 @$r = sort {$b->{end} <=> $a->{end}} @$r;
  0         0  
538 0         0 return @$r[0..$n-1];
539             }
540              
541             =head2 right
542              
543             find n features with a end < than position
544             f: a IntervalTree::Interval object (or anything with a `start` attribute)
545             n: the number of features to return
546             max_dist: the maximum distance to look before giving up.
547              
548             =cut
549              
550             sub right {
551 0     0   0 my ($self, $position, $n, $max_dist) = @_;
552 0 0       0 $n = 1 if !defined $n;
553 0 0       0 $max_dist = 2500 if !defined $max_dist;
554              
555 0         0 my $results = [];
556             # use end + 1 because .right() assumes strictly right-of
557 0         0 $self->_seek_right($position + 1, $results, $n, $max_dist);
558 0 0       0 return $results if scalar(@$results) == $n;
559              
560 0         0 my $r = $results;
561 0         0 @$r = sort {$a->{start} <=> $b->{start}} @$r;
  0         0  
562 0         0 return @$r[0..$n-1];
563             }
564              
565             sub traverse {
566 0     0   0 my ($self, $func) = @_;
567 0         0 $self->_traverse($func);
568             }
569              
570             sub _traverse {
571 0     0   0 my ($self, $func) = @_;
572 0 0       0 $self->{cleft}->_traverse($func) if $self->{cleft} != $EmptyNode;
573 0         0 $func->($self);
574 0 0       0 $self->{cright}->_traverse($func) if $self->{cright} != $EmptyNode;
575             }
576              
577             ## ---- Wrappers that retain the old interface -------------------------------
578              
579             =head2 IntervalTree::Interval
580              
581             Basic feature, with required integer start and end properties.
582             Also accepts optional strand as +1 or -1 (used for up/downstream queries),
583             a name, and any arbitrary data is sent in on the info keyword argument
584              
585             $f1 = IntervalTree::Interval->new(23, 36);
586             $f2 = IntervalTree::Interval->new(34, 48, {'chr':12, 'anno':'transposon'});
587             $f2
588             # output: Interval(34, 48, {'anno': 'transposon', 'chr': 12})
589              
590             =cut
591              
592             package IntervalTree::Interval;
593              
594             sub new {
595 1     1   3 my ($class, $start, $end, $value, $chrom, $strand) = @_;
596 1 50       5 die "start must be less than end" unless $start <= $end;
597 1         3 my $self = {};
598 1         3 $self->{start} = $start;
599 1         1 $self->{end} = $end;
600 1         3 $self->{value} = $value;
601 1         1 $self->{chrom} = $chrom;
602 1         2 $self->{strand} = $strand;
603 1         7 return bless $self, $class;
604             }
605              
606             sub str {
607 0     0     my ($self) = @_;
608 0           my $fstr = "Interval($self->{start}, $self->{end}";
609 0 0         if (defined($self->{value})) {
610 0           $fstr .= ", value=$self->{value}";
611             }
612 0           $fstr .= ")";
613 0           return $fstr;
614             }
615              
616             =head2 AUTHOR
617              
618             Ben Booth, C<< >>
619              
620             Original Authors:
621              
622             James Taylor (james@jamestaylor.org),
623             Ian Schenk (ian.schenck@gmail.com),
624             Brent Pedersen (bpederse@gmail.com)
625              
626             =head2 BUGS
627              
628             Please report any bugs or feature requests to C, or through
629             the web interface at L. I will be notified, and then you'll
630             automatically be notified of progress on your bug as I make changes.
631              
632             =head2 SUPPORT
633              
634             You can find documentation for this module with the perldoc command.
635              
636             perldoc IntervalTree
637              
638              
639             You can also look for information at:
640              
641             =over 4
642              
643             =item * RT: CPAN's request tracker (report bugs here)
644              
645             L
646              
647             =item * AnnoCPAN: Annotated CPAN documentation
648              
649             L
650              
651             =item * CPAN Ratings
652              
653             L
654              
655             =item * Search CPAN
656              
657             L
658              
659             =back
660              
661             =head2 ACKNOWLEDGEMENTS
662              
663             This code was directly ported from the bx-python project:
664              
665             https://bitbucket.org/james_taylor/bx-python/src/tip/lib/bx/intervals/intersection.pyx
666              
667             Original Authors:
668              
669             James Taylor (james@jamestaylor.org),
670             Ian Schenk (ian.schenck@gmail.com),
671             Brent Pedersen (bpederse@gmail.com)
672              
673             =head2 LICENSE AND COPYRIGHT
674              
675             Copyright 2012 Ben Booth.
676              
677             This program is free software; you can redistribute it and/or modify it
678             under the terms of either: the GNU General Public License as published
679             by the Free Software Foundation; or the Artistic License.
680              
681             See http://dev.perl.org/licenses/ for more information.
682              
683              
684             =cut
685              
686             1; # End of IntervalTree