File Coverage

blib/lib/Gzip/BinarySearch.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package Gzip::BinarySearch;
2              
3 9     9   291045 use strict;
  9         24  
  9         291  
4 9     9   47 use warnings;
  9         18  
  9         465  
5 9     9   46 use Carp;
  9         20  
  9         931  
6 9     9   52 use List::Util qw(min max);
  9         16  
  9         1958  
7 9     9   15683 use Gzip::RandomAccess;
  0            
  0            
8              
9             our $VERSION = '0.91';
10              
11             BEGIN {
12             use Exporter;
13             our @ISA = qw(Exporter);
14             our @EXPORT_OK = qw(fs_column tsv_column);
15             }
16              
17             # When binary searching, assume this is the biggest line we'll find.
18             # We'll try this first, then try a bigger decompression if it fails.
19             my $DEFAULT_EST_LINE_LENGTH = 512;
20              
21             # When looking for adjacent identical lines (find_all) extract this
22             # many bytes at a time.
23             my $DEFAULT_SURROUNDING_LINES_BLOCKSIZE = 4096;
24              
25             my @GRA_ALLOWED_ARGS = qw(file index_file index_span cleanup);
26             my %ALLOWED_ARGS = map { $_ => 1 } (
27             @GRA_ALLOWED_ARGS, qw(key_func cmp_func),
28             qw(est_line_length surrounding_lines_blocksize),
29             );
30              
31             sub new {
32             my ($class, %args) = @_;
33             my $gzip = $class->_build_gzip(%args);
34             for my $key (keys %args) {
35             $ALLOWED_ARGS{$key} or croak "Invalid argument '$key'";
36             }
37              
38             my $key_func = $args{key_func} || fs_column(qr/\s+/, 1);
39             my $cmp_func = $args{cmp_func} || sub { $main::a cmp $main::b };
40              
41             my $est_line_length = $args{est_line_length}
42             || $DEFAULT_EST_LINE_LENGTH;
43              
44             my $surrounding_lines_blocksize = $args{surrounding_lines_blocksize}
45             || $DEFAULT_SURROUNDING_LINES_BLOCKSIZE;
46              
47             bless {
48             gzip => $gzip,
49             max_offset => $gzip->uncompressed_size - 1,
50             est_line_length => $est_line_length,
51             surrounding_lines_blocksize => $surrounding_lines_blocksize,
52             key_func => $key_func,
53             cmp_func => $cmp_func,
54             }, $class;
55             }
56              
57             sub find {
58             my ($self, $key) = @_;
59              
60             my ($line, $mid) = $self->_find($key);
61             return $line if defined $line;
62             return;
63             }
64              
65             sub find_all {
66             my ($self, $key) = @_;
67              
68             my ($line, $mid) = $self->_find($key);
69             if (defined $line) {
70             return $self->_search_surrounding_lines($key, $line, $mid);
71             }
72             return;
73             }
74              
75             sub gzip { shift->{gzip} }
76             sub est_line_length { shift->{est_line_length} }
77             sub surrounding_lines_blocksize { shift->{surrounding_lines_blocksize} }
78              
79             # Convenience functions
80             sub fs_column {
81             my ($field_sep, $column_number) = @_;
82             croak "Invalid column number, should be 1-based"
83             if $column_number < 1;
84              
85             if (!ref $field_sep && $field_sep eq ' ') {
86             # Force Perl to match a space, not \s+
87             $field_sep = qr/ /;
88             }
89              
90             return sub {
91             chomp;
92             my @f = split $field_sep, $_, -1;
93             return $f[$column_number - 1];
94             }
95             }
96              
97             sub tsv_column {
98             my $column_number = shift;
99             return fs_column("\t", $column_number);
100             }
101              
102             # Given a key, binary search for the first line encountered with this key,
103             # and return both the line and its offset in the uncompressed data.
104             # Return nothing if the key was not found.
105             sub _find {
106             my ($self, $key) = @_;
107             my ($low, $high, $mid) = (0, $self->{max_offset}, undef);
108              
109             while ($low <= $high) {
110             $mid = int(($low + $high) / 2);
111             my $line = $self->_get_line_at($mid);
112             my $line_key = $self->_get_key($line);
113             my $cmp = $self->_compare_keys($line_key, $key);
114              
115             if ($cmp < 0) {
116             $low = $mid + 1;
117             }
118             elsif ($cmp > 0) {
119             $high = $mid - 1;
120             }
121             else {
122             return ($line, $mid);
123             }
124             }
125              
126             # Key not found
127             return;
128             }
129              
130             # Given an offset, return the entire line 'around' that offset.
131             # This includes any trailing newline character.
132             # If the offset is a newline, it will be counted as part of the preceding line.
133             sub _get_line_at {
134             my ($self, $offset) = @_;
135             my $L = $self->{est_line_length};
136              
137             my $seek_start = max($offset - $L, 0); # global offset
138             my $seek_length = $L * 2;
139             my $midpoint = $offset - $seek_start; # relative to extracted $block, not global
140              
141             my $line;
142             while (!defined $line) {
143             my $block = $self->{gzip}->extract($seek_start, $seek_length);
144              
145             my $prev_nl = rindex($block, "\n", $midpoint - 1); # -1 if no match
146             my $next_nl = index($block, "\n", $midpoint);
147              
148             if ($prev_nl == -1 && $seek_start > 0) {
149             my $extension = min($L, $seek_start);
150             $seek_start -= $extension;
151             $seek_length += $extension;
152             $midpoint += $extension;
153             }
154             elsif ($next_nl == -1 && $seek_start + $seek_length < $self->{max_offset}) {
155             $seek_length += $L;
156             }
157             else {
158             $next_nl = length($block) if $next_nl == -1; # no EOF newline - use last char
159             $line = substr($block, $prev_nl + 1, $next_nl - $prev_nl);
160             }
161             }
162              
163             return $line;
164             }
165              
166             sub _get_key {
167             my ($self, $line) = @_;
168             local *::_ = \$line;
169             return $self->{key_func}->();
170             }
171              
172             sub _compare_keys {
173             my ($self, $key1, $key2) = @_;
174             local (*::a, *::b) = (\$key1, \$key2);
175             return $self->{cmp_func}->();
176              
177             }
178              
179             # Given a line and a seekpoint in the uncompressed data, look forwards and backwards
180             # for adjacent lines with the same key. Return matching lines in same order as file.
181             sub _search_surrounding_lines {
182             my ($self, $query, $mid_line, $mid) = @_;
183             # Problem description:
184             # when searching for 'zits', the algorithm expands, then shrinks line-by-line
185             # at the end it trims the string to "zings\nzits\n..." and for some reason the
186             # zits offset isn't used to trim it further.
187             # you'll notice the _compare_keys thing which means $result_start is never set
188             # if we're shrinking (?)
189              
190             my $block_start = max($mid - length($mid_line) + 1, 0);
191             my $block_end = min($mid + length($mid_line) - 1, $self->{max_offset});
192             my $block = $self->{gzip}->extract($block_start, $block_end - $block_start);
193              
194             # offsets for the result range, relative to block_start
195             my ($result_start, $result_end) = (0, $block_end - $block_start);
196              
197             my $shrinking = 0;
198             while (1) {
199             my ($line, $offset) = $self->_first_line($block, $result_start, $result_end, $block_start == 0);
200             if (!defined $line || $self->_compare_keys($self->_get_key($line), $query) == 0) {
201             last if $shrinking;
202             last if $block_start == 0; # can't expand further
203             # Expand left
204             my $ex = min($self->{surrounding_lines_blocksize}, $block_start);
205             $block_start -= $ex;
206             $result_end += $ex;
207             my $chunk = $self->{gzip}->extract($block_start, $ex);
208             $block = $chunk . $block;
209             }
210             else {
211             # Shrink
212             $result_start = $offset + length($line);
213             $shrinking = 1;
214             }
215             }
216             my ($line, $offset) = $self->_first_line($block, $result_start, $result_end, $block_start == 0);
217             $result_start = $offset;
218              
219             $shrinking = 0;
220             while (1) {
221             my ($line, $offset) = $self->_last_line($block, $result_start, $result_end, $result_end == $self->{max_offset});
222             if (!defined $line || $self->_compare_keys($self->_get_key($line), $query) == 0) {
223             last if $shrinking;
224             last if $block_end == $self->{max_offset};
225             # Expand right
226             my $ex = min($self->{surrounding_lines_blocksize}, $self->{max_offset} - $block_end);
227             my $chunk = $self->{gzip}->extract($block_end, $ex);
228             $block_end += $ex;
229             $result_end += $ex;
230             $block .= $chunk;
231             }
232             else {
233             $result_end = $offset - 1;
234             $shrinking = 1;
235             }
236             }
237            
238             my $result = substr($block, $result_start, $result_end - $result_start);
239             return map { "$_\n" } split /\n/, $result;
240             }
241              
242             # Get the first complete line from the block, limiting to a range from
243             # $start to $end (inclusive). If $allow_incomplete, we instead return
244             # all data up to the first \n.
245             # Also return the offset of the start of this line, relative to the
246             # entire text block.
247             # Return nothing if we need more data (left/right) to get a full line.
248             sub _first_line {
249             my ($self, $block, $start, $end, $allow_incomplete) = @_;
250             my $nl = index($block, "\n", $start);
251             $nl = -1 if $nl > $end;
252             if ($allow_incomplete) {
253             $nl = $end if $nl == -1;
254             return (substr($block, $start, $nl - $start + 1), $start);
255             }
256             else {
257             return if $nl == -1; # more data needed
258             my $nl2 = index($block, "\n", $nl + 1);
259             die if $nl2 < $start && $nl2 != -1;
260             return if $nl2 == -1 || $nl2 > $end;
261             return (substr($block, $nl + 1, $nl2 - $nl), $nl + 1);
262             }
263             }
264              
265             # Same deal as above, but for the last line. $allow_incomplete
266             # returns all data from the last \n onwards, and ignores a
267             # trailing newline at the end of the range.
268             sub _last_line {
269             my ($self, $block, $start, $end, $allow_incomplete) = @_;
270             my $nl = rindex($block, "\n", $end);
271             $nl = -1 if $nl < $start;
272             if ($allow_incomplete) {
273             $nl = rindex($block, "\n", $nl - 1) if $nl == $end; # ignore trailing newline
274             $nl = $start - 1 if $nl == -1; # pretend there's one
275             return (substr($block, $nl + 1, $end - $nl), $nl + 1);
276             }
277             else {
278             return if $nl == -1; # more data needed
279             my $nl2 = rindex($block, "\n", $nl - 1);
280             die if $nl2 > $end && $nl2 != -1;
281             return if $nl2 == -1 || $nl2 < $start;
282             return (substr($block, $nl2 + 1, $nl - $nl2), $nl2 + 1);
283             }
284             }
285              
286             # Build and return the Gzip::RandomAccess object.
287             sub _build_gzip {
288             my ($class, %args) = @_;
289              
290             my %gra_args;
291             for my $key (@GRA_ALLOWED_ARGS) {
292             $gra_args{$key} = $args{$key} if exists $args{$key};
293             }
294              
295             return Gzip::RandomAccess->new(%gra_args);
296             }
297              
298             __END__