File Coverage

blib/lib/Compress/BGZF/Reader.pm
Criterion Covered Total %
statement 238 256 92.9
branch 64 94 68.0
condition 11 15 73.3
subroutine 29 32 90.6
pod 9 9 100.0
total 351 406 86.4


line stmt bran cond sub pod time code
1             package Compress::BGZF::Reader;
2              
3 2     2   36163 use strict;
  2         2  
  2         45  
4 2     2   4 use warnings;
  2         2  
  2         32  
5              
6 2     2   6 use Carp;
  2         2  
  2         71  
7 2     2   1067 use Compress::Zlib;
  2         89125  
  2         387  
8 2     2   11 use List::Util qw/sum/;
  2         2  
  2         146  
9              
10 2     2   8 use constant BGZF_MAGIC => pack "H*", '1f8b0804';
  2         2  
  2         82  
11 2     2   6 use constant HEAD_BYTES => 12; # not including extra fields
  2         2  
  2         73  
12 2     2   6 use constant FOOT_BYTES => 8;
  2         2  
  2         3536  
13              
14             ## no critic
15             # allow for filehandle tie'ing
16 3     3   7 sub TIEHANDLE { Compress::BGZF::Reader::new(@_) }
17 39     39   813 sub READ { Compress::BGZF::Reader::_read(@_) }
18 1013     1013   3449 sub READLINE { Compress::BGZF::Reader::getline(@_) }
19 42     42   7876 sub SEEK { Compress::BGZF::Reader::_seek(@_) }
20 3     3   1535 sub CLOSE { close $_[0]->{fh} }
21 3     3   714 sub TELL { return $_[0]->{u_offset} }
22 0     0   0 sub EOF { return $_[0]->{buffer_len} == -1 }
23 3     3   42 sub FILENO { return fileno $_[0]->{fh} }
24              
25             # accessors
26 3     3 1 886 sub usize { return $_[0]->{u_file_size} };
27              
28             ## use critic
29              
30             sub new_filehandle {
31              
32             #-------------------------------------------------------------------------
33             # ARG 0 : BGZF input filename
34             #-------------------------------------------------------------------------
35             # RET 0 : Filehandle GLOB (internally an IO::File object tied to
36             # Compress::BGZF::Reader)
37             #-------------------------------------------------------------------------
38              
39 3     3 1 30 my ($class, $fn_in) = @_;
40 3 50       8 croak "input filename required" if (! defined $fn_in);
41 3 50       11 tie *FH, $class, $fn_in or croak "failed to tie filehandle";
42              
43 3         6 return \*FH;
44              
45             }
46              
47             sub new {
48              
49             #-------------------------------------------------------------------------
50             # ARG 0 : BGZF input filename
51             #-------------------------------------------------------------------------
52             # RET 0 : Compress::BGZF::Reader object
53             #-------------------------------------------------------------------------
54            
55 6     6 1 2128 my ($class, $fn_in) = @_;
56 6         9 my $self = bless {}, $class;
57              
58             # initialize
59 6 50       28 $self->{fn_in} = $fn_in
60             or croak "Input name required";
61              
62             # open compressed file in binary mode
63 6 50       149 open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic
64 6         9 $self->{fh} = $fh;
65              
66             # these member variables allow for data extraction
67 6         8 $self->{buffer} = ''; # contents of currently uncompressed block
68 6         8 $self->{buffer_len} = 0; # save time on frequent length() calls
69 6         6 $self->{block_offset} = 0; # offset of current block in compressed data
70 6         5 $self->{buffer_offset} = 0; # offset of current pos in uncompressed block
71 6         4 $self->{block_size} = 0; # compressed size of current block
72 6         44 $self->{file_size} = -s $fn_in; # size of compressed file
73              
74             # these variables are tracked to allow for full seek implementation
75 6         6 $self->{u_offset} = 0; # calculated current uncompressed offset
76 6         7 $self->{u_file_size} = 0; # size of uncompressed file (filled during
77             # indexing
78              
79             # load offset index
80 6 100       139 if (-e "$fn_in.gzi") {
81 3         8 $self->_load_index( "$fn_in.gzi" ); # from disk
82             }
83             else {
84 3         8 $self->_generate_index(); # on-the-fly
85             }
86              
87             # load initial block
88 6         11 $self->_load_block();
89              
90 6         29 return $self;
91              
92             }
93              
94             sub _load_block {
95            
96             #-------------------------------------------------------------------------
97             # ARG 0 : offset of block start in compressed file
98             #-------------------------------------------------------------------------
99             # no returns
100             #-------------------------------------------------------------------------
101              
102 57     57   71 my ($self, $block_offset) = @_;
103              
104             # loading a block should always reset buffer offset
105 57         53 $self->{buffer_offset} = 0;
106              
107             # avoid reload of current block
108             return if (defined $block_offset
109 57 100 100     142 && $block_offset == $self->{block_offset});
110              
111             # if no offset given (e.g. sequential reads), move to next block
112 44 100       59 if (! defined $block_offset) {
113 25         24 $block_offset = $self->{block_offset} + $self->{block_size};
114             }
115              
116 44         38 $self->{block_offset} = $block_offset;
117             # deal with EOF
118             croak "Read past file end (perhaps corrupted/truncated input?)"
119 44 50       57 if ($self->{block_offset} > $self->{file_size});
120 44 100       63 if ($self->{block_offset} == $self->{file_size}) {
121 11         8 $self->{buffer} = '';
122 11         12 $self->{buffer_len} = -1;
123 11         17 return;
124             }
125              
126             # never assume we're already there
127 33         70 sysseek $self->{fh}, $self->{block_offset}, 0;
128              
129             # parse block according to GZIP spec, including content inflation
130 33         47 my ($block_size, $uncompressed_size, $content)
131             = $self->_unpack_block(1);
132 33         34 $self->{block_size} = $block_size;
133 33         32 $self->{buffer_len} = $uncompressed_size;
134 33         28 $self->{buffer} = $content;
135              
136 33         47 return;
137              
138             }
139              
140             sub _unpack_block {
141              
142             #-------------------------------------------------------------------------
143             # ARG 0 : bool indicating whether to inflate (and return) actual payload
144             #-------------------------------------------------------------------------
145             # RET 0 : compressed block size
146             # RET 1 : uncompressed content size
147             # RET 2 : content (if ARG 0)
148             #-------------------------------------------------------------------------
149              
150 44     44   38 my ($self, $do_unpack) = @_;
151              
152 44         33 my @return_values;
153              
154              
155             my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
156 44         64 _safe_sysread($self->{fh}, HEAD_BYTES);
157 44         64 my $t = sysseek $self->{fh}, 0, 1;
158 44 50       67 croak "invalid header at $t (corrupt file or not BGZF?)"
159             if ($magic ne BGZF_MAGIC);
160              
161             # extract stated block size according to BGZF spec
162 44         31 my $block_size;
163 44         33 my $l = 0;
164 44         63 while ($l < $len_extra) {
165             my ($field_id, $field_len) = unpack 'A2v',
166 44         52 _safe_sysread($self->{fh}, 4);
167 44 50       80 if ($field_id eq 'BC') {
168 44 50       55 croak "invalid BC length" if ($field_len != 2);
169 44 50       55 croak "multiple BC fields" if (defined $block_size);
170             $block_size = unpack 'v',
171 44         55 _safe_sysread($self->{fh} => $field_len);
172 44         48 $block_size += 1; # convert to 1-based
173             }
174 44         69 $l += 4 + $field_len;
175             }
176 44 50       57 croak "invalid extra field length" if ($l != $len_extra);
177 44 50       61 croak "failed to read block size" if (! defined $block_size);
178              
179 44         36 push @return_values, $block_size;
180 44         43 my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra;
181 44         27 my $content;
182 44 100       51 if ($do_unpack) {
183            
184             # decode actual content
185 33         39 my $payload = _safe_sysread($self->{fh}, $payload_len);
186 33         104 my ($i,$status) = inflateInit(-WindowBits => -&MAX_WBITS());
187 33 50       3085 croak "Error during inflate init\n" if ($status != Z_OK);
188 33         121 ($content,$status) = $i->inflate($payload);
189 33 50       3830 croak "Error during inflate run\n" if ($status != Z_STREAM_END);
190             #rawinflate( \$payload => \$content )
191             #or croak "Error inflating: $RawInflateError\n";
192 33         150 my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
193 33 50       3974 croak "content CRC32 mismatch" if ( $crc_given != crc32($content) );
194              
195             }
196 11         16 else { sysseek $self->{fh}, $payload_len + 4, 1; }
197              
198             # unpack and possibly check uncompressed payload size
199 44         58 my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
200 44 50 66     143 croak "content length mismatch"
201             if ( defined $content && $size_given != length($content) );
202 44         40 push @return_values, $size_given;
203 44 100       61 push @return_values, $content if (defined $content);
204              
205 44         79 return @return_values;
206              
207             }
208              
209             sub read_data {
210              
211             # More OO-ish wrapper around _read(), avoids conflicts with system read()
212              
213             #-------------------------------------------------------------------------
214             # ARG 0 : number of bytes to read
215             #-------------------------------------------------------------------------
216             # RET 0 : data read
217             #-------------------------------------------------------------------------
218              
219 3     3 1 15 my ($self, $bytes) = @_;
220              
221 3         5 my $r = $self->_read( my $buffer, $bytes );
222             carp "received fewer bytes than requested"
223 3 50 33     12 if ($r < $bytes && $self->{buffer_len} > -1);
224            
225 3 50       4 $buffer = undef if ( $r < 1 );
226 3         5 return $buffer;
227            
228             }
229              
230             sub _read {
231              
232             #-------------------------------------------------------------------------
233             # ARG 0 : buffer to write to
234             # ARG 1 : bytes to attempt to read
235             # ARG 3 : (optional) offset in buffer to start write (default: 0)
236             #-------------------------------------------------------------------------
237             # RET 0 : bytes read (0 at EOF, undef on error)
238             #-------------------------------------------------------------------------
239              
240             # we try to emulate the built-in 'read' call, so we will
241             # overwrite $_[1] and return the number of bytes read. To facilitate this,
242             # make $buf a reference to the buffer passed
243 42     42   37 my $self = shift;
244 42         32 my $buf = \shift; # must be a reference !!
245 42         33 my $bytes = shift;
246 42         25 my $offset = shift;
247              
248             # handle cases when $offset is passed in
249 42         50 my $prefix = '';
250 42 100 100     120 if (defined $offset && $offset != 0) {
251 12         20 $prefix = substr $$buf, 0, $offset;
252 12 100       22 $prefix .= "\0" x ( $offset - length($$buf) )
253             if ( $offset > length($$buf) );
254             }
255              
256 42         41 $$buf = ''; # reset (only AFTER grabbing any prefix above)
257              
258             ITER:
259 42         70 while (length($$buf) < $bytes) {
260              
261 57         46 my $l = length($$buf);
262 57         44 my $remaining = $bytes - $l;
263              
264             # if read is within current block
265 57 100       85 if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) {
266 19         136 $$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining;
267 19         19 $self->{buffer_offset} += $remaining;
268             $self->_load_block()
269 19 50       46 if ($self->{buffer_offset} == $self->{buffer_len});
270             }
271             else {
272 38 100       63 last ITER if ($self->{buffer_len} < 0); #EOF
273 15         125 $$buf .= substr $self->{buffer}, $self->{buffer_offset};
274 15         20 $self->_load_block();
275             }
276              
277             }
278              
279 42         32 my $l = length($$buf);
280 42         36 $self->{u_offset} += $l;
281 42         154 $$buf = $prefix . $$buf;
282              
283 42         66 return $l;
284              
285             }
286              
287              
288             sub getline {
289              
290             #-------------------------------------------------------------------------
291             # No arguments
292             #-------------------------------------------------------------------------
293             # RET 0 : string read (undef at EOF)
294             #-------------------------------------------------------------------------
295              
296 1022     1022 1 768 my ($self) = @_;
297              
298 1022         560 my $data;
299              
300 1022         597 while (1) {
301              
302             # return immediately if EOF
303 1026 100       1228 last if ($self->{buffer_len} < 0);
304              
305             # search current block for record separator
306             # start searching from the current buffer offset
307 1009         1027 pos($self->{buffer}) = $self->{buffer_offset};
308              
309 1009 100       2133 if ($self->{buffer} =~ m|$/|g) {
310 1005         625 my $pos = pos $self->{buffer};
311             $data .= substr $self->{buffer}, $self->{buffer_offset},
312 1005         1318 $pos - $self->{buffer_offset};
313              
314 1005         619 $self->{buffer_offset} = $pos;
315              
316             # if we're at the end of the block, load next
317 1005 50       1117 $self->_load_block if ($pos == $self->{buffer_len});
318              
319 1005         626 $self->{u_offset} += length($data);
320              
321 1005         660 last;
322              
323             }
324              
325             # otherwise, add rest of block to data and load next block
326 4         8 $data .= substr $self->{buffer}, $self->{buffer_offset};
327 4         9 $self->_load_block;
328              
329             }
330              
331 1022 100       2168 return length($data) > 0 ? $data : undef;
332              
333             }
334              
335             sub write_index {
336              
337             # index format (as defined by htslib) consists of little-endian
338             # int64-coded values. The first value is the number of offsets in the
339             # index. The rest of the values consist of pairs of block offsets relative
340             # to the compressed and uncompressed data. The first offset (always 0,0)
341             # is not included.
342              
343             #-------------------------------------------------------------------------
344             # ARG 0 : index output filename
345             #-------------------------------------------------------------------------
346             # No returns
347             #-------------------------------------------------------------------------
348              
349 3     3 1 10 my ($self, $fn_out) = @_;
350              
351 3 50       6 croak "missing index output filename" if (! defined $fn_out);
352              
353              
354 3 50       6 $self->_generate_index() if (! defined $self->{idx});
355 3         3 my @offsets = @{ $self->{idx} };
  3         34  
356 3         4 shift @offsets; # don't write first
357              
358 3         114 open my $fh_out, '>:raw', $fn_out;
359              
360 3         4 print {$fh_out} pack('Q<', scalar(@offsets));
  3         14  
361 3         7 for (@offsets) {
362 8         3 print {$fh_out} pack('Q<', $_->[0]);
  8         11  
363 8         6 print {$fh_out} pack('Q<', $_->[1]);
  8         12  
364             }
365              
366 3         80 close $fh_out;
367              
368 3         10 return;
369              
370             }
371              
372             sub _load_index {
373              
374             #-------------------------------------------------------------------------
375             # ARG 0 : index input filename
376             #-------------------------------------------------------------------------
377             # No returns
378             #-------------------------------------------------------------------------
379              
380 3     3   3 my ($self, $fn_in) = @_;
381 3 50       7 croak "missing index input filename" if (! defined $fn_in);
382              
383             #TODO: speed up index parsing
384              
385 3 50       45 open my $fh_in, '<:raw', $fn_in or croak "error opening index";
386 3 50       27 read( $fh_in, my $n_offsets, 8)
387             or croak "failed to read first quad";
388 3         8 $n_offsets = unpack 'Q<', $n_offsets;
389 3         3 my @idx;
390 3         7 for (0..$n_offsets-1) {
391 8 50       13 read( $fh_in, my $buff, 16) or croak "error reading index";
392 8         20 $idx[$_] = [ unpack 'Q
393             }
394 3         13 close $fh_in;
395 3         7 unshift @idx, [0,0]; # add initial offsets
396              
397 3         3 $self->{u_file_size} = $idx[-1]->[1];
398              
399             # some indices created by htslib bgzip are missing last offset pair
400             # check for that here by loading last block in index and proceeding from
401             # there. Also calculate uncompressed file size at the same time.
402 3         4 my $c_size = $idx[-1]->[0];
403 3         7 sysseek $self->{fh}, $idx[-1]->[0], 0;
404 3         7 my ($c, $u) = $self->_unpack_block(0);
405 3         2 $self->{u_file_size} += $u;
406 3         3 $c_size += $c;
407 3         8 while ($c_size < $self->{file_size}) {
408 0         0 push @idx, [$idx[-1]->[0]+$c, $idx[-1]->[1]+$u];
409 0         0 sysseek $self->{fh}, $idx[-1]->[0], 0;
410 0         0 ($c, $u) = $self->_unpack_block(0);
411 0         0 $self->{u_file_size} += $u;
412 0         0 $c_size += $c;
413             }
414             croak "Unexpected file size/last index mismatch ($c_size v $self->{file_size})"
415 3 50       7 if ($c_size != $self->{file_size});
416              
417 3         6 $self->{idx} = [@idx];
418 3         16 $self->{ridx}->{$_->[0]} = $_->[1] for (@idx);
419              
420 3         6 sysseek $self->{fh}, $self->{block_offset}, 0;
421              
422 3         10 return;
423              
424             }
425              
426             sub _generate_index {
427              
428             #-------------------------------------------------------------------------
429             # No arguments
430             #-------------------------------------------------------------------------
431             # No returns
432             #-------------------------------------------------------------------------
433              
434 3     3   4 my ($self) = @_;
435              
436 3         3 my $uncmp_offset = 0;
437 3         2 my $cmp_offset = 0;
438 3         3 my $i = 0;
439 3         4 $self->{u_file_size} = 0;
440 3         4 $self->{idx} = [];
441 3         4 $self->{ridx} = {};
442              
443 3         7 sysseek $self->{fh}, 0, 0;
444              
445 3         7 while ($cmp_offset < $self->{file_size}) {
446              
447 8         8 push @{$self->{idx}}, [$cmp_offset, $uncmp_offset];
  8         12  
448 8         11 $self->{ridx}->{$cmp_offset} = $uncmp_offset;
449              
450 8         12 my ($block_size, $uncompressed_size) = $self->_unpack_block(0);
451              
452 8         6 $cmp_offset += $block_size;
453 8         6 $uncmp_offset += $uncompressed_size;
454 8         12 $self->{u_file_size} += $uncompressed_size;
455              
456             }
457              
458 3         5 sysseek $self->{fh}, $self->{block_offset}, 0;
459              
460 3         5 return;
461              
462             }
463              
464              
465             sub move_to {
466              
467             # Wrapper around _seek(), avoids conflicts with system seek()
468              
469             #-------------------------------------------------------------------------
470             # ARG 0 : byte offset to which to seek
471             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
472             #-------------------------------------------------------------------------
473             # no returns
474             #-------------------------------------------------------------------------
475              
476            
477 12     12 1 4280 my ($self, @args) = @_;
478 12         21 $self->_seek( @args );
479              
480 12         17 return;
481              
482             }
483              
484              
485             sub _seek {
486              
487             #-------------------------------------------------------------------------
488             # ARG 0 : byte offset to which to seek
489             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
490             #-------------------------------------------------------------------------
491             # no returns
492             #-------------------------------------------------------------------------
493              
494 54     54   59 my ($self, $pos, $whence) = @_;
495              
496 54 100       96 $pos += $self->{u_offset} if ($whence == 1);
497 54 100       75 $pos = $self->{u_file_size} + $pos if ($whence == 2);
498              
499 54 100       73 return if ($pos < 0);
500 47 100       72 if ($pos >= $self->{u_file_size}) {
501 15         16 $self->{buffer_len} = -1;
502 15         10 $self->{u_offset} = $pos;
503 15         15 $self->{block_offset} = $pos;
504 15         24 return 1;
505             }
506              
507             # Do seeded search for nearest block start <= $pos
508             # (although we don't know the size of each block, we can determine the
509             # mean length and usually choose a starting value close to the actual -
510             # benchmarks much faster than binary search)
511             # TODO: benchmark whether breaking this out and Memoizing speeds things up
512              
513 32         22 my $s = scalar @{$self->{idx}};
  32         39  
514 32         65 my $idx = int($pos/($self->{u_file_size}) * $s);
515 32         16 while (1) {
516 35 100       61 if ($pos < $self->{idx}->[$idx]->[1]) {
517 3         3 --$idx;
518 3         3 next;
519             }
520 32 50 66     99 if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
521 0         0 ++$idx;
522 0         0 next;
523             }
524 32         29 last;
525             }
526              
527 32         58 my $block_o = $self->{idx}->[$idx]->[0];
528 32         28 my $block_o_u = $self->{idx}->[$idx]->[1];
529 32         27 my $buff_o = $pos - $block_o_u;
530              
531 32         41 $self->_load_block( $block_o );
532 32         27 $self->{buffer_offset} = $buff_o;
533 32         30 $self->{u_offset} = $block_o_u + $buff_o;
534              
535 32         58 return 1;
536              
537             }
538              
539             sub get_vo {
540              
541             #-------------------------------------------------------------------------
542             # no arguments
543             #-------------------------------------------------------------------------
544             # RET 0 : virtual offset of current position
545             #-------------------------------------------------------------------------
546              
547 0     0 1 0 my ($self) = @_;
548 0         0 return ($self->{block_offset} << 16) | $self->{buffer_offset};
549              
550             }
551              
552             sub move_to_vo {
553              
554             #-------------------------------------------------------------------------
555             # ARG 0 : virtual offset (see POD for definition)
556             #-------------------------------------------------------------------------
557             # no returns
558             #-------------------------------------------------------------------------
559              
560 0     0 1 0 my ($self, $vo) = @_;
561 0         0 my $block_o = $vo >> 16;
562 0         0 my $buff_o = $vo ^ ($block_o << 16);
563 0         0 $self->_load_block( $block_o );
564 0         0 $self->{buffer_offset} = $buff_o;
565 0 0       0 croak "invalid block offset" if (! defined $self->{ridx}->{$block_o});
566 0         0 $self->{u_offset} = $self->{ridx}->{$block_o} + $buff_o;
567              
568 0         0 return;
569              
570             }
571              
572             sub _safe_sysread {
573              
574             # sysread wrapper that checks return count and returns read
575             # (internally we should never read off end of file - doing so indicates
576             # either a software bug or a corrupt input file so we croak)
577              
578             #-------------------------------------------------------------------------
579             # ARG 0 : bytes to read
580             #-------------------------------------------------------------------------
581             # RET 0 : string read
582             #-------------------------------------------------------------------------
583              
584 242     242   190 my ($fh, $len) = @_;
585 242         160 my $buf = '';
586 242         813 my $r = sysread $fh, $buf, $len;
587 242 50       308 croak "returned unexpected byte count" if ($r != $len);
588              
589 242         455 return $buf;
590              
591             }
592              
593             1;
594              
595              
596             __END__