File Coverage

blib/lib/Compress/BGZF/Reader.pm
Criterion Covered Total %
statement 242 260 93.0
branch 64 94 68.0
condition 11 15 73.3
subroutine 30 33 90.9
pod 9 9 100.0
total 356 411 86.6


line stmt bran cond sub pod time code
1             package Compress::BGZF::Reader;
2              
3 2     2   28295 use strict;
  2         3  
  2         44  
4 2     2   5 use warnings;
  2         3  
  2         46  
5              
6 2     2   7 use Carp;
  2         3  
  2         89  
7 2     2   1058 use Compress::Zlib;
  2         92117  
  2         398  
8 2     2   12 use List::Util qw/sum/;
  2         3  
  2         130  
9 2     2   849 use FileHandle;
  2         1201  
  2         7  
10              
11 2     2   484 use constant BGZF_MAGIC => pack "H*", '1f8b0804';
  2         3  
  2         95  
12 2     2   7 use constant HEAD_BYTES => 12; # not including extra fields
  2         2  
  2         69  
13 2     2   6 use constant FOOT_BYTES => 8;
  2         5  
  2         3666  
14              
15             ## no critic
16             # allow for filehandle tie'ing
17 3     3   9 sub TIEHANDLE { Compress::BGZF::Reader::new(@_) }
18 39     39   812 sub READ { Compress::BGZF::Reader::_read(@_) }
19 1013     1013   3272 sub READLINE { Compress::BGZF::Reader::getline(@_) }
20 42     42   7997 sub SEEK { Compress::BGZF::Reader::_seek(@_) }
21 3     3   1706 sub CLOSE { close $_[0]->{fh} }
22 3     3   746 sub TELL { return $_[0]->{u_offset} }
23 0     0   0 sub EOF { return $_[0]->{buffer_len} == -1 }
24 3     3   46 sub FILENO { return fileno $_[0]->{fh} }
25              
26             # accessors
27 3     3 1 916 sub usize { return $_[0]->{u_file_size} };
28              
29             ## use critic
30              
31             sub new_filehandle {
32              
33             #-------------------------------------------------------------------------
34             # ARG 0 : BGZF input filename
35             #-------------------------------------------------------------------------
36             # RET 0 : Filehandle GLOB (internally an IO::File object tied to
37             # Compress::BGZF::Reader)
38             #-------------------------------------------------------------------------
39              
40 3     3 1 40 my ($class, $fn_in) = @_;
41 3 50       10 croak "input filename required" if (! defined $fn_in);
42 3         20 my $fh = FileHandle->new;
43 3 50       107 tie *$fh, $class, $fn_in or croak "failed to tie filehandle";
44             #tie *FH, $class, $fn_in or croak "failed to tie filehandle";
45              
46 3         5 return $fh;
47             #return \*FH;
48              
49             }
50              
51             sub new {
52              
53             #-------------------------------------------------------------------------
54             # ARG 0 : BGZF input filename
55             #-------------------------------------------------------------------------
56             # RET 0 : Compress::BGZF::Reader object
57             #-------------------------------------------------------------------------
58            
59 6     6 1 2238 my ($class, $fn_in) = @_;
60 6         9 my $self = bless {}, $class;
61              
62             # initialize
63 6 50       24 $self->{fn_in} = $fn_in
64             or croak "Input name required";
65              
66             # open compressed file in binary mode
67 6 50       161 open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic
68 6         10 $self->{fh} = $fh;
69              
70             # these member variables allow for data extraction
71 6         7 $self->{buffer} = ''; # contents of currently uncompressed block
72 6         8 $self->{buffer_len} = 0; # save time on frequent length() calls
73 6         6 $self->{block_offset} = 0; # offset of current block in compressed data
74 6         6 $self->{buffer_offset} = 0; # offset of current pos in uncompressed block
75 6         5 $self->{block_size} = 0; # compressed size of current block
76 6         47 $self->{file_size} = -s $fn_in; # size of compressed file
77              
78             # these variables are tracked to allow for full seek implementation
79 6         6 $self->{u_offset} = 0; # calculated current uncompressed offset
80 6         9 $self->{u_file_size} = 0; # size of uncompressed file (filled during
81             # indexing
82              
83             # load offset index
84 6 100       175 if (-e "$fn_in.gzi") {
85 3         9 $self->_load_index( "$fn_in.gzi" ); # from disk
86             }
87             else {
88 3         10 $self->_generate_index(); # on-the-fly
89             }
90              
91             # load initial block
92 6         14 $self->_load_block();
93              
94 6         19 return $self;
95              
96             }
97              
98             sub _load_block {
99            
100             #-------------------------------------------------------------------------
101             # ARG 0 : offset of block start in compressed file
102             #-------------------------------------------------------------------------
103             # no returns
104             #-------------------------------------------------------------------------
105              
106 57     57   52 my ($self, $block_offset) = @_;
107              
108             # loading a block should always reset buffer offset
109 57         54 $self->{buffer_offset} = 0;
110              
111             # avoid reload of current block
112             return if (defined $block_offset
113 57 100 100     146 && $block_offset == $self->{block_offset});
114              
115             # if no offset given (e.g. sequential reads), move to next block
116 44 100       65 if (! defined $block_offset) {
117 25         26 $block_offset = $self->{block_offset} + $self->{block_size};
118             }
119              
120 44         36 $self->{block_offset} = $block_offset;
121             # deal with EOF
122             croak "Read past file end (perhaps corrupted/truncated input?)"
123 44 50       58 if ($self->{block_offset} > $self->{file_size});
124 44 100       58 if ($self->{block_offset} == $self->{file_size}) {
125 11         9 $self->{buffer} = '';
126 11         11 $self->{buffer_len} = -1;
127 11         18 return;
128             }
129              
130             # never assume we're already there
131 33         61 sysseek $self->{fh}, $self->{block_offset}, 0;
132              
133             # parse block according to GZIP spec, including content inflation
134 33         50 my ($block_size, $uncompressed_size, $content)
135             = $self->_unpack_block(1);
136 33         35 $self->{block_size} = $block_size;
137 33         34 $self->{buffer_len} = $uncompressed_size;
138 33         32 $self->{buffer} = $content;
139              
140 33         40 return;
141              
142             }
143              
144             sub _unpack_block {
145              
146             #-------------------------------------------------------------------------
147             # ARG 0 : bool indicating whether to inflate (and return) actual payload
148             #-------------------------------------------------------------------------
149             # RET 0 : compressed block size
150             # RET 1 : uncompressed content size
151             # RET 2 : content (if ARG 0)
152             #-------------------------------------------------------------------------
153              
154 44     44   39 my ($self, $do_unpack) = @_;
155              
156 44         35 my @return_values;
157              
158              
159             my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
160 44         62 _safe_sysread($self->{fh}, HEAD_BYTES);
161 44         75 my $t = sysseek $self->{fh}, 0, 1;
162 44 50       67 croak "invalid header at $t (corrupt file or not BGZF?)"
163             if ($magic ne BGZF_MAGIC);
164              
165             # extract stated block size according to BGZF spec
166 44         31 my $block_size;
167 44         29 my $l = 0;
168 44         65 while ($l < $len_extra) {
169             my ($field_id, $field_len) = unpack 'A2v',
170 44         57 _safe_sysread($self->{fh}, 4);
171 44 50       77 if ($field_id eq 'BC') {
172 44 50       55 croak "invalid BC length" if ($field_len != 2);
173 44 50       53 croak "multiple BC fields" if (defined $block_size);
174             $block_size = unpack 'v',
175 44         75 _safe_sysread($self->{fh} => $field_len);
176 44         49 $block_size += 1; # convert to 1-based
177             }
178 44         72 $l += 4 + $field_len;
179             }
180 44 50       51 croak "invalid extra field length" if ($l != $len_extra);
181 44 50       54 croak "failed to read block size" if (! defined $block_size);
182              
183 44         41 push @return_values, $block_size;
184 44         42 my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra;
185 44         34 my $content;
186 44 100       56 if ($do_unpack) {
187            
188             # decode actual content
189 33         38 my $payload = _safe_sysread($self->{fh}, $payload_len);
190 33         106 my ($i,$status) = inflateInit(-WindowBits => -&MAX_WBITS());
191 33 50       3069 croak "Error during inflate init\n" if ($status != Z_OK);
192 33         116 ($content,$status) = $i->inflate($payload);
193 33 50       3846 croak "Error during inflate run\n" if ($status != Z_STREAM_END);
194             #rawinflate( \$payload => \$content )
195             #or croak "Error inflating: $RawInflateError\n";
196 33         157 my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
197 33 50       3999 croak "content CRC32 mismatch" if ( $crc_given != crc32($content) );
198              
199             }
200 11         20 else { sysseek $self->{fh}, $payload_len + 4, 1; }
201              
202             # unpack and possibly check uncompressed payload size
203 44         61 my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
204 44 50 66     149 croak "content length mismatch"
205             if ( defined $content && $size_given != length($content) );
206 44         45 push @return_values, $size_given;
207 44 100       66 push @return_values, $content if (defined $content);
208              
209 44         82 return @return_values;
210              
211             }
212              
213             sub read_data {
214              
215             # More OO-ish wrapper around _read(), avoids conflicts with system read()
216              
217             #-------------------------------------------------------------------------
218             # ARG 0 : number of bytes to read
219             #-------------------------------------------------------------------------
220             # RET 0 : data read
221             #-------------------------------------------------------------------------
222              
223 3     3 1 15 my ($self, $bytes) = @_;
224              
225 3         7 my $r = $self->_read( my $buffer, $bytes );
226             carp "received fewer bytes than requested"
227 3 50 33     11 if ($r < $bytes && $self->{buffer_len} > -1);
228            
229 3 50       4 $buffer = undef if ( $r < 1 );
230 3         7 return $buffer;
231            
232             }
233              
234             sub _read {
235              
236             #-------------------------------------------------------------------------
237             # ARG 0 : buffer to write to
238             # ARG 1 : bytes to attempt to read
239             # ARG 3 : (optional) offset in buffer to start write (default: 0)
240             #-------------------------------------------------------------------------
241             # RET 0 : bytes read (0 at EOF, undef on error)
242             #-------------------------------------------------------------------------
243              
244             # we try to emulate the built-in 'read' call, so we will
245             # overwrite $_[1] and return the number of bytes read. To facilitate this,
246             # make $buf a reference to the buffer passed
247 42     42   36 my $self = shift;
248 42         41 my $buf = \shift; # must be a reference !!
249 42         28 my $bytes = shift;
250 42         30 my $offset = shift;
251              
252             # handle cases when $offset is passed in
253 42         35 my $prefix = '';
254 42 100 100     125 if (defined $offset && $offset != 0) {
255 12         20 $prefix = substr $$buf, 0, $offset;
256 12 100       23 $prefix .= "\0" x ( $offset - length($$buf) )
257             if ( $offset > length($$buf) );
258             }
259              
260 42         35 $$buf = ''; # reset (only AFTER grabbing any prefix above)
261              
262             ITER:
263 42         70 while (length($$buf) < $bytes) {
264              
265 57         35 my $l = length($$buf);
266 57         41 my $remaining = $bytes - $l;
267              
268             # if read is within current block
269 57 100       98 if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) {
270 19         56 $$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining;
271 19         17 $self->{buffer_offset} += $remaining;
272             $self->_load_block()
273 19 50       47 if ($self->{buffer_offset} == $self->{buffer_len});
274             }
275             else {
276 38 100       65 last ITER if ($self->{buffer_len} < 0); #EOF
277 15         175 $$buf .= substr $self->{buffer}, $self->{buffer_offset};
278 15         24 $self->_load_block();
279             }
280              
281             }
282              
283 42         37 my $l = length($$buf);
284 42         29 $self->{u_offset} += $l;
285 42         183 $$buf = $prefix . $$buf;
286              
287 42         68 return $l;
288              
289             }
290              
291              
292             sub getline {
293              
294             #-------------------------------------------------------------------------
295             # No arguments
296             #-------------------------------------------------------------------------
297             # RET 0 : string read (undef at EOF)
298             #-------------------------------------------------------------------------
299              
300 1022     1022 1 740 my ($self) = @_;
301              
302 1022         578 my $data;
303              
304 1022         600 while (1) {
305              
306             # return immediately if EOF
307 1026 100       1275 last if ($self->{buffer_len} < 0);
308              
309             # search current block for record separator
310             # start searching from the current buffer offset
311 1009         1030 pos($self->{buffer}) = $self->{buffer_offset};
312              
313 1009 100       2116 if ($self->{buffer} =~ m|$/|g) {
314 1005         733 my $pos = pos $self->{buffer};
315             $data .= substr $self->{buffer}, $self->{buffer_offset},
316 1005         1388 $pos - $self->{buffer_offset};
317              
318 1005         632 $self->{buffer_offset} = $pos;
319              
320             # if we're at the end of the block, load next
321 1005 50       1113 $self->_load_block if ($pos == $self->{buffer_len});
322              
323 1005         689 $self->{u_offset} += length($data);
324              
325 1005         704 last;
326              
327             }
328              
329             # otherwise, add rest of block to data and load next block
330 4         8 $data .= substr $self->{buffer}, $self->{buffer_offset};
331 4         6 $self->_load_block;
332              
333             }
334              
335 1022 100       2202 return length($data) > 0 ? $data : undef;
336              
337             }
338              
339             sub write_index {
340              
341             # index format (as defined by htslib) consists of little-endian
342             # int64-coded values. The first value is the number of offsets in the
343             # index. The rest of the values consist of pairs of block offsets relative
344             # to the compressed and uncompressed data. The first offset (always 0,0)
345             # is not included.
346              
347             #-------------------------------------------------------------------------
348             # ARG 0 : index output filename
349             #-------------------------------------------------------------------------
350             # No returns
351             #-------------------------------------------------------------------------
352              
353 3     3 1 11 my ($self, $fn_out) = @_;
354              
355 3 50       6 croak "missing index output filename" if (! defined $fn_out);
356              
357              
358 3 50       4 $self->_generate_index() if (! defined $self->{idx});
359 3         3 my @offsets = @{ $self->{idx} };
  3         22  
360 3         4 shift @offsets; # don't write first
361              
362 3         122 open my $fh_out, '>:raw', $fn_out;
363              
364 3         3 print {$fh_out} pack('Q<', scalar(@offsets));
  3         34  
365 3         8 for (@offsets) {
366 8         6 print {$fh_out} pack('Q<', $_->[0]);
  8         13  
367 8         5 print {$fh_out} pack('Q<', $_->[1]);
  8         13  
368             }
369              
370 3         75 close $fh_out;
371              
372 3         11 return;
373              
374             }
375              
376             sub _load_index {
377              
378             #-------------------------------------------------------------------------
379             # ARG 0 : index input filename
380             #-------------------------------------------------------------------------
381             # No returns
382             #-------------------------------------------------------------------------
383              
384 3     3   3 my ($self, $fn_in) = @_;
385 3 50       8 croak "missing index input filename" if (! defined $fn_in);
386              
387             #TODO: speed up index parsing
388              
389 3 50       49 open my $fh_in, '<:raw', $fn_in or croak "error opening index";
390 3 50       27 read( $fh_in, my $n_offsets, 8)
391             or croak "failed to read first quad";
392 3         8 $n_offsets = unpack 'Q<', $n_offsets;
393 3         3 my @idx;
394 3         8 for (0..$n_offsets-1) {
395 8 50       14 read( $fh_in, my $buff, 16) or croak "error reading index";
396 8         18 $idx[$_] = [ unpack 'Q
397             }
398 3         16 close $fh_in;
399 3         6 unshift @idx, [0,0]; # add initial offsets
400              
401 3         4 $self->{u_file_size} = $idx[-1]->[1];
402              
403             # some indices created by htslib bgzip are missing last offset pair
404             # check for that here by loading last block in index and proceeding from
405             # there. Also calculate uncompressed file size at the same time.
406 3         3 my $c_size = $idx[-1]->[0];
407 3         7 sysseek $self->{fh}, $idx[-1]->[0], 0;
408 3         7 my ($c, $u) = $self->_unpack_block(0);
409 3         3 $self->{u_file_size} += $u;
410 3         4 $c_size += $c;
411 3         4 while ($c_size < $self->{file_size}) {
412 0         0 push @idx, [$idx[-1]->[0]+$c, $idx[-1]->[1]+$u];
413 0         0 sysseek $self->{fh}, $idx[-1]->[0], 0;
414 0         0 ($c, $u) = $self->_unpack_block(0);
415 0         0 $self->{u_file_size} += $u;
416 0         0 $c_size += $c;
417             }
418             croak "Unexpected file size/last index mismatch ($c_size v $self->{file_size})"
419 3 50       7 if ($c_size != $self->{file_size});
420              
421 3         6 $self->{idx} = [@idx];
422 3         18 $self->{ridx}->{$_->[0]} = $_->[1] for (@idx);
423              
424 3         7 sysseek $self->{fh}, $self->{block_offset}, 0;
425              
426 3         10 return;
427              
428             }
429              
430             sub _generate_index {
431              
432             #-------------------------------------------------------------------------
433             # No arguments
434             #-------------------------------------------------------------------------
435             # No returns
436             #-------------------------------------------------------------------------
437              
438 3     3   4 my ($self) = @_;
439              
440 3         5 my $uncmp_offset = 0;
441 3         3 my $cmp_offset = 0;
442 3         2 my $i = 0;
443 3         5 $self->{u_file_size} = 0;
444 3         5 $self->{idx} = [];
445 3         5 $self->{ridx} = {};
446              
447 3         8 sysseek $self->{fh}, 0, 0;
448              
449 3         9 while ($cmp_offset < $self->{file_size}) {
450              
451 8         6 push @{$self->{idx}}, [$cmp_offset, $uncmp_offset];
  8         19  
452 8         16 $self->{ridx}->{$cmp_offset} = $uncmp_offset;
453              
454 8         14 my ($block_size, $uncompressed_size) = $self->_unpack_block(0);
455              
456 8         9 $cmp_offset += $block_size;
457 8         6 $uncmp_offset += $uncompressed_size;
458 8         36 $self->{u_file_size} += $uncompressed_size;
459              
460             }
461              
462 3         8 sysseek $self->{fh}, $self->{block_offset}, 0;
463              
464 3         5 return;
465              
466             }
467              
468              
469             sub move_to {
470              
471             # Wrapper around _seek(), avoids conflicts with system seek()
472              
473             #-------------------------------------------------------------------------
474             # ARG 0 : byte offset to which to seek
475             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
476             #-------------------------------------------------------------------------
477             # no returns
478             #-------------------------------------------------------------------------
479              
480            
481 12     12 1 4317 my ($self, @args) = @_;
482 12         23 $self->_seek( @args );
483              
484 12         14 return;
485              
486             }
487              
488              
489             sub _seek {
490              
491             #-------------------------------------------------------------------------
492             # ARG 0 : byte offset to which to seek
493             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
494             #-------------------------------------------------------------------------
495             # no returns
496             #-------------------------------------------------------------------------
497              
498 54     54   52 my ($self, $pos, $whence) = @_;
499              
500 54 100       88 $pos += $self->{u_offset} if ($whence == 1);
501 54 100       70 $pos = $self->{u_file_size} + $pos if ($whence == 2);
502              
503 54 100       76 return if ($pos < 0);
504 47 100       71 if ($pos >= $self->{u_file_size}) {
505 15         13 $self->{buffer_len} = -1;
506 15         16 $self->{u_offset} = $pos;
507 15         14 $self->{block_offset} = $pos;
508 15         25 return 1;
509             }
510              
511             # Do seeded search for nearest block start <= $pos
512             # (although we don't know the size of each block, we can determine the
513             # mean length and usually choose a starting value close to the actual -
514             # benchmarks much faster than binary search)
515             # TODO: benchmark whether breaking this out and Memoizing speeds things up
516              
517 32         22 my $s = scalar @{$self->{idx}};
  32         38  
518 32         64 my $idx = int($pos/($self->{u_file_size}) * $s);
519 32         25 while (1) {
520 35 100       67 if ($pos < $self->{idx}->[$idx]->[1]) {
521 3         4 --$idx;
522 3         3 next;
523             }
524 32 50 66     104 if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
525 0         0 ++$idx;
526 0         0 next;
527             }
528 32         31 last;
529             }
530              
531 32         32 my $block_o = $self->{idx}->[$idx]->[0];
532 32         29 my $block_o_u = $self->{idx}->[$idx]->[1];
533 32         28 my $buff_o = $pos - $block_o_u;
534              
535 32         43 $self->_load_block( $block_o );
536 32         28 $self->{buffer_offset} = $buff_o;
537 32         35 $self->{u_offset} = $block_o_u + $buff_o;
538              
539 32         58 return 1;
540              
541             }
542              
543             sub get_vo {
544              
545             #-------------------------------------------------------------------------
546             # no arguments
547             #-------------------------------------------------------------------------
548             # RET 0 : virtual offset of current position
549             #-------------------------------------------------------------------------
550              
551 0     0 1 0 my ($self) = @_;
552 0         0 return ($self->{block_offset} << 16) | $self->{buffer_offset};
553              
554             }
555              
556             sub move_to_vo {
557              
558             #-------------------------------------------------------------------------
559             # ARG 0 : virtual offset (see POD for definition)
560             #-------------------------------------------------------------------------
561             # no returns
562             #-------------------------------------------------------------------------
563              
564 0     0 1 0 my ($self, $vo) = @_;
565 0         0 my $block_o = $vo >> 16;
566 0         0 my $buff_o = $vo ^ ($block_o << 16);
567 0         0 $self->_load_block( $block_o );
568 0         0 $self->{buffer_offset} = $buff_o;
569 0 0       0 croak "invalid block offset" if (! defined $self->{ridx}->{$block_o});
570 0         0 $self->{u_offset} = $self->{ridx}->{$block_o} + $buff_o;
571              
572 0         0 return;
573              
574             }
575              
576             sub _safe_sysread {
577              
578             # sysread wrapper that checks return count and returns read
579             # (internally we should never read off end of file - doing so indicates
580             # either a software bug or a corrupt input file so we croak)
581              
582             #-------------------------------------------------------------------------
583             # ARG 0 : bytes to read
584             #-------------------------------------------------------------------------
585             # RET 0 : string read
586             #-------------------------------------------------------------------------
587              
588 242     242   188 my ($fh, $len) = @_;
589 242         161 my $buf = '';
590 242         807 my $r = sysread $fh, $buf, $len;
591 242 50       319 croak "returned unexpected byte count" if ($r != $len);
592              
593 242         459 return $buf;
594              
595             }
596              
597             1;
598              
599              
600             __END__