File Coverage

blib/lib/WARC/Record/FromVolume.pm
Criterion Covered Total %
statement 129 133 96.9
branch 51 66 77.2
condition 12 12 100.0
subroutine 22 24 91.6
pod 12 12 100.0
total 226 247 91.5


line stmt bran cond sub pod time code
1             package WARC::Record::FromVolume; # -*- CPerl -*-
2              
3 26     26   57372 use strict;
  26         59  
  26         767  
4 26     26   119 use warnings;
  26         52  
  26         1459  
5              
6             our @ISA = qw(WARC::Record);
7             our @CARP_NOT = (@ISA, qw(WARC::Volume WARC::Record::Stub));
8              
9 26     26   449 use WARC; *WARC::Record::FromVolume::VERSION = \$WARC::VERSION;
  26         49  
  26         792  
10              
11 26     26   133 use Carp;
  26         75  
  26         1600  
12 26     26   163 use Fcntl 'SEEK_SET';
  26         68  
  26         1258  
13 26     26   11087 use Symbol 'geniosym';
  26         18832  
  26         1440  
14 26     26   13491 use IO::Uncompress::Gunzip '$GunzipError';
  26         1069765  
  26         44035  
15              
16             require WARC::Fields;
17             require WARC::Record;
18             require WARC::Record::Block;
19             require WARC::Record::Replay;
20              
21 0     0   0 sub _set { croak "attempt to modify WARC record in file" }
22              
23             # The overload to a method call is inherited.
24             sub compareTo {
25 111     111 1 21188 my $a = shift;
26 111         148 my $b = shift;
27 111         175 my $swap = shift;
28              
29             # sort in-memory-only records ahead of on-disk records
30 111 100       204 return $swap ? -1 : 1 unless defined $b->volume;
    100          
31              
32 108 100 100     299 my $cmp =
33             ((($a->volume->filename eq $b->volume->filename)
34             || ($a->volume->_file_tag eq $b->volume->_file_tag))
35             ? ($a->offset <=> $b->offset)
36             : ($a->volume->filename cmp $b->volume->filename));
37              
38 108 100       1542 return $swap ? 0-$cmp : 0+$cmp;
39             }
40              
41             # This implementation uses a hash as the underlying structure.
42              
43             # Keys inherited from WARC::Record base class:
44             #
45             # fields
46              
47             # Keys defined by this class:
48             #
49             # volume
50             # Parent WARC::Volume object
51             # collection (optional)
52             # Parent WARC::Collection object, if record found via a collection
53             # offset
54             # Offset of start-of-record within parent volume
55             # compression
56             # Name of decompression filter used with this record
57             # data_offset
58             # Offset of data block within record (possibly compressed)
59             # sl_packed_size
60             # Size of compressed data block according to "sl" gzip extension
61             # sl_full_size
62             # Size of uncompressed data block according to "sl" gzip extension
63             # protocol
64             # WARC version found at start of record
65             # logical (optional)
66             # Weak reference to logical record object containing this segment
67             # (Defined by this class, but only set by WARC::Record::Logical.)
68              
69             # Keys tested by logical record heuristics:
70             #
71             # compression
72             # defined iff record is compressed
73             # sl_packed_size
74             # defined iff compressed record can be skipped without reading data block
75              
76 1677     1677   27544 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  1677         7045  
77              
78             sub _dbg_dump {
79 36     36   13830 my $self = shift;
80              
81 36         134 my $out = 'WARC '.$self->field('WARC-Type').' record ['.$self->protocol.']';
82 36 100       137 $out .= ' [via '.$self->{compression}.']' if defined $self->{compression};
83 36         58 $out .= "\n";
84              
85 36         147 $out .= ' id '.$self->id."\n";
86              
87 36         72 $out .= ' at '.$self->offset.' in '.$self->volume."\n";
88             $out .= ' "sl" header: '.$self->{sl_packed_size}.' packed from '
89 36 100       125 .$self->{sl_full_size}." octets\n" if defined $self->{sl_full_size};
90              
91 36         89 $out .= ' data begins at offset '.$self->{data_offset};
92 36 100       83 $out .= ' within '.(defined $self->{compression} ? 'record' : 'volume');
93 36         47 $out .="\n";
94              
95 36         124 return $out;
96             }
97              
98             sub _get_compression_error {
99 3     3   11 my $self = shift;
100              
101 3 100       14 if (not defined $self->{compression}) {
    100          
102 1         9 return '(record not compressed)';
103             } elsif ($self->{compression} eq 'IO::Uncompress::Gunzip') {
104 1         5 return $GunzipError;
105             } else {
106 1         7 die "unknown compression method";
107             }
108             }
109              
110 1     1 1 893 sub new { croak "WARC records are read from volumes" }
111              
112             sub _read {
113 574     574   1198 my $class = shift;
114 574         755 my $volume = shift;
115 574         771 my $offset = shift;
116              
117 574         780 my $handle;
118 574 100       1073 if (ref $offset) { # I/O handle passed in instead
119 4         6 $handle = $offset;
120 4         8 $offset = tell $handle;
121             } else { # open new handle and seek to offset
122 570         1428 $handle = $volume->open;
123 570 50       4038 seek $handle, $offset, SEEK_SET or die "seek: $!";
124             }
125              
126 574         2444 my %ob = (volume => $volume, offset => $offset);
127              
128 574         851 my $magic; my $protocol = '';
  574         846  
129 574 50       7287 defined(read $handle, $magic, 6) or die "read: $!";
130 574 100       1921 return undef if $magic eq ''; # end-of-file reached
131              
132 552 100       1396 if ($magic eq 'WARC/1') {
    100          
133             # uncompressed WARC record found ==> pass it on through
134 470         833 $protocol = $magic;
135             } elsif (unpack('H4', $magic) eq '1f8b') {
136             # gzip signature found ==> check for extension header and stack filter
137              
138 80 100       261 if (unpack('x3C', $magic) & 0x04) { # FLG.FEXTRA is set
139 35 50       102 defined(read $handle, $magic, 6, 6) or die "read: $!";
140 35         82 my $xlen = unpack 'v', substr $magic, -2;
141 35 50       46 my $extra; defined(read $handle, $extra, $xlen) or die "read: $!";
  35         83  
142 35         120 my @extra = unpack '(a2 v/a*)*', $extra;
143 35         73 $magic .= $extra;
144             # @extra is now (tag => $data)...
145 35         82 for (my $i = 0; $i < @extra; $i += 2) {
146 30 100 100     142 if ($extra[$i] eq 'sl' and length($extra[1+$i]) == 8)
147 20         88 { @ob{qw/sl_packed_size sl_full_size/} = unpack 'VV', $extra[1+$i] }
148             }
149             }
150              
151 80 50       513 $handle = new IO::Uncompress::Gunzip ($handle,
152             Prime => $magic, MultiStream => 0,
153             AutoClose => 1, Transparent => 0)
154             or die "IO::Uncompress::Gunzip: $GunzipError";
155 80         110302 $ob{compression} = 'IO::Uncompress::Gunzip';
156             } else
157 2         22 { croak "WARC record header not found at offset $offset in $volume\n"
158             ." found [".join(' ', unpack '(H2)*', $magic)."] instead" }
159              
160             # read WARC version
161 550         1844 $protocol .= <$handle>;
162 550         8878 $protocol =~ s/[[:space:]]+$//;
163             # The WARC version read from the file is appended because an
164             # uncompressed WARC record is recognized by the first six bytes of the
165             # WARC version tag, which were transferred to $protocol if found.
166 550 50       1885 $protocol =~ m/^WARC/
167             or croak "WARC record header not found after decompression\n"
168             ." found [".join(' ', unpack '(H2)*', $protocol)."] instead";
169 550         1254 $ob{protocol} = $protocol;
170              
171 550         2953 $ob{fields} = parse WARC::Fields from => $handle;
172 550         1646 $ob{fields}->set_readonly;
173              
174 550         1300 $ob{data_offset} = tell $handle;
175              
176 550         7187 close $handle;
177              
178 550         5510 { our $_total_read; $_total_read++ }
  550         731  
  550         803  
179              
180 550         4442 bless \%ob, $class;
181             }
182              
183 73     73 1 207 sub protocol { (shift)->{protocol} }
184              
185 1191     1191 1 3480 sub volume { (shift)->{volume} }
186              
187 775     775 1 6657 sub offset { (shift)->{offset} }
188              
189             sub logical {
190 93     93 1 175 my $self = shift;
191              
192 93         159 my $segment_header_value = $self->field('WARC-Segment-Number');
193 93 50       222 if (defined $self->{logical}) {
    50          
194 0         0 return $self->{logical}; # cached object remains valid ==> return it
195             } elsif (defined $segment_header_value) {
196 0         0 return _read WARC::Record::Logical $self;
197             } else {
198 93         224 return $self; # no continuation records present
199             }
200             }
201              
202 36     36 1 147 sub segments { return shift }
203              
204             sub next {
205 310     310 1 713 my $self = shift;
206              
207 310         430 my $next = undef;
208              
209 310 100       908 if ($self->{sl_packed_size}) { # gzip "sl" extended header available
    100          
210 12         23 my $handle = $self->volume->open;
211              
212             # seek to read 32-bit ISIZE field at end of gzip stream
213 12 50       34 seek $handle, $self->offset + $self->{sl_packed_size} - 4, SEEK_SET
214             or die "seek: $!";
215 12 50       27 my $isize; defined(read $handle, $isize, 4) or die "read: $!";
  12         153  
216              
217 12 100 100     80 if (length $isize > 0 # read off the end yields nothing
218             and $self->{sl_full_size} == unpack 'V', $isize) { # ... and looks valid
219 4         11 $next = _read WARC::Record::FromVolume $self->volume, $handle;
220 4         88 close $handle;
221 4         17 return $next;
222             } else {
223 8         20 carp "extended 'sl' header was found to be invalid\n"
224             .' in record at '.($self->offset).' in '.($self->volume);
225             }
226             } elsif (not defined $self->{compression}) { # WARC record is not compressed
227             return _read WARC::Record::FromVolume $self->volume,
228 274         551 $self->{data_offset} + $self->field('Content-Length') + 4;
229             }
230              
231             # if we get here, we have to scan for the end of the record
232 32         2626 my $handle = $self->volume->open;
233 32 50       94 seek $handle, $self->offset, SEEK_SET or die "seek: $!";
234              
235             my $zhandle = $self->{compression}->new
236 32 50       225 ($handle, MultiStream => 0, AutoClose => 0)
237             or die "$self->{compression}: ".$self->_get_compression_error;
238 32 50       40273 seek $zhandle, $self->{data_offset} + $self->field('Content-Length'), SEEK_SET
239             or die "zseek: $! ".$self->_get_compression_error;
240 32 50       2345 my $end; defined(read $zhandle, $end, 4)
  32         86  
241             or die "zread: $! ".$self->_get_compression_error;
242 32 50       1011 croak "end-of-record marker not found" unless $end eq (WARC::CRLF x 2);
243              
244             # The main handle is somewhere *after* the actual end of the block
245             # because IO::Uncompress::Gunzip reads ahead. We can get the contents
246             # of that "read ahead" buffer and use that to adjust our final offset.
247 32         68 $next = _read WARC::Record::FromVolume $self->volume,
248             (tell($handle) - length($zhandle->trailingData));
249              
250 32         757 close $zhandle; close $handle;
  32         710  
251              
252 32         128 return $next;
253             }
254              
255             sub open_block {
256 59     59 1 78 my $self = shift;
257              
258 59         133 my $xhandle = Symbol::geniosym;
259 59         1392 tie *$xhandle, 'WARC::Record::Block', $self;
260              
261 59         199 return $xhandle;
262             }
263              
264 57     57 1 119 sub open_continued { (shift)->logical->open_block }
265              
266             sub replay {
267 53     53 1 3324 my $self = shift;
268              
269 53         126 my @handlers = WARC::Record::Replay::find_handlers($self);
270              
271 53         84 my $result = undef;
272 53   100     249 $result = (shift @handlers)->($self)
273             while scalar @handlers && !defined $result;
274              
275 53         495 return $result;
276             }
277              
278             sub open_payload {
279 0     0 1   die "not yet implemented"
280             }
281              
282             1;
283             __END__