File Coverage

blib/lib/WARC/Record/FromVolume.pm
Criterion Covered Total %
statement 143 145 98.6
branch 62 78 79.4
condition 12 12 100.0
subroutine 24 24 100.0
pod 13 13 100.0
total 254 272 93.3


line stmt bran cond sub pod time code
1             package WARC::Record::FromVolume; # -*- CPerl -*-
2              
3 28     28   58955 use strict;
  28         60  
  28         759  
4 28     28   125 use warnings;
  28         45  
  28         1456  
5              
6             our @ISA = qw(WARC::Record);
7             our @CARP_NOT = (@ISA, qw(WARC::Volume WARC::Record::Stub));
8              
9 28     28   471 use WARC; *WARC::Record::FromVolume::VERSION = \$WARC::VERSION;
  28         78  
  28         833  
10              
11 28     28   141 use Carp;
  28         49  
  28         1516  
12 28     28   167 use Fcntl 'SEEK_SET';
  28         57  
  28         1341  
13 28     28   11769 use Symbol 'geniosym';
  28         19521  
  28         1560  
14 28     28   14622 use IO::Uncompress::Gunzip '$GunzipError';
  28         1118414  
  28         48816  
15              
16             require WARC::Fields;
17             require WARC::Record;
18             require WARC::Record::Block;
19             require WARC::Record::Replay;
20              
21             # The overload to a method call is inherited.
22             sub compareTo {
23 134     134 1 21969 my $a = shift;
24 134         181 my $b = shift;
25 134         173 my $swap = shift;
26              
27             # sort in-memory-only records ahead of on-disk records
28 134 100       247 return $swap ? -1 : 1 unless defined $b->volume;
    100          
29              
30 131 100 100     320 my $cmp =
31             ((($a->volume->filename eq $b->volume->filename)
32             || ($a->volume->_file_tag eq $b->volume->_file_tag))
33             ? ($a->offset <=> $b->offset)
34             : ($a->volume->filename cmp $b->volume->filename));
35              
36 131 100       1510 return $swap ? 0-$cmp : 0+$cmp;
37             }
38              
39             # This implementation uses a hash as the underlying structure.
40              
41             # Keys inherited from WARC::Record base class:
42             #
43             # fields
44              
45             # Keys defined by this class:
46             #
47             # volume
48             # Parent WARC::Volume object
49             # collection (optional)
50             # Parent WARC::Collection object, if record found via a collection
51             # offset
52             # Offset of start-of-record within parent volume
53             # compression
54             # Name of decompression filter used with this record
55             # data_offset
56             # Offset of data block within record (possibly compressed)
57             # payload_offset (optional)
58             # Offset of record payload within data block
59             # (Defined by this class, but only set upon protocol replay.)
60             # payload_encodings (optional)
61             # Array of transfer encodings, an undefined value, and content
62             # encodings for the payload data for this record.
63             # (Defined by this class, but only set upon protocol replay.)
64             # (NOT YET IMPLEMENTED)
65             # sl_packed_size
66             # Size of compressed data block according to "sl" gzip extension
67             # sl_full_size
68             # Size of uncompressed data block according to "sl" gzip extension
69             # protocol
70             # WARC version found at start of record
71             # logical (optional)
72             # Weak reference to logical record object containing this segment
73             # (Defined by this class, but only set by WARC::Record::Logical.)
74              
75             # Keys tested by logical record heuristics:
76             #
77             # compression
78             # defined iff record is compressed
79             # sl_packed_size
80             # defined iff compressed record can be skipped without reading data block
81              
82             # Keys used in index writers:
83             #
84             # sl_packed_size
85             # used for "S" field in CDX indexes
86              
87 1763     1763   28222 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  1763         6276  
88              
89             sub _dbg_dump {
90 36     36   13862 my $self = shift;
91              
92 36         106 my $out = 'WARC '.$self->field('WARC-Type').' record ['.$self->protocol.']';
93 36 100       154 $out .= ' [via '.$self->{compression}.']' if defined $self->{compression};
94 36         70 $out .= "\n";
95              
96 36         86 $out .= ' id '.$self->id."\n";
97              
98 36         84 $out .= ' at '.$self->offset.' in '.$self->volume."\n";
99             $out .= ' "sl" header: '.$self->{sl_packed_size}.' packed from '
100 36 100       140 .$self->{sl_full_size}." octets\n" if defined $self->{sl_full_size};
101              
102 36         87 $out .= ' data begins at offset '.$self->{data_offset};
103 36 100       90 $out .= ' within '.(defined $self->{compression} ? 'record' : 'volume');
104 36         50 $out .="\n";
105              
106 36         142 return $out;
107             }
108              
109             sub _get_compression_error {
110 3     3   297 my $self = shift;
111              
112 3 100       12 if (not defined $self->{compression}) {
    100          
113 1         9 return '(record not compressed)';
114             } elsif ($self->{compression} eq 'IO::Uncompress::Gunzip') {
115 1         7 return $GunzipError;
116             } else {
117 1         8 die "unknown compression method";
118             }
119             }
120              
121 1     1 1 744 sub new { croak "WARC records are read from volumes" }
122              
123             sub block {
124 2     2 1 9 my $self = shift;
125              
126 2 100       96 croak "attempt to set block on record from volume" if @_;
127              
128 1         5 return undef;
129             }
130              
131             sub _read {
132 670     670   2090 my $class = shift;
133 670         835 my $volume = shift;
134 670         900 my $offset = shift;
135              
136 670 100       1333 croak "WARC::Record::FromVolume::_read is a class method"
137             unless defined $class;
138 669 100       1493 croak "attempt to read WARC record from undefined volume"
139             unless defined $volume;
140 668 100       1167 croak "attempt to read WARC record from undefined offset"
141             unless defined $offset;
142              
143 667         775 my $handle;
144 667 100       1088 if (ref $offset) { # I/O handle passed in instead
145 4         5 $handle = $offset;
146 4         9 $offset = tell $handle;
147             } else { # open new handle and seek to offset
148 663         1429 $handle = $volume->open;
149 663 50       4612 seek $handle, $offset, SEEK_SET or die "seek: $!";
150             }
151              
152 667         2554 my %ob = (volume => $volume, offset => $offset);
153              
154 667         889 my $magic; my $protocol = '';
  667         1007  
155 667 50       7932 defined(read $handle, $magic, 6) or die "read: $!";
156 667 100       2169 return undef if $magic eq ''; # end-of-file reached
157              
158 638 100       1371 if ($magic eq 'WARC/1') {
    100          
159             # uncompressed WARC record found ==> pass it on through
160 546         803 $protocol = $magic;
161             } elsif (unpack('H4', $magic) eq '1f8b') {
162             # gzip signature found ==> check for extension header and stack filter
163              
164 90 100       277 if (unpack('x3C', $magic) & 0x04) { # FLG.FEXTRA is set
165 35 50       103 defined(read $handle, $magic, 6, 6) or die "read: $!";
166 35         77 my $xlen = unpack 'v', substr $magic, -2;
167 35 50       58 my $extra; defined(read $handle, $extra, $xlen) or die "read: $!";
  35         89  
168 35         105 my @extra = unpack '(a2 v/a*)*', $extra;
169 35         81 $magic .= $extra;
170             # @extra is now (tag => $data)...
171 35         80 for (my $i = 0; $i < @extra; $i += 2) {
172 30 100 100     147 if ($extra[$i] eq 'sl' and length($extra[1+$i]) == 8)
173 20         97 { @ob{qw/sl_packed_size sl_full_size/} = unpack 'VV', $extra[1+$i] }
174             }
175             }
176              
177 90 50       552 $handle = new IO::Uncompress::Gunzip ($handle,
178             Prime => $magic, MultiStream => 0,
179             AutoClose => 1, Transparent => 0)
180             or die "IO::Uncompress::Gunzip: $GunzipError";
181 90         119047 $ob{compression} = 'IO::Uncompress::Gunzip';
182             } else
183 2         11 { croak "WARC record header not found at offset $offset in $volume\n"
184             ." found [".join(' ', unpack '(H2)*', $magic)."] instead" }
185              
186             # read WARC version
187 636         1982 $protocol .= <$handle>;
188 636         8854 $protocol =~ s/[[:space:]]+$//;
189             # The WARC version read from the file is appended because an
190             # uncompressed WARC record is recognized by the first six bytes of the
191             # WARC version tag, which were transferred to $protocol if found.
192 636 50       1960 $protocol =~ m/^WARC/
193             or croak "WARC record header not found after decompression\n"
194             ." found [".join(' ', unpack '(H2)*', $protocol)."] instead";
195 636         1295 $ob{protocol} = $protocol;
196              
197 636         3122 $ob{fields} = parse WARC::Fields from => $handle;
198 636         1683 $ob{fields}->set_readonly;
199              
200 636         1418 $ob{data_offset} = tell $handle;
201              
202 636         7974 close $handle;
203              
204 636         5971 { our $_total_read; $_total_read++ }
  636         699  
  636         868  
205              
206 636         4644 bless \%ob, $class;
207             }
208              
209 73     73 1 213 sub protocol { (shift)->{protocol} }
210              
211 1493     1493 1 4042 sub volume { (shift)->{volume} }
212              
213 866     866 1 6940 sub offset { (shift)->{offset} }
214              
215             sub logical {
216 174     174 1 314 my $self = shift;
217              
218 174         339 my $segment_header_value = $self->field('WARC-Segment-Number');
219 174 50       490 if (defined $self->{logical}) {
    50          
220 0         0 return $self->{logical}; # cached object remains valid ==> return it
221             } elsif (defined $segment_header_value) {
222 0         0 return _read WARC::Record::Logical $self;
223             } else {
224 174         422 return $self; # no continuation records present
225             }
226             }
227              
228 72 100   72 1 181 sub segments { if (wantarray) { return shift } else { return 1 } }
  36         160  
  36         109  
229              
230             sub next {
231 380     380 1 831 my $self = shift;
232              
233 380         522 my $next = undef;
234              
235 380 100       959 if ($self->{sl_packed_size}) { # gzip "sl" extended header available
    100          
236 12         25 my $handle = $self->volume->open;
237              
238             # seek to read 32-bit ISIZE field at end of gzip stream
239 12 50       32 seek $handle, $self->offset + $self->{sl_packed_size} - 4, SEEK_SET
240             or die "seek: $!";
241 12 50       28 my $isize; defined(read $handle, $isize, 4) or die "read: $!";
  12         165  
242              
243 12 100 100     77 if (length $isize > 0 # read off the end yields nothing
244             and $self->{sl_full_size} == unpack 'V', $isize) { # ... and looks valid
245 4         10 $next = _read WARC::Record::FromVolume $self->volume, $handle;
246 4         87 close $handle;
247 4         20 return $next;
248             } else {
249 8         20 carp "extended 'sl' header was found to be invalid\n"
250             .' in record at '.($self->offset).' in '.($self->volume);
251             }
252             } elsif (not defined $self->{compression}) { # WARC record is not compressed
253             return _read WARC::Record::FromVolume $self->volume,
254 336         607 $self->{data_offset} + $self->field('Content-Length') + 4;
255             }
256              
257             # if we get here, we have to scan for the end of the record
258 40         2642 my $handle = $self->volume->open;
259 40 50       109 seek $handle, $self->offset, SEEK_SET or die "seek: $!";
260              
261             my $zhandle = $self->{compression}->new
262 40 50       277 ($handle, MultiStream => 0, AutoClose => 0)
263             or die "$self->{compression}: ".$self->_get_compression_error;
264 40 50       51976 seek $zhandle, $self->{data_offset} + $self->field('Content-Length'), SEEK_SET
265             or die "zseek: $! ".$self->_get_compression_error;
266 40 50       2970 my $end; defined(read $zhandle, $end, 4)
  40         105  
267             or die "zread: $! ".$self->_get_compression_error;
268 40 50       1330 croak "end-of-record marker not found" unless $end eq (WARC::CRLF x 2);
269              
270             # The main handle is somewhere *after* the actual end of the block
271             # because IO::Uncompress::Gunzip reads ahead. We can get the contents
272             # of that "read ahead" buffer and use that to adjust our final offset.
273 40         92 $next = _read WARC::Record::FromVolume $self->volume,
274             (tell($handle) - length($zhandle->trailingData));
275              
276 40         960 close $zhandle; close $handle;
  40         937  
277              
278 40         177 return $next;
279             }
280              
281             sub open_block {
282 140     140 1 217 my $self = shift;
283              
284 140         379 my $xhandle = Symbol::geniosym;
285 140         3448 tie *$xhandle, 'WARC::Record::Block', $self;
286              
287 140         513 return $xhandle;
288             }
289              
290 138     138 1 306 sub open_continued { (shift)->logical->open_block }
291              
292             sub replay {
293 148     148 1 3601 my $self = shift;
294              
295 148         443 my @handlers = WARC::Record::Replay::find_handlers($self);
296              
297 148         260 my $result = undef;
298 148   100     669 $result = (shift @handlers)->($self)
299             while scalar @handlers && !defined $result;
300              
301 148         1984 return $result;
302             }
303              
304             sub open_payload {
305 8     8 1 35 my $self = shift;
306              
307 8         16 my $replay = $self->replay;
308 8 50       29 return undef unless $replay; # no payload found
309              
310 8         20 my $xhandle = Symbol::geniosym;
311 8         172 tie *$xhandle, 'WARC::Record::Payload', $self, $replay;
312              
313 8         75 return $xhandle;
314             }
315              
316             1;
317             __END__