File Coverage

blib/lib/WARC/Record/Logical/Heuristics.pm
Criterion Covered Total %
statement 216 216 100.0
branch 72 72 100.0
condition 24 24 100.0
subroutine 14 14 100.0
pod 2 2 100.0
total 328 328 100.0


line stmt bran cond sub pod time code
1             package WARC::Record::Logical::Heuristics; # -*- CPerl -*-
2              
3 2     2   61299 use strict;
  2         12  
  2         55  
4 2     2   8 use warnings;
  2         5  
  2         74  
5              
6             our @ISA = qw();
7              
8 2     2   348 use WARC; *WARC::Record::Logical::Heuristics::VERSION = \$WARC::VERSION;
  2         5  
  2         75  
9              
10 2     2   11 use Carp;
  2         4  
  2         108  
11 2     2   11 use File::Spec;
  2         4  
  2         4383  
12              
13             =head1 NAME
14              
15             WARC::Record::Logical::Heuristics - heuristics for locating record segments
16              
17             =head1 SYNOPSIS
18              
19             use WARC::Record::Logical::Heuristics;
20              
21             =head1 DESCRIPTION
22              
23             This is an internal module that provides functions for locating record
24             segments when the needed information is not available from an index.
25              
26             These mostly assume that IIPC WARC guidelines have been followed, as
27             otherwise there simply is no efficient solution.
28              
29             Implementations vary, however, with some using only an incrementing serial
30             number and a constant timestamp from the initiation of the crawl job, while
31             the guidelines and specification envision a timestamp reflecting the first
32             write to that specific file rather than the start of the crawl. Constant
33             timestamps are checked first, since the search is simpler.
34              
35             =over
36              
37             =item $WARC::Record::Logical::Heuristics::Patience
38              
39             This variable sets a threshold used to limit the reach of an unproductive
40             search. This module tracks the "effort" expended (I/O performed) during a
41             search and abandons the search if the threshold is exceeded. Finding
42             results dynamically (and temporarily) increases this threshold during a
43             search, such that this really sets how far the search will go between
44             results before giving up and concluding that there are no more results.
45              
46             The search will reach farther if either the WARC files are not compressed,
47             or the "sl" GZIP extension documented in L is used.
48             Decompressing record data to find the next record is considerable effort
49             for larger records, but is not counted for very small records that the
50             system is likely to already have cached after the header has been read.
51              
52             =cut
53              
54             # These provide a simple mechanism to limit the scope of a search that is
55             # not producing results. Both are localized in the top-level calls.
56              
57             our $Patience = 10000; # How much effort to put into a search?
58             our $Effort = 0; # How much have we done so far during this search?
59              
60             # Most I/O incurs "effort", represented by incrementing $Effort, while
61             # partial success (finding an interesting record) increases "patience",
62             # represented by incrementing $Patience. The search stops when either
63             # there are no more places to look or $Effort exceeds $Patience.
64              
65             =item %WARC::Record::Logical::Heuristics::Effort
66              
67             This internal hash indicates how costly certain operations should be
68             considered. The keys and their meanings are subject to change at whim, but
69             this is available for quick tuning if needed. Generally, the better
70             solution is to index your data rather than spend time tuning heuristics.
71              
72             =cut
73              
74             our %Effort =
75             (# read_record:
76             # effort incurred to read a record header, regardless of compression
77             read_record => 5,
78              
79             # gzread_data_per_tick:
80             # number of bytes to read while advancing past a compressed record to
81             # incur one effort point; effort incurred rounds down, even to zero
82             #
83             # this value is a shot-in-the-dark estimate that gunzipping 320 KiB is
84             # equivalent to the open/seek/read process for loading a record header
85             gzread_data_per_tick => 64 * 1024,
86              
87             # readdir_files_per_tick:
88             # number of file names to read and check while scanning a directory for
89             # to incur one effort point; effort incurred rounds down, even to zero
90             #
91             # this value is a shot-in-the-dark estimate that reading/matching 1600
92             # file names is equivalent to loading a record header; this estimate
93             # may be high or low depending on the number of axes used in the search
94             readdir_files_per_tick => 320,
95             );
96              
97             # Internal functions:
98              
99             ## @axes = _split_digit_spans( $filename )
100             ##
101             ## Extract possible sequence numbers from $filename and return list of
102             ## array references [PREFIX, NUMBER, SUFFIX] where NUMBER is a field that
103             ## can be adjusted to find "nearby" files if NUMBER turns out to actually
104             ## be a sequence number. Finds numerous false matches in normal use, but
105             ## broad searches cost only time while excessive narrowing causes failure.
106             ##
107             ## Does not perform I/O; does not increment $Effort.
108              
109             sub _split_digit_spans ($) {
110 38     38   4707 my $name = shift;
111 38         55 my @axes = ();
112              
113             # Split on zero-width boundaries between digits and non-digits.
114 38         788 my @pieces = split /(?=[0-9])(?<=[^0-9])|(?=[^0-9])(?<=[0-9])/, $name;
115             # The @pieces array now contains alternating spans of digits and non-digits.
116              
117 38         116 for (my $i = 0; $i < @pieces; $i++) {
118 524 100 100     1557 next unless ($pieces[$i] =~ /^[0-9]+$/ && length($pieces[$i]) < 9);
119             # More than 8 digits is probably not a sequence number and may be
120             # beyond the range of an integer anyway. Use indexes instead of
121             # heuristics if you need to work with a billion WARC files.
122 237         1139 push @axes, [join('', @pieces[0..($i-1)]),
123             $pieces[$i], join('', @pieces[($i+1)..$#pieces])];
124             }
125              
126 38         166 return @axes;
127             }
128              
129             ## @found = _find_nearby_files( $direction, @axes )
130             ##
131             ## Locate existing files that appear to be part of a contiguous sequence
132             ## along an axis in @axes. The $direction argument is either +1 to search
133             ## for higher numbers or -1 to search for lower numbers. A direction
134             ## value with a magnitude greater than 1 results in skipping possibilities
135             ## during the search.
136             ##
137             ## Returns a list of array references reflecting the files along each axis
138             ## from the argument list but omitting axes on which no files were found.
139             ##
140             ## Performs only directory lookups, which have highly unpredictable costs
141             ## and are usually cached by the system; does not increment $Effort.
142              
143             sub _find_nearby_files ($@) {
144 20     20   2234 my $direction = shift;
145 20         34 my @found = ();
146              
147 20         32 foreach my $axis (@_) {
148 180         220 my @files = ();
149 180         340 my $i = $axis->[1] + $direction; my $file;
  180         194  
150 180         2275 while (-f ($file = join '', ($axis->[0],
151             sprintf('%0*d', length $axis->[1], $i),
152             $axis->[2])))
153 42         121 { push @files, $file; $i += $direction }
  42         548  
154 180 100       695 push @found, \@files if scalar @files;
155             }
156              
157 20         61 return @found;
158             }
159              
160             ## @found = _scan_directory_for_axes( $dirname, @axes )
161             ##
162             ## Locate existing files that may appear to be part of a sequence along an
163             ## axis in @axes, using wildcards for long digit spans.
164             ##
165             ## The $dirname argument specifies the name of a directory to search and
166             ## all @axes are interpreted relative to $dirname. This differs from
167             ## _find_nearby_files where each axis specifies full absolute filenames.
168             ## For this function, the axes are strictly filenames with no directory.
169             ##
170             ## Returns a list of array references reflecting the files along each axis
171             ## from the argument list but omitting axes on which no files were found.
172             ##
173             ## Performs directory reads; increments $Effort to count file names read.
174              
175             sub _scan_directory_for_axes ($@) {
176 16     16   828 my $dirname = shift;
177 16         26 my $read_count = 0;
178              
179             my @re = map {
180 16         30 my $pre = quotemeta $_->[0]; my $post = quotemeta $_->[2];
  60         175  
  60         86  
181 60         129 $pre =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
  14         54  
182 60         136 $post =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
  2         13  
183 60         81 my $midlen = length $_->[1]; qr/^$pre[0-9]{$midlen}$post/ } @_;
  60         907  
184              
185 16         29 my $filename;
186 16         22 my @found = ();
187 16 100       819 opendir my $dir, $dirname or croak "$dirname: $!";
188 15         533 while (defined ($filename = readdir $dir)) {
189 435         745 foreach (0 .. $#re) {
190 1740 100       3788 if ($filename =~ $re[$_])
191 148         182 { push @{$found[$_]}, $filename }
  148         278  
192             }
193 435         920 $read_count++;
194             }
195 15         199 closedir $dir;
196              
197 15         94 $Effort += int($read_count / $Effort{readdir_files_per_tick});
198 15         38 return grep {scalar @$_} @found;
  60         195  
199             }
200              
201             ## @similar = _find_similar_files( $seed )
202             ##
203             ## Locate existing files that may appear to be part of a sequence involving
204             ## any digit span in $seed, using wildcards for long digit spans and
205             ## searching only the directory containing $seed.
206             ##
207             ## Returns a list of array references, each containing two array references
208             ## for files sorting before and after $seed, reflecting the files along
209             ## each axis derived from $seed on which files other than $seed were found.
210             ##
211             ## Uses _scan_directory_for_axes; does not perform I/O directly.
212              
213             sub _find_similar_files ($) {
214 12     12   4792 my $seedfile = shift;
215              
216 12         30 my $fs_volname; my $dirname; my $filename;
  12         0  
217 12         126 ($fs_volname, $dirname, $filename) = File::Spec->splitpath($seedfile);
218              
219 12         85 my @found = _scan_directory_for_axes
220             (File::Spec->catpath($fs_volname, $dirname, ''),
221             _split_digit_spans $filename);
222 12         41 my @similar = ();
223 12         23 foreach my $axis_files (@found) {
224 48         65 my @before = (); my @after = ();
  48         54  
225 48         64 foreach my $fname (@$axis_files) {
226 114 100       221 if ($fname lt $filename) {
    100          
227 32         151 push @before, File::Spec->catpath($fs_volname, $dirname, $fname);
228             } elsif ($fname gt $filename) {
229 34         152 push @after, File::Spec->catpath($fs_volname, $dirname, $fname);
230             }
231             }
232 18         42 push @similar, [[sort {$a cmp $b} @before],
233 48 100       146 [sort {$a cmp $b} @after]] if @before + @after;
  28         45  
234             }
235              
236 12         45 return @similar;
237             }
238              
239             ## ($checkpoint, @records) =
240             ## _scan_volume( $volume, $start, $end, [$field, $value]... )
241             ##
242             ## Search $volume for segment records where any $field matches $value
243             ## starting at offset $start and ending at or after offset $end. If $end
244             ## is an undefined value, searches until end-of-file.
245             ##
246             ## Only returns records that have a 'WARC-Segment-Number' header.
247             ##
248             ## The returned $checkpoint is the last record examined, regardless of
249             ## header values, and provides a valid offset for resuming a search.
250              
251             sub _scan_volume ($$$@) {
252 97     97   3549 my $volume = shift;
253 97         126 my $start = shift;
254 97         112 my $end = shift;
255              
256 97         212 my $record = $volume->record_at($start);
257 97         347 my @records = ();
258              
259 97   100     344 while ($record && (!defined $end || $record->offset <= $end)) {
      100        
260 885         1788 $Effort += $Effort{read_record};
261             next unless (defined $record->field('WARC-Segment-Number')
262 885 100 100     1443 && grep {defined $record->field($_->[0])} @_);
  327         619  
263 224 100       360 push @records, $record if grep {defined $record->field($_->[0])
  286 100       504  
264             && $record->field($_->[0]) eq $_->[1]} @_;
265             } continue { $Effort += int($record->field('Content-Length')
266             / $Effort{gzread_data_per_tick})
267             if (defined $record->{compression}
268 885 100 100     1633 && !defined $record->{sl_packed_size});
269 885         1424 $record = $record->next }
270              
271 97         299 return $record, @records;
272             }
273              
274             =item ( $first_segment, @clues ) = find_first_segment( $record )
275              
276             Attempt to locate the first segment of the logical record suggested by the
277             given record without using indexes. Croaks if given a record that does not
278             appear to have been written using WARC segmentation. Returns a
279             C object for the first record and a list of other objects
280             that may be useful for locating continuation records. Returns undef in the
281             first slot if no clear first segment was found, but can still return other
282             records encountered during the search even if the search was ultimately
283             unsuccessful.
284              
285             =cut
286              
287             ## Each "clue" can be a WARC::Record, or a hint in the form of [key => value].
288             ##
289             ## The hint keys currently are:
290             ##
291             ## tail => $record
292             ## last record examined in initial volume
293             ## (a good starting point to search for more segments)
294             ##
295             ## files_on_axes => [$filename, ...]...
296             ## array of arrays from _find_nearby_files
297             ## files_from_dir => [[$filename...], [$filename...]]...
298             ## array of arrays from _find_similar_files
299             ## Note that the filenames are set to undef in these hints as the
300             ## corresponding WARC volumes are scanned, with any relevant records
301             ## added directly to the clue list as they are found.
302              
303             sub find_first_segment {
304 11     11 1 569 local $Patience = $Patience;
305 11         21 local $Effort = 0;
306              
307 11         18 my $initial = shift;
308              
309 11 100       31 croak 'searching for segments for unsegmented record'
310             unless defined $initial->field('WARC-Segment-Number');
311              
312 10         27 my $origin_id = $initial->field('WARC-Segment-Origin-ID');
313 10         17 my @clues = (); my $point; my @records;
  10         18  
314              
315             # First we search the volume containing the initial record, since
316             # multiple WARC files may have been concatenated together after writing.
317 10         23 ($point, @records) = _scan_volume $initial->volume, 0, $initial->offset,
318             [WARC_Segment_Origin_ID => $origin_id], [WARC_Record_ID => $origin_id];
319             # ... @records will always include $initial ...
320 10         32 push @clues, @records, [tail => $point];
321              
322 10         23 foreach my $record (@records) {
323 10 100       24 return $record, @clues if $record->field('WARC-Record-ID') eq $origin_id;
324             }
325 7         16 $Patience += $Effort * ((scalar @records) - 1);
326 7 100       20 return undef, @clues if $Effort > $Patience;
327              
328             # If we get this far, the first segment must be in another volume.
329             {
330 6         14 my @simple_axes = _split_digit_spans $initial->volume->filename;
331 6         18 my @nearby = _find_nearby_files -1, @simple_axes;
332              
333             # A simple sequence number may be in use; we can check these volumes
334             # before reading the directory to handle varying timestamps.
335 6 100       18 push @clues, [files_on_axes => @nearby] if scalar @nearby;
336 6         21 foreach my $axis_files (reverse @nearby) {
337             # Work backwards on the assumption that sequence numbers are nearer
338             # to the end of the filename. (Correct for Wget and Wpull.)
339 4         9 foreach my $name (@$axis_files) {
340 6         8 my $previousEffort = $Effort;
341 6         17 my $volume = mount WARC::Volume ($name);
342 6         136 (undef, @records) = _scan_volume $volume, 0, undef,
343             [WARC_Segment_Origin_ID => $origin_id],
344             [WARC_Record_ID => $origin_id];
345 6         14 push @clues, @records; $name = undef;
  6         9  
346 6         13 foreach my $record (@records) {
347 3 100       8 return $record, @clues
348             if $record->field('WARC-Record-ID') eq $origin_id;
349             }
350 5         10 $Patience += ($Effort - $previousEffort) * scalar @records;
351 5 100       25 return undef, @clues if $Effort > $Patience;
352             }
353             }
354             }
355              
356             # If we get this far, the first segment is in another volume and multiple
357             # numbers must change to find that other volume. Assume that timestamps
358             # are in use in the file names, confounding the simple sequence search.
359             {
360 6         11 my @nearby = _find_similar_files $initial->volume->filename;
  4         9  
  4         12  
361              
362 4 100       13 push @clues, [files_from_dir => @nearby] if scalar @nearby;
363             # Work forwards on the assumption that sequence numbers are nearer to
364             # the beginning of the filename. (Correct in Internet Archive samples.)
365 4         9 foreach my $fname ((map {reverse @{$_->[0]}} @nearby),
  8         11  
  8         14  
366             # work backwards within the "before" list on each axis
367             # ... and forwards within the "after" list on each axis
368 8         10 (map {@{$_->[1]}} @nearby)) {
  8         16  
369 9         13 my $previousEffort = $Effort;
370 9         28 my $volume = mount WARC::Volume ($fname);
371 9         141 (undef, @records) = _scan_volume $volume, 0, undef,
372             [WARC_Segment_Origin_ID => $origin_id],
373             [WARC_Record_ID => $origin_id];
374 9         20 push @clues, @records; $fname = undef;
  9         11  
375 9         16 foreach my $record (@records) {
376 4 100       9 return $record, @clues
377             if $record->field('WARC-Record-ID') eq $origin_id;
378             }
379 7         13 $Patience += ($Effort - $previousEffort) * scalar @records;
380 7 100       23 return undef, @clues if $Effort > $Patience;
381             }
382             }
383              
384             # If we get this far, we have run out of places to look and the user will
385             # need to build an index instead of relying on heuristics.
386 1         6 return undef, @clues;
387             }
388              
389             =item ( @segments ) = find_continuation( $first_segment, @clues )
390              
391             Attempt to locate the continuation segments of a logical record without
392             using indexes. Uses the clues returned from C to aid
393             in the search and returns a list of continuation records found that appear
394             to be part of the same logical record as the given first segment.
395              
396             =cut
397              
398             sub _add_segments (\$\@\@) {
399 78     78   97 my $total_segment_count_ref = shift;
400 78         92 my $have_segments_ref = shift;
401 78         90 my $new_segments_ref = shift;
402              
403 78         148 foreach (@$new_segments_ref) {
404 38         98 $have_segments_ref->[$_->field('WARC-Segment-Number')]++;
405 38 100       76 $$total_segment_count_ref = $_->field('WARC-Segment-Number')
406             if defined $_->field('WARC-Segment-Total-Length');
407             }
408             }
409             sub _have_all_segments_p ($@) {
410 86     86   104 my $total_segment_count = shift;
411              
412             # We cannot have all segments if we have not seen the last segment yet.
413 86 100       333 return 0 unless defined $total_segment_count;
414              
415             # We have seen the last segment, do we have all of the others?
416 24 100       42 for (my $i = 2; $i < $total_segment_count; $i++) { return 0 unless $_[$i] }
  59         142  
417             # Start the search at 2 because offsets 0 and 1 are not used here.
418              
419 9         62 return 1;
420             }
421              
422             sub find_continuation {
423 16     16 1 4769 local $Patience = $Patience;
424 16         28 local $Effort = 0;
425              
426 16         30 my $first_segment = shift; my $origin_id = $first_segment->id;
  16         46  
427              
428             # First we unpack the clues and check if all segments were found while
429             # searching for the first segment.
430 16         32 my @segments = (); my @nearby_volume_files = ();
  16         25  
431 16         25 my $have_tail = 0; my $point = undef;
  16         23  
432 16         21 my @similar_volume_files_before = (); my @similar_volume_files_after = ();
  16         22  
433 16         36 foreach my $clue (@_) {
434 35 100       95 if (UNIVERSAL::isa($clue, 'WARC::Record')) {
    100          
435 21 100       58 push @segments, $clue unless $clue == $first_segment;
436             } elsif (ref $clue eq 'ARRAY') {
437 13         23 my $tag = shift @$clue;
438 13 100       33 if ($tag eq 'tail') {
    100          
    100          
439 7         10 $have_tail = 1;
440 7         14 $point = shift @$clue;
441             } elsif ($tag eq 'files_on_axes') {
442 3         32 push @nearby_volume_files, map {[grep defined, @$_]} @$clue;
  6         23  
443             } elsif ($tag eq 'files_from_dir') {
444 2         5 foreach (@$clue) {
445 6         8 push @similar_volume_files_before, [grep defined, @{$_->[0]}];
  6         14  
446 6         7 push @similar_volume_files_after, [grep defined, @{$_->[1]}];
  6         13  
447             }
448 1         9 } else { die "unrecognized hint tag: $tag" }
449 1         8 } else { die "unrecognized clue" }
450 33         65 $clue = undef;
451             }
452 14         62 @similar_volume_files_before = grep {scalar @$_} @similar_volume_files_before;
  6         12  
453 14         20 @similar_volume_files_after = grep {scalar @$_} @similar_volume_files_after;
  6         10  
454              
455 14         19 my @have_segments = (); my $total_segment_count = undef;
  14         23  
456 14         38 _add_segments $total_segment_count, @have_segments, @segments;
457              
458 14 100       27 return @segments if _have_all_segments_p $total_segment_count, @have_segments;
459              
460             # If we get to here, at least one segment was not found while searching
461             # for the first segment, so we will need to search too.
462 12         20 my @records = ();
463              
464             # Pick up where find_first_segment left off...
465 12 100       33 if ($point) {
    100          
466 4         12 (undef, @records) = _scan_volume $point->volume, $point->offset, undef,
467             [WARC_Segment_Origin_ID => $origin_id];
468 4         14 _add_segments $total_segment_count, @have_segments, @records;
469 4         8 push @segments, @records;
470             } elsif (!$have_tail) {
471             # The search may have begun with the first segment directly; ensure
472             # that we scan the entire volume containing the first segment later.
473 7         18 push @nearby_volume_files, [$first_segment->volume->filename];
474             }
475 12         110 $Patience += $Effort * scalar @records;
476             return @segments
477 12 100 100     22 if (_have_all_segments_p $total_segment_count, @have_segments
478             or $Effort > $Patience);
479              
480             # Search for more volumes in a simple sequence...
481             {
482 10         19 my @simple_axes = _split_digit_spans $first_segment->volume->filename;
483 10         27 my @nearby = _find_nearby_files 1, @simple_axes;
484              
485             # Were more volumes found in the simple sequence search now or previously?
486 10         32 foreach my $axis_files ((reverse @nearby),
487             (reverse @nearby_volume_files)) {
488             # Work backwards on the assumption that sequence numbers are nearer
489             # to the end of the filename. (Correct for Wget and Wpull.)
490 17         30 foreach my $name (@$axis_files) {
491 27         33 my $previousEffort = $Effort;
492 27         69 my $volume = mount WARC::Volume ($name);
493 27         424 (undef, @records) = _scan_volume $volume, 0, undef,
494             [WARC_Segment_Origin_ID => $origin_id];
495 27         74 _add_segments $total_segment_count, @have_segments, @records;
496 27         41 push @segments, @records;
497 27         43 $Patience += ($Effort - $previousEffort) * scalar @records;
498             return @segments
499 27 100 100     51 if (_have_all_segments_p $total_segment_count, @have_segments
500             or $Effort > $Patience);
501             }
502             }
503             }
504              
505             # Search for more volumes by directory scan...
506             {
507 10 100       16 unless (@similar_volume_files_before + @similar_volume_files_after) {
  6         12  
  6         15  
508             # Unlike the simple sequence search, the directory scan finds files
509             # in both directions from the starting point on all axes, but it may
510             # not have been needed to find the first segment. Do it now if not.
511 5         13 my @nearby = _find_similar_files $first_segment->volume->filename;
512 5         13 foreach (@nearby) {
513 13         17 push @similar_volume_files_before, $_->[0];
514 13         21 push @similar_volume_files_after, $_->[1];
515             }
516             }
517             # Any interesting records in volumes before the volume containing the
518             # initial record were probably found while locating the first segment.
519 6         12 foreach my $axis_files (@similar_volume_files_after,
520             reverse @similar_volume_files_before) {
521             # Work forwards on the assumption that sequence numbers are nearer to
522             # the beginning of the filename. (Correct in Internet Archive samples.)
523 23         35 foreach my $fname (@$axis_files) {
524 33         42 my $previousEffort = $Effort;
525 33         84 my $volume = mount WARC::Volume ($fname);
526 33         486 (undef, @records) = _scan_volume $volume, 0, undef,
527             [WARC_Segment_Origin_ID => $origin_id];
528 33         87 _add_segments $total_segment_count, @have_segments, @records;
529 33         55 push @segments, @records;
530 33         51 $Patience += ($Effort - $previousEffort) * scalar @records;
531             return @segments
532 33 100 100     60 if (_have_all_segments_p $total_segment_count, @have_segments
533             or $Effort > $Patience);
534             }
535             }
536             }
537              
538             # If we get to here, we have run out of places to look and the user will
539             # need to build an index instead of relying on heuristics.
540 2         17 return @segments;
541             }
542              
543             =back
544              
545             =cut
546              
547             1;
548             __END__