File Coverage

blib/lib/WARC/Record/Logical/Heuristics.pm

Criterion	Covered	Total	%
statement	216	216	100.0
branch	72	72	100.0
condition	24	24	100.0
subroutine	14	14	100.0
pod	2	2	100.0
total	328	328	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package WARC::Record::Logical::Heuristics; # -- CPerl --
2
3	2			2		61299	use strict;
	2					12
	2					55
4	2			2		8	use warnings;
	2					5
	2					74
5
6							our @ISA = qw();
7
8	2			2		348	use WARC; *WARC::Record::Logical::Heuristics::VERSION = \$WARC::VERSION;
	2					5
	2					75
9
10	2			2		11	use Carp;
	2					4
	2					108
11	2			2		11	use File::Spec;
	2					4
	2					4383
12
13							=head1 NAME
14
15							WARC::Record::Logical::Heuristics - heuristics for locating record segments
16
17							=head1 SYNOPSIS
18
19							use WARC::Record::Logical::Heuristics;
20
21							=head1 DESCRIPTION
22
23							This is an internal module that provides functions for locating record
24							segments when the needed information is not available from an index.
25
26							These mostly assume that IIPC WARC guidelines have been followed, as
27							otherwise there simply is no efficient solution.
28
29							Implementations vary, however, with some using only an incrementing serial
30							number and a constant timestamp from the initiation of the crawl job, while
31							the guidelines and specification envision a timestamp reflecting the first
32							write to that specific file rather than the start of the crawl. Constant
33							timestamps are checked first, since the search is simpler.
34
35							=over
36
37							=item $WARC::Record::Logical::Heuristics::Patience
38
39							This variable sets a threshold used to limit the reach of an unproductive
40							search. This module tracks the "effort" expended (I/O performed) during a
41							search and abandons the search if the threshold is exceeded. Finding
42							results dynamically (and temporarily) increases this threshold during a
43							search, such that this really sets how far the search will go between
44							results before giving up and concluding that there are no more results.
45
46							The search will reach farther if either the WARC files are not compressed,
47							or the "sl" GZIP extension documented in L is used.
48							Decompressing record data to find the next record is considerable effort
49							for larger records, but is not counted for very small records that the
50							system is likely to already have cached after the header has been read.
51
52							=cut
53
54							# These provide a simple mechanism to limit the scope of a search that is
55							# not producing results. Both are localized in the top-level calls.
56
57							our $Patience = 10000; # How much effort to put into a search?
58							our $Effort = 0; # How much have we done so far during this search?
59
60							# Most I/O incurs "effort", represented by incrementing $Effort, while
61							# partial success (finding an interesting record) increases "patience",
62							# represented by incrementing $Patience. The search stops when either
63							# there are no more places to look or $Effort exceeds $Patience.
64
65							=item %WARC::Record::Logical::Heuristics::Effort
66
67							This internal hash indicates how costly certain operations should be
68							considered. The keys and their meanings are subject to change at whim, but
69							this is available for quick tuning if needed. Generally, the better
70							solution is to index your data rather than spend time tuning heuristics.
71
72							=cut
73
74							our %Effort =
75							(# read_record:
76							# effort incurred to read a record header, regardless of compression
77							read_record => 5,
78
79							# gzread_data_per_tick:
80							# number of bytes to read while advancing past a compressed record to
81							# incur one effort point; effort incurred rounds down, even to zero
82							#
83							# this value is a shot-in-the-dark estimate that gunzipping 320 KiB is
84							# equivalent to the open/seek/read process for loading a record header
85							gzread_data_per_tick => 64 * 1024,
86
87							# readdir_files_per_tick:
88							# number of file names to read and check while scanning a directory for
89							# to incur one effort point; effort incurred rounds down, even to zero
90							#
91							# this value is a shot-in-the-dark estimate that reading/matching 1600
92							# file names is equivalent to loading a record header; this estimate
93							# may be high or low depending on the number of axes used in the search
94							readdir_files_per_tick => 320,
95							);
96
97							# Internal functions:
98
99							## @axes = _split_digit_spans( $filename )
100							##
101							## Extract possible sequence numbers from $filename and return list of
102							## array references [PREFIX, NUMBER, SUFFIX] where NUMBER is a field that
103							## can be adjusted to find "nearby" files if NUMBER turns out to actually
104							## be a sequence number. Finds numerous false matches in normal use, but
105							## broad searches cost only time while excessive narrowing causes failure.
106							##
107							## Does not perform I/O; does not increment $Effort.
108
109							sub _split_digit_spans ($) {
110	38			38		4707	my $name = shift;
111	38					55	my @axes = ();
112
113							# Split on zero-width boundaries between digits and non-digits.
114	38					788	my @pieces = split /(?=[0-9])(?<=[^0-9])\|(?=[^0-9])(?<=[0-9])/, $name;
115							# The @pieces array now contains alternating spans of digits and non-digits.
116
117	38					116	for (my $i = 0; $i < @pieces; $i++) {
118	524	100	100			1557	next unless ($pieces[$i] =~ /^[0-9]+$/ && length($pieces[$i]) < 9);
119							# More than 8 digits is probably not a sequence number and may be
120							# beyond the range of an integer anyway. Use indexes instead of
121							# heuristics if you need to work with a billion WARC files.
122	237					1139	push @axes, [join('', @pieces[0..($i-1)]),
123							$pieces[$i], join('', @pieces[($i+1)..$#pieces])];
124							}
125
126	38					166	return @axes;
127							}
128
129							## @found = _find_nearby_files( $direction, @axes )
130							##
131							## Locate existing files that appear to be part of a contiguous sequence
132							## along an axis in @axes. The $direction argument is either +1 to search
133							## for higher numbers or -1 to search for lower numbers. A direction
134							## value with a magnitude greater than 1 results in skipping possibilities
135							## during the search.
136							##
137							## Returns a list of array references reflecting the files along each axis
138							## from the argument list but omitting axes on which no files were found.
139							##
140							## Performs only directory lookups, which have highly unpredictable costs
141							## and are usually cached by the system; does not increment $Effort.
142
143							sub _find_nearby_files ($@) {
144	20			20		2234	my $direction = shift;
145	20					34	my @found = ();
146
147	20					32	foreach my $axis (@_) {
148	180					220	my @files = ();
149	180					340	my $i = $axis->[1] + $direction; my $file;
	180					194
150	180					2275	while (-f ($file = join '', ($axis->[0],
151							sprintf('%0*d', length $axis->[1], $i),
152							$axis->[2])))
153	42					121	{ push @files, $file; $i += $direction }
	42					548
154	180	100				695	push @found, \@files if scalar @files;
155							}
156
157	20					61	return @found;
158							}
159
160							## @found = _scan_directory_for_axes( $dirname, @axes )
161							##
162							## Locate existing files that may appear to be part of a sequence along an
163							## axis in @axes, using wildcards for long digit spans.
164							##
165							## The $dirname argument specifies the name of a directory to search and
166							## all @axes are interpreted relative to $dirname. This differs from
167							## _find_nearby_files where each axis specifies full absolute filenames.
168							## For this function, the axes are strictly filenames with no directory.
169							##
170							## Returns a list of array references reflecting the files along each axis
171							## from the argument list but omitting axes on which no files were found.
172							##
173							## Performs directory reads; increments $Effort to count file names read.
174
175							sub _scan_directory_for_axes ($@) {
176	16			16		828	my $dirname = shift;
177	16					26	my $read_count = 0;
178
179							my @re = map {
180	16					30	my $pre = quotemeta $_->[0]; my $post = quotemeta $_->[2];
	60					175
	60					86
181	60					129	$pre =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
	14					54
182	60					136	$post =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
	2					13
183	60					81	my $midlen = length $_->[1]; qr/^$pre[0-9]{$midlen}$post/ } @_;
	60					907
184
185	16					29	my $filename;
186	16					22	my @found = ();
187	16	100				819	opendir my $dir, $dirname or croak "$dirname: $!";
188	15					533	while (defined ($filename = readdir $dir)) {
189	435					745	foreach (0 .. $#re) {
190	1740	100				3788	if ($filename =~ $re[$_])
191	148					182	{ push @{$found[$_]}, $filename }
	148					278
192							}
193	435					920	$read_count++;
194							}
195	15					199	closedir $dir;
196
197	15					94	$Effort += int($read_count / $Effort{readdir_files_per_tick});
198	15					38	return grep {scalar @$_} @found;
	60					195
199							}
200
201							## @similar = _find_similar_files( $seed )
202							##
203							## Locate existing files that may appear to be part of a sequence involving
204							## any digit span in $seed, using wildcards for long digit spans and
205							## searching only the directory containing $seed.
206							##
207							## Returns a list of array references, each containing two array references
208							## for files sorting before and after $seed, reflecting the files along
209							## each axis derived from $seed on which files other than $seed were found.
210							##
211							## Uses _scan_directory_for_axes; does not perform I/O directly.
212
213							sub _find_similar_files ($) {
214	12			12		4792	my $seedfile = shift;
215
216	12					30	my $fs_volname; my $dirname; my $filename;
	12					0
217	12					126	($fs_volname, $dirname, $filename) = File::Spec->splitpath($seedfile);
218
219	12					85	my @found = _scan_directory_for_axes
220							(File::Spec->catpath($fs_volname, $dirname, ''),
221							_split_digit_spans $filename);
222	12					41	my @similar = ();
223	12					23	foreach my $axis_files (@found) {
224	48					65	my @before = (); my @after = ();
	48					54
225	48					64	foreach my $fname (@$axis_files) {
226	114	100				221	if ($fname lt $filename) {
		100
227	32					151	push @before, File::Spec->catpath($fs_volname, $dirname, $fname);
228							} elsif ($fname gt $filename) {
229	34					152	push @after, File::Spec->catpath($fs_volname, $dirname, $fname);
230							}
231							}
232	18					42	push @similar, [[sort {$a cmp $b} @before],
233	48	100				146	[sort {$a cmp $b} @after]] if @before + @after;
	28					45
234							}
235
236	12					45	return @similar;
237							}
238
239							## ($checkpoint, @records) =
240							## _scan_volume( $volume, $start, $end, [$field, $value]... )
241							##
242							## Search $volume for segment records where any $field matches $value
243							## starting at offset $start and ending at or after offset $end. If $end
244							## is an undefined value, searches until end-of-file.
245							##
246							## Only returns records that have a 'WARC-Segment-Number' header.
247							##
248							## The returned $checkpoint is the last record examined, regardless of
249							## header values, and provides a valid offset for resuming a search.
250
251							sub _scan_volume ($$$@) {
252	97			97		3549	my $volume = shift;
253	97					126	my $start = shift;
254	97					112	my $end = shift;
255
256	97					212	my $record = $volume->record_at($start);
257	97					347	my @records = ();
258
259	97		100			344	while ($record && (!defined $end \|\| $record->offset <= $end)) {
			100
260	885					1788	$Effort += $Effort{read_record};
261							next unless (defined $record->field('WARC-Segment-Number')
262	885	100	100			1443	&& grep {defined $record->field($_->[0])} @_);
	327					619
263	224	100				360	push @records, $record if grep {defined $record->field($_->[0])
	286	100				504
264							&& $record->field($_->[0]) eq $_->[1]} @_;
265							} continue { $Effort += int($record->field('Content-Length')
266							/ $Effort{gzread_data_per_tick})
267							if (defined $record->{compression}
268	885	100	100			1633	&& !defined $record->{sl_packed_size});
269	885					1424	$record = $record->next }
270
271	97					299	return $record, @records;
272							}
273
274							=item ( $first_segment, @clues ) = find_first_segment( $record )
275
276							Attempt to locate the first segment of the logical record suggested by the
277							given record without using indexes. Croaks if given a record that does not
278							appear to have been written using WARC segmentation. Returns a
279							C object for the first record and a list of other objects
280							that may be useful for locating continuation records. Returns undef in the
281							first slot if no clear first segment was found, but can still return other
282							records encountered during the search even if the search was ultimately
283							unsuccessful.
284
285							=cut
286
287							## Each "clue" can be a WARC::Record, or a hint in the form of [key => value].
288							##
289							## The hint keys currently are:
290							##
291							## tail => $record
292							## last record examined in initial volume
293							## (a good starting point to search for more segments)
294							##
295							## files_on_axes => [$filename, ...]...
296							## array of arrays from _find_nearby_files
297							## files_from_dir => [[$filename...], [$filename...]]...
298							## array of arrays from _find_similar_files
299							## Note that the filenames are set to undef in these hints as the
300							## corresponding WARC volumes are scanned, with any relevant records
301							## added directly to the clue list as they are found.
302
303							sub find_first_segment {
304	11			11	1	569	local $Patience = $Patience;
305	11					21	local $Effort = 0;
306
307	11					18	my $initial = shift;
308
309	11	100				31	croak 'searching for segments for unsegmented record'
310							unless defined $initial->field('WARC-Segment-Number');
311
312	10					27	my $origin_id = $initial->field('WARC-Segment-Origin-ID');
313	10					17	my @clues = (); my $point; my @records;
	10					18
314
315							# First we search the volume containing the initial record, since
316							# multiple WARC files may have been concatenated together after writing.
317	10					23	($point, @records) = _scan_volume $initial->volume, 0, $initial->offset,
318							[WARC_Segment_Origin_ID => $origin_id], [WARC_Record_ID => $origin_id];
319							# ... @records will always include $initial ...
320	10					32	push @clues, @records, [tail => $point];
321
322	10					23	foreach my $record (@records) {
323	10	100				24	return $record, @clues if $record->field('WARC-Record-ID') eq $origin_id;
324							}
325	7					16	$Patience += $Effort * ((scalar @records) - 1);
326	7	100				20	return undef, @clues if $Effort > $Patience;
327
328							# If we get this far, the first segment must be in another volume.
329							{
330	6					14	my @simple_axes = _split_digit_spans $initial->volume->filename;
331	6					18	my @nearby = _find_nearby_files -1, @simple_axes;
332
333							# A simple sequence number may be in use; we can check these volumes
334							# before reading the directory to handle varying timestamps.
335	6	100				18	push @clues, [files_on_axes => @nearby] if scalar @nearby;
336	6					21	foreach my $axis_files (reverse @nearby) {
337							# Work backwards on the assumption that sequence numbers are nearer
338							# to the end of the filename. (Correct for Wget and Wpull.)
339	4					9	foreach my $name (@$axis_files) {
340	6					8	my $previousEffort = $Effort;
341	6					17	my $volume = mount WARC::Volume ($name);
342	6					136	(undef, @records) = _scan_volume $volume, 0, undef,
343							[WARC_Segment_Origin_ID => $origin_id],
344							[WARC_Record_ID => $origin_id];
345	6					14	push @clues, @records; $name = undef;
	6					9
346	6					13	foreach my $record (@records) {
347	3	100				8	return $record, @clues
348							if $record->field('WARC-Record-ID') eq $origin_id;
349							}
350	5					10	$Patience += ($Effort - $previousEffort) * scalar @records;
351	5	100				25	return undef, @clues if $Effort > $Patience;
352							}
353							}
354							}
355
356							# If we get this far, the first segment is in another volume and multiple
357							# numbers must change to find that other volume. Assume that timestamps
358							# are in use in the file names, confounding the simple sequence search.
359							{
360	6					11	my @nearby = _find_similar_files $initial->volume->filename;
	4					9
	4					12
361
362	4	100				13	push @clues, [files_from_dir => @nearby] if scalar @nearby;
363							# Work forwards on the assumption that sequence numbers are nearer to
364							# the beginning of the filename. (Correct in Internet Archive samples.)
365	4					9	foreach my $fname ((map {reverse @{$_->[0]}} @nearby),
	8					11
	8					14
366							# work backwards within the "before" list on each axis
367							# ... and forwards within the "after" list on each axis
368	8					10	(map {@{$_->[1]}} @nearby)) {
	8					16
369	9					13	my $previousEffort = $Effort;
370	9					28	my $volume = mount WARC::Volume ($fname);
371	9					141	(undef, @records) = _scan_volume $volume, 0, undef,
372							[WARC_Segment_Origin_ID => $origin_id],
373							[WARC_Record_ID => $origin_id];
374	9					20	push @clues, @records; $fname = undef;
	9					11
375	9					16	foreach my $record (@records) {
376	4	100				9	return $record, @clues
377							if $record->field('WARC-Record-ID') eq $origin_id;
378							}
379	7					13	$Patience += ($Effort - $previousEffort) * scalar @records;
380	7	100				23	return undef, @clues if $Effort > $Patience;
381							}
382							}
383
384							# If we get this far, we have run out of places to look and the user will
385							# need to build an index instead of relying on heuristics.
386	1					6	return undef, @clues;
387							}
388
389							=item ( @segments ) = find_continuation( $first_segment, @clues )
390
391							Attempt to locate the continuation segments of a logical record without
392							using indexes. Uses the clues returned from C to aid
393							in the search and returns a list of continuation records found that appear
394							to be part of the same logical record as the given first segment.
395
396							=cut
397
398							sub _add_segments (\$\@\@) {
399	78			78		97	my $total_segment_count_ref = shift;
400	78					92	my $have_segments_ref = shift;
401	78					90	my $new_segments_ref = shift;
402
403	78					148	foreach (@$new_segments_ref) {
404	38					98	$have_segments_ref->[$_->field('WARC-Segment-Number')]++;
405	38	100				76	$$total_segment_count_ref = $_->field('WARC-Segment-Number')
406							if defined $_->field('WARC-Segment-Total-Length');
407							}
408							}
409							sub _have_all_segments_p ($@) {
410	86			86		104	my $total_segment_count = shift;
411
412							# We cannot have all segments if we have not seen the last segment yet.
413	86	100				333	return 0 unless defined $total_segment_count;
414
415							# We have seen the last segment, do we have all of the others?
416	24	100				42	for (my $i = 2; $i < $total_segment_count; $i++) { return 0 unless $_[$i] }
	59					142
417							# Start the search at 2 because offsets 0 and 1 are not used here.
418
419	9					62	return 1;
420							}
421
422							sub find_continuation {
423	16			16	1	4769	local $Patience = $Patience;
424	16					28	local $Effort = 0;
425
426	16					30	my $first_segment = shift; my $origin_id = $first_segment->id;
	16					46
427
428							# First we unpack the clues and check if all segments were found while
429							# searching for the first segment.
430	16					32	my @segments = (); my @nearby_volume_files = ();
	16					25
431	16					25	my $have_tail = 0; my $point = undef;
	16					23
432	16					21	my @similar_volume_files_before = (); my @similar_volume_files_after = ();
	16					22
433	16					36	foreach my $clue (@_) {
434	35	100				95	if (UNIVERSAL::isa($clue, 'WARC::Record')) {
		100
435	21	100				58	push @segments, $clue unless $clue == $first_segment;
436							} elsif (ref $clue eq 'ARRAY') {
437	13					23	my $tag = shift @$clue;
438	13	100				33	if ($tag eq 'tail') {
		100
		100
439	7					10	$have_tail = 1;
440	7					14	$point = shift @$clue;
441							} elsif ($tag eq 'files_on_axes') {
442	3					32	push @nearby_volume_files, map {[grep defined, @$_]} @$clue;
	6					23
443							} elsif ($tag eq 'files_from_dir') {
444	2					5	foreach (@$clue) {
445	6					8	push @similar_volume_files_before, [grep defined, @{$_->[0]}];
	6					14
446	6					7	push @similar_volume_files_after, [grep defined, @{$_->[1]}];
	6					13
447							}
448	1					9	} else { die "unrecognized hint tag: $tag" }
449	1					8	} else { die "unrecognized clue" }
450	33					65	$clue = undef;
451							}
452	14					62	@similar_volume_files_before = grep {scalar @$_} @similar_volume_files_before;
	6					12
453	14					20	@similar_volume_files_after = grep {scalar @$_} @similar_volume_files_after;
	6					10
454
455	14					19	my @have_segments = (); my $total_segment_count = undef;
	14					23
456	14					38	_add_segments $total_segment_count, @have_segments, @segments;
457
458	14	100				27	return @segments if _have_all_segments_p $total_segment_count, @have_segments;
459
460							# If we get to here, at least one segment was not found while searching
461							# for the first segment, so we will need to search too.
462	12					20	my @records = ();
463
464							# Pick up where find_first_segment left off...
465	12	100				33	if ($point) {
		100
466	4					12	(undef, @records) = _scan_volume $point->volume, $point->offset, undef,
467							[WARC_Segment_Origin_ID => $origin_id];
468	4					14	_add_segments $total_segment_count, @have_segments, @records;
469	4					8	push @segments, @records;
470							} elsif (!$have_tail) {
471							# The search may have begun with the first segment directly; ensure
472							# that we scan the entire volume containing the first segment later.
473	7					18	push @nearby_volume_files, [$first_segment->volume->filename];
474							}
475	12					110	$Patience += $Effort * scalar @records;
476							return @segments
477	12	100	100			22	if (_have_all_segments_p $total_segment_count, @have_segments
478							or $Effort > $Patience);
479
480							# Search for more volumes in a simple sequence...
481							{
482	10					19	my @simple_axes = _split_digit_spans $first_segment->volume->filename;
483	10					27	my @nearby = _find_nearby_files 1, @simple_axes;
484
485							# Were more volumes found in the simple sequence search now or previously?
486	10					32	foreach my $axis_files ((reverse @nearby),
487							(reverse @nearby_volume_files)) {
488							# Work backwards on the assumption that sequence numbers are nearer
489							# to the end of the filename. (Correct for Wget and Wpull.)
490	17					30	foreach my $name (@$axis_files) {
491	27					33	my $previousEffort = $Effort;
492	27					69	my $volume = mount WARC::Volume ($name);
493	27					424	(undef, @records) = _scan_volume $volume, 0, undef,
494							[WARC_Segment_Origin_ID => $origin_id];
495	27					74	_add_segments $total_segment_count, @have_segments, @records;
496	27					41	push @segments, @records;
497	27					43	$Patience += ($Effort - $previousEffort) * scalar @records;
498							return @segments
499	27	100	100			51	if (_have_all_segments_p $total_segment_count, @have_segments
500							or $Effort > $Patience);
501							}
502							}
503							}
504
505							# Search for more volumes by directory scan...
506							{
507	10	100				16	unless (@similar_volume_files_before + @similar_volume_files_after) {
	6					12
	6					15
508							# Unlike the simple sequence search, the directory scan finds files
509							# in both directions from the starting point on all axes, but it may
510							# not have been needed to find the first segment. Do it now if not.
511	5					13	my @nearby = _find_similar_files $first_segment->volume->filename;
512	5					13	foreach (@nearby) {
513	13					17	push @similar_volume_files_before, $_->[0];
514	13					21	push @similar_volume_files_after, $_->[1];
515							}
516							}
517							# Any interesting records in volumes before the volume containing the
518							# initial record were probably found while locating the first segment.
519	6					12	foreach my $axis_files (@similar_volume_files_after,
520							reverse @similar_volume_files_before) {
521							# Work forwards on the assumption that sequence numbers are nearer to
522							# the beginning of the filename. (Correct in Internet Archive samples.)
523	23					35	foreach my $fname (@$axis_files) {
524	33					42	my $previousEffort = $Effort;
525	33					84	my $volume = mount WARC::Volume ($fname);
526	33					486	(undef, @records) = _scan_volume $volume, 0, undef,
527							[WARC_Segment_Origin_ID => $origin_id];
528	33					87	_add_segments $total_segment_count, @have_segments, @records;
529	33					55	push @segments, @records;
530	33					51	$Patience += ($Effort - $previousEffort) * scalar @records;
531							return @segments
532	33	100	100			60	if (_have_all_segments_p $total_segment_count, @have_segments
533							or $Effort > $Patience);
534							}
535							}
536							}
537
538							# If we get to here, we have run out of places to look and the user will
539							# need to build an index instead of relying on heuristics.
540	2					17	return @segments;
541							}
542
543							=back
544
545							=cut
546
547							1;
548							__END__