| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WARC::Record::Logical::Heuristics;			# -*- CPerl -*- | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 2 |  |  | 2 |  | 61299 | use strict; | 
|  | 2 |  |  |  |  | 12 |  | 
|  | 2 |  |  |  |  | 55 |  | 
| 4 | 2 |  |  | 2 |  | 8 | use warnings; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 74 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | our @ISA = qw(); | 
| 7 |  |  |  |  |  |  |  | 
| 8 | 2 |  |  | 2 |  | 348 | use WARC; *WARC::Record::Logical::Heuristics::VERSION = \$WARC::VERSION; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 75 |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 | 2 |  |  | 2 |  | 11 | use Carp; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 108 |  | 
| 11 | 2 |  |  | 2 |  | 11 | use File::Spec; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 4383 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | =head1 NAME | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | WARC::Record::Logical::Heuristics - heuristics for locating record segments | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | use WARC::Record::Logical::Heuristics; | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | This is an internal module that provides functions for locating record | 
| 24 |  |  |  |  |  |  | segments when the needed information is not available from an index. | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | These mostly assume that IIPC WARC guidelines have been followed, as | 
| 27 |  |  |  |  |  |  | otherwise there simply is no efficient solution. | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | Implementations vary, however, with some using only an incrementing serial | 
| 30 |  |  |  |  |  |  | number and a constant timestamp from the initiation of the crawl job, while | 
| 31 |  |  |  |  |  |  | the guidelines and specification envision a timestamp reflecting the first | 
| 32 |  |  |  |  |  |  | write to that specific file rather than the start of the crawl.  Constant | 
| 33 |  |  |  |  |  |  | timestamps are checked first, since the search is simpler. | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | =over | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =item $WARC::Record::Logical::Heuristics::Patience | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | This variable sets a threshold used to limit the reach of an unproductive | 
| 40 |  |  |  |  |  |  | search.  This module tracks the "effort" expended (I/O performed) during a | 
| 41 |  |  |  |  |  |  | search and abandons the search if the threshold is exceeded.  Finding | 
| 42 |  |  |  |  |  |  | results dynamically (and temporarily) increases this threshold during a | 
| 43 |  |  |  |  |  |  | search, such that this really sets how far the search will go between | 
| 44 |  |  |  |  |  |  | results before giving up and concluding that there are no more results. | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | The search will reach farther if either the WARC files are not compressed, | 
| 47 |  |  |  |  |  |  | or the "sl" GZIP extension documented in L is used. | 
| 48 |  |  |  |  |  |  | Decompressing record data to find the next record is considerable effort | 
| 49 |  |  |  |  |  |  | for larger records, but is not counted for very small records that the | 
| 50 |  |  |  |  |  |  | system is likely to already have cached after the header has been read. | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | =cut | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | # These provide a simple mechanism to limit the scope of a search that is | 
| 55 |  |  |  |  |  |  | # not producing results.  Both are localized in the top-level calls. | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | our $Patience = 10000;	# How much effort to put into a search? | 
| 58 |  |  |  |  |  |  | our $Effort = 0;	# How much have we done so far during this search? | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | # Most I/O incurs "effort", represented by incrementing $Effort, while | 
| 61 |  |  |  |  |  |  | # partial success (finding an interesting record) increases "patience", | 
| 62 |  |  |  |  |  |  | # represented by incrementing $Patience.  The search stops when either | 
| 63 |  |  |  |  |  |  | # there are no more places to look or $Effort exceeds $Patience. | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | =item %WARC::Record::Logical::Heuristics::Effort | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | This internal hash indicates how costly certain operations should be | 
| 68 |  |  |  |  |  |  | considered.  The keys and their meanings are subject to change at whim, but | 
| 69 |  |  |  |  |  |  | this is available for quick tuning if needed.  Generally, the better | 
| 70 |  |  |  |  |  |  | solution is to index your data rather than spend time tuning heuristics. | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | =cut | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | our %Effort = | 
| 75 |  |  |  |  |  |  | (# read_record: | 
| 76 |  |  |  |  |  |  | #  effort incurred to read a record header, regardless of compression | 
| 77 |  |  |  |  |  |  | read_record => 5, | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | # gzread_data_per_tick: | 
| 80 |  |  |  |  |  |  | #  number of bytes to read while advancing past a compressed record to | 
| 81 |  |  |  |  |  |  | #  incur one effort point; effort incurred rounds down, even to zero | 
| 82 |  |  |  |  |  |  | # | 
| 83 |  |  |  |  |  |  | #  this value is a shot-in-the-dark estimate that gunzipping 320 KiB is | 
| 84 |  |  |  |  |  |  | #  equivalent to the open/seek/read process for loading a record header | 
| 85 |  |  |  |  |  |  | gzread_data_per_tick => 64 * 1024, | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | # readdir_files_per_tick: | 
| 88 |  |  |  |  |  |  | #  number of file names to read and check while scanning a directory for | 
| 89 |  |  |  |  |  |  | #  to incur one effort point; effort incurred rounds down, even to zero | 
| 90 |  |  |  |  |  |  | # | 
| 91 |  |  |  |  |  |  | #  this value is a shot-in-the-dark estimate that reading/matching 1600 | 
| 92 |  |  |  |  |  |  | #  file names is equivalent to loading a record header; this estimate | 
| 93 |  |  |  |  |  |  | #  may be high or low depending on the number of axes used in the search | 
| 94 |  |  |  |  |  |  | readdir_files_per_tick => 320, | 
| 95 |  |  |  |  |  |  | ); | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | # Internal functions: | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | ## @axes = _split_digit_spans( $filename ) | 
| 100 |  |  |  |  |  |  | ## | 
| 101 |  |  |  |  |  |  | ##  Extract possible sequence numbers from $filename and return list of | 
| 102 |  |  |  |  |  |  | ##  array references [PREFIX, NUMBER, SUFFIX] where NUMBER is a field that | 
| 103 |  |  |  |  |  |  | ##  can be adjusted to find "nearby" files if NUMBER turns out to actually | 
| 104 |  |  |  |  |  |  | ##  be a sequence number.  Finds numerous false matches in normal use, but | 
| 105 |  |  |  |  |  |  | ##  broad searches cost only time while excessive narrowing causes failure. | 
| 106 |  |  |  |  |  |  | ## | 
| 107 |  |  |  |  |  |  | ##  Does not perform I/O; does not increment $Effort. | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | sub _split_digit_spans ($) { | 
| 110 | 38 |  |  | 38 |  | 4707 | my $name = shift; | 
| 111 | 38 |  |  |  |  | 55 | my @axes = (); | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | # Split on zero-width boundaries between digits and non-digits. | 
| 114 | 38 |  |  |  |  | 788 | my @pieces = split /(?=[0-9])(?<=[^0-9])|(?=[^0-9])(?<=[0-9])/, $name; | 
| 115 |  |  |  |  |  |  | # The @pieces array now contains alternating spans of digits and non-digits. | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 38 |  |  |  |  | 116 | for (my $i = 0; $i < @pieces; $i++) { | 
| 118 | 524 | 100 | 100 |  |  | 1557 | next unless ($pieces[$i] =~ /^[0-9]+$/ && length($pieces[$i]) < 9); | 
| 119 |  |  |  |  |  |  | # More than 8 digits is probably not a sequence number and may be | 
| 120 |  |  |  |  |  |  | # beyond the range of an integer anyway.  Use indexes instead of | 
| 121 |  |  |  |  |  |  | # heuristics if you need to work with a billion WARC files. | 
| 122 | 237 |  |  |  |  | 1139 | push @axes, [join('', @pieces[0..($i-1)]), | 
| 123 |  |  |  |  |  |  | $pieces[$i], join('', @pieces[($i+1)..$#pieces])]; | 
| 124 |  |  |  |  |  |  | } | 
| 125 |  |  |  |  |  |  |  | 
| 126 | 38 |  |  |  |  | 166 | return @axes; | 
| 127 |  |  |  |  |  |  | } | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | ## @found = _find_nearby_files( $direction, @axes ) | 
| 130 |  |  |  |  |  |  | ## | 
| 131 |  |  |  |  |  |  | ##  Locate existing files that appear to be part of a contiguous sequence | 
| 132 |  |  |  |  |  |  | ##  along an axis in @axes.  The $direction argument is either +1 to search | 
| 133 |  |  |  |  |  |  | ##  for higher numbers or -1 to search for lower numbers.  A direction | 
| 134 |  |  |  |  |  |  | ##  value with a magnitude greater than 1 results in skipping possibilities | 
| 135 |  |  |  |  |  |  | ##  during the search. | 
| 136 |  |  |  |  |  |  | ## | 
| 137 |  |  |  |  |  |  | ##  Returns a list of array references reflecting the files along each axis | 
| 138 |  |  |  |  |  |  | ##  from the argument list but omitting axes on which no files were found. | 
| 139 |  |  |  |  |  |  | ## | 
| 140 |  |  |  |  |  |  | ##  Performs only directory lookups, which have highly unpredictable costs | 
| 141 |  |  |  |  |  |  | ##  and are usually cached by the system; does not increment $Effort. | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | sub _find_nearby_files ($@) { | 
| 144 | 20 |  |  | 20 |  | 2234 | my $direction = shift; | 
| 145 | 20 |  |  |  |  | 34 | my @found = (); | 
| 146 |  |  |  |  |  |  |  | 
| 147 | 20 |  |  |  |  | 32 | foreach my $axis (@_) { | 
| 148 | 180 |  |  |  |  | 220 | my @files = (); | 
| 149 | 180 |  |  |  |  | 340 | my $i = $axis->[1] + $direction; my $file; | 
|  | 180 |  |  |  |  | 194 |  | 
| 150 | 180 |  |  |  |  | 2275 | while (-f ($file = join '', ($axis->[0], | 
| 151 |  |  |  |  |  |  | sprintf('%0*d', length $axis->[1], $i), | 
| 152 |  |  |  |  |  |  | $axis->[2]))) | 
| 153 | 42 |  |  |  |  | 121 | { push @files, $file; $i += $direction } | 
|  | 42 |  |  |  |  | 548 |  | 
| 154 | 180 | 100 |  |  |  | 695 | push @found, \@files if scalar @files; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  |  | 
| 157 | 20 |  |  |  |  | 61 | return @found; | 
| 158 |  |  |  |  |  |  | } | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | ## @found = _scan_directory_for_axes( $dirname, @axes ) | 
| 161 |  |  |  |  |  |  | ## | 
| 162 |  |  |  |  |  |  | ##  Locate existing files that may appear to be part of a sequence along an | 
| 163 |  |  |  |  |  |  | ##  axis in @axes, using wildcards for long digit spans. | 
| 164 |  |  |  |  |  |  | ## | 
| 165 |  |  |  |  |  |  | ##  The $dirname argument specifies the name of a directory to search and | 
| 166 |  |  |  |  |  |  | ##  all @axes are interpreted relative to $dirname.  This differs from | 
| 167 |  |  |  |  |  |  | ##  _find_nearby_files where each axis specifies full absolute filenames. | 
| 168 |  |  |  |  |  |  | ##  For this function, the axes are strictly filenames with no directory. | 
| 169 |  |  |  |  |  |  | ## | 
| 170 |  |  |  |  |  |  | ##  Returns a list of array references reflecting the files along each axis | 
| 171 |  |  |  |  |  |  | ##  from the argument list but omitting axes on which no files were found. | 
| 172 |  |  |  |  |  |  | ## | 
| 173 |  |  |  |  |  |  | ##  Performs directory reads; increments $Effort to count file names read. | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | sub _scan_directory_for_axes ($@) { | 
| 176 | 16 |  |  | 16 |  | 828 | my $dirname = shift; | 
| 177 | 16 |  |  |  |  | 26 | my $read_count = 0; | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | my @re = map { | 
| 180 | 16 |  |  |  |  | 30 | my $pre = quotemeta $_->[0]; my $post = quotemeta $_->[2]; | 
|  | 60 |  |  |  |  | 175 |  | 
|  | 60 |  |  |  |  | 86 |  | 
| 181 | 60 |  |  |  |  | 129 | $pre  =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg; | 
|  | 14 |  |  |  |  | 54 |  | 
| 182 | 60 |  |  |  |  | 136 | $post =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg; | 
|  | 2 |  |  |  |  | 13 |  | 
| 183 | 60 |  |  |  |  | 81 | my $midlen = length $_->[1]; qr/^$pre[0-9]{$midlen}$post/ } @_; | 
|  | 60 |  |  |  |  | 907 |  | 
| 184 |  |  |  |  |  |  |  | 
| 185 | 16 |  |  |  |  | 29 | my $filename; | 
| 186 | 16 |  |  |  |  | 22 | my @found = (); | 
| 187 | 16 | 100 |  |  |  | 819 | opendir my $dir, $dirname or croak "$dirname: $!"; | 
| 188 | 15 |  |  |  |  | 533 | while (defined ($filename = readdir $dir)) { | 
| 189 | 435 |  |  |  |  | 745 | foreach (0 .. $#re) { | 
| 190 | 1740 | 100 |  |  |  | 3788 | if ($filename =~ $re[$_]) | 
| 191 | 148 |  |  |  |  | 182 | { push @{$found[$_]}, $filename } | 
|  | 148 |  |  |  |  | 278 |  | 
| 192 |  |  |  |  |  |  | } | 
| 193 | 435 |  |  |  |  | 920 | $read_count++; | 
| 194 |  |  |  |  |  |  | } | 
| 195 | 15 |  |  |  |  | 199 | closedir $dir; | 
| 196 |  |  |  |  |  |  |  | 
| 197 | 15 |  |  |  |  | 94 | $Effort += int($read_count / $Effort{readdir_files_per_tick}); | 
| 198 | 15 |  |  |  |  | 38 | return grep {scalar @$_} @found; | 
|  | 60 |  |  |  |  | 195 |  | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | ## @similar = _find_similar_files( $seed ) | 
| 202 |  |  |  |  |  |  | ## | 
| 203 |  |  |  |  |  |  | ## Locate existing files that may appear to be part of a sequence involving | 
| 204 |  |  |  |  |  |  | ## any digit span in $seed, using wildcards for long digit spans and | 
| 205 |  |  |  |  |  |  | ## searching only the directory containing $seed. | 
| 206 |  |  |  |  |  |  | ## | 
| 207 |  |  |  |  |  |  | ## Returns a list of array references, each containing two array references | 
| 208 |  |  |  |  |  |  | ## for files sorting before and after $seed, reflecting the files along | 
| 209 |  |  |  |  |  |  | ## each axis derived from $seed on which files other than $seed were found. | 
| 210 |  |  |  |  |  |  | ## | 
| 211 |  |  |  |  |  |  | ## Uses _scan_directory_for_axes; does not perform I/O directly. | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | sub _find_similar_files ($) { | 
| 214 | 12 |  |  | 12 |  | 4792 | my $seedfile = shift; | 
| 215 |  |  |  |  |  |  |  | 
| 216 | 12 |  |  |  |  | 30 | my $fs_volname; my $dirname; my $filename; | 
|  | 12 |  |  |  |  | 0 |  | 
| 217 | 12 |  |  |  |  | 126 | ($fs_volname, $dirname, $filename) = File::Spec->splitpath($seedfile); | 
| 218 |  |  |  |  |  |  |  | 
| 219 | 12 |  |  |  |  | 85 | my @found = _scan_directory_for_axes | 
| 220 |  |  |  |  |  |  | (File::Spec->catpath($fs_volname, $dirname, ''), | 
| 221 |  |  |  |  |  |  | _split_digit_spans $filename); | 
| 222 | 12 |  |  |  |  | 41 | my @similar = (); | 
| 223 | 12 |  |  |  |  | 23 | foreach my $axis_files (@found) { | 
| 224 | 48 |  |  |  |  | 65 | my @before = (); my @after = (); | 
|  | 48 |  |  |  |  | 54 |  | 
| 225 | 48 |  |  |  |  | 64 | foreach my $fname (@$axis_files) { | 
| 226 | 114 | 100 |  |  |  | 221 | if ($fname lt $filename) { | 
|  |  | 100 |  |  |  |  |  | 
| 227 | 32 |  |  |  |  | 151 | push @before, File::Spec->catpath($fs_volname, $dirname, $fname); | 
| 228 |  |  |  |  |  |  | } elsif ($fname gt $filename) { | 
| 229 | 34 |  |  |  |  | 152 | push @after,  File::Spec->catpath($fs_volname, $dirname, $fname); | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  | } | 
| 232 | 18 |  |  |  |  | 42 | push @similar, [[sort {$a cmp $b} @before], | 
| 233 | 48 | 100 |  |  |  | 146 | [sort {$a cmp $b} @after]] if @before + @after; | 
|  | 28 |  |  |  |  | 45 |  | 
| 234 |  |  |  |  |  |  | } | 
| 235 |  |  |  |  |  |  |  | 
| 236 | 12 |  |  |  |  | 45 | return @similar; | 
| 237 |  |  |  |  |  |  | } | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | ## ($checkpoint, @records) = | 
| 240 |  |  |  |  |  |  | ##	_scan_volume( $volume, $start, $end, [$field, $value]... ) | 
| 241 |  |  |  |  |  |  | ## | 
| 242 |  |  |  |  |  |  | ##  Search $volume for segment records where any $field matches $value | 
| 243 |  |  |  |  |  |  | ##  starting at offset $start and ending at or after offset $end.  If $end | 
| 244 |  |  |  |  |  |  | ##  is an undefined value, searches until end-of-file. | 
| 245 |  |  |  |  |  |  | ## | 
| 246 |  |  |  |  |  |  | ##  Only returns records that have a 'WARC-Segment-Number' header. | 
| 247 |  |  |  |  |  |  | ## | 
| 248 |  |  |  |  |  |  | ##  The returned $checkpoint is the last record examined, regardless of | 
| 249 |  |  |  |  |  |  | ##  header values, and provides a valid offset for resuming a search. | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | sub _scan_volume ($$$@) { | 
| 252 | 97 |  |  | 97 |  | 3549 | my $volume = shift; | 
| 253 | 97 |  |  |  |  | 126 | my $start = shift; | 
| 254 | 97 |  |  |  |  | 112 | my $end = shift; | 
| 255 |  |  |  |  |  |  |  | 
| 256 | 97 |  |  |  |  | 212 | my $record = $volume->record_at($start); | 
| 257 | 97 |  |  |  |  | 347 | my @records = (); | 
| 258 |  |  |  |  |  |  |  | 
| 259 | 97 |  | 100 |  |  | 344 | while ($record && (!defined $end || $record->offset <= $end)) { | 
|  |  |  | 100 |  |  |  |  | 
| 260 | 885 |  |  |  |  | 1788 | $Effort += $Effort{read_record}; | 
| 261 |  |  |  |  |  |  | next unless (defined $record->field('WARC-Segment-Number') | 
| 262 | 885 | 100 | 100 |  |  | 1443 | && grep {defined $record->field($_->[0])} @_); | 
|  | 327 |  |  |  |  | 619 |  | 
| 263 | 224 | 100 |  |  |  | 360 | push @records, $record if grep {defined $record->field($_->[0]) | 
|  | 286 | 100 |  |  |  | 504 |  | 
| 264 |  |  |  |  |  |  | && $record->field($_->[0]) eq $_->[1]} @_; | 
| 265 |  |  |  |  |  |  | } continue { $Effort += int($record->field('Content-Length') | 
| 266 |  |  |  |  |  |  | / $Effort{gzread_data_per_tick}) | 
| 267 |  |  |  |  |  |  | if (defined $record->{compression} | 
| 268 | 885 | 100 | 100 |  |  | 1633 | && !defined $record->{sl_packed_size}); | 
| 269 | 885 |  |  |  |  | 1424 | $record = $record->next } | 
| 270 |  |  |  |  |  |  |  | 
| 271 | 97 |  |  |  |  | 299 | return $record, @records; | 
| 272 |  |  |  |  |  |  | } | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | =item ( $first_segment, @clues ) = find_first_segment( $record ) | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  | Attempt to locate the first segment of the logical record suggested by the | 
| 277 |  |  |  |  |  |  | given record without using indexes.  Croaks if given a record that does not | 
| 278 |  |  |  |  |  |  | appear to have been written using WARC segmentation.  Returns a | 
| 279 |  |  |  |  |  |  | C object for the first record and a list of other objects | 
| 280 |  |  |  |  |  |  | that may be useful for locating continuation records.  Returns undef in the | 
| 281 |  |  |  |  |  |  | first slot if no clear first segment was found, but can still return other | 
| 282 |  |  |  |  |  |  | records encountered during the search even if the search was ultimately | 
| 283 |  |  |  |  |  |  | unsuccessful. | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | =cut | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | ## Each "clue" can be a WARC::Record, or a hint in the form of [key => value]. | 
| 288 |  |  |  |  |  |  | ## | 
| 289 |  |  |  |  |  |  | ## The hint keys currently are: | 
| 290 |  |  |  |  |  |  | ## | 
| 291 |  |  |  |  |  |  | ##	tail		=>	$record | 
| 292 |  |  |  |  |  |  | ##		last record examined in initial volume | 
| 293 |  |  |  |  |  |  | ##		(a good starting point to search for more segments) | 
| 294 |  |  |  |  |  |  | ## | 
| 295 |  |  |  |  |  |  | ##	files_on_axes	=>	[$filename, ...]... | 
| 296 |  |  |  |  |  |  | ##		array of arrays from _find_nearby_files | 
| 297 |  |  |  |  |  |  | ##	files_from_dir	=>	[[$filename...], [$filename...]]... | 
| 298 |  |  |  |  |  |  | ##		array of arrays from _find_similar_files | 
| 299 |  |  |  |  |  |  | ##	Note that the filenames are set to undef in these hints as the | 
| 300 |  |  |  |  |  |  | ##	corresponding WARC volumes are scanned, with any relevant records | 
| 301 |  |  |  |  |  |  | ##	added directly to the clue list as they are found. | 
| 302 |  |  |  |  |  |  |  | 
| 303 |  |  |  |  |  |  | sub find_first_segment { | 
| 304 | 11 |  |  | 11 | 1 | 569 | local $Patience = $Patience; | 
| 305 | 11 |  |  |  |  | 21 | local $Effort = 0; | 
| 306 |  |  |  |  |  |  |  | 
| 307 | 11 |  |  |  |  | 18 | my $initial = shift; | 
| 308 |  |  |  |  |  |  |  | 
| 309 | 11 | 100 |  |  |  | 31 | croak 'searching for segments for unsegmented record' | 
| 310 |  |  |  |  |  |  | unless defined $initial->field('WARC-Segment-Number'); | 
| 311 |  |  |  |  |  |  |  | 
| 312 | 10 |  |  |  |  | 27 | my $origin_id = $initial->field('WARC-Segment-Origin-ID'); | 
| 313 | 10 |  |  |  |  | 17 | my @clues = (); my $point; my @records; | 
|  | 10 |  |  |  |  | 18 |  | 
| 314 |  |  |  |  |  |  |  | 
| 315 |  |  |  |  |  |  | # First we search the volume containing the initial record, since | 
| 316 |  |  |  |  |  |  | # multiple WARC files may have been concatenated together after writing. | 
| 317 | 10 |  |  |  |  | 23 | ($point, @records) = _scan_volume $initial->volume, 0, $initial->offset, | 
| 318 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id], [WARC_Record_ID => $origin_id]; | 
| 319 |  |  |  |  |  |  | # ... @records will always include $initial ... | 
| 320 | 10 |  |  |  |  | 32 | push @clues, @records, [tail => $point]; | 
| 321 |  |  |  |  |  |  |  | 
| 322 | 10 |  |  |  |  | 23 | foreach my $record (@records) { | 
| 323 | 10 | 100 |  |  |  | 24 | return $record, @clues if $record->field('WARC-Record-ID') eq $origin_id; | 
| 324 |  |  |  |  |  |  | } | 
| 325 | 7 |  |  |  |  | 16 | $Patience += $Effort * ((scalar @records) - 1); | 
| 326 | 7 | 100 |  |  |  | 20 | return undef, @clues if $Effort > $Patience; | 
| 327 |  |  |  |  |  |  |  | 
| 328 |  |  |  |  |  |  | # If we get this far, the first segment must be in another volume. | 
| 329 |  |  |  |  |  |  | { | 
| 330 | 6 |  |  |  |  | 14 | my @simple_axes = _split_digit_spans $initial->volume->filename; | 
| 331 | 6 |  |  |  |  | 18 | my @nearby = _find_nearby_files -1, @simple_axes; | 
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | # A simple sequence number may be in use; we can check these volumes | 
| 334 |  |  |  |  |  |  | # before reading the directory to handle varying timestamps. | 
| 335 | 6 | 100 |  |  |  | 18 | push @clues, [files_on_axes => @nearby] if scalar @nearby; | 
| 336 | 6 |  |  |  |  | 21 | foreach my $axis_files (reverse @nearby) { | 
| 337 |  |  |  |  |  |  | # Work backwards on the assumption that sequence numbers are nearer | 
| 338 |  |  |  |  |  |  | # to the end of the filename.  (Correct for Wget and Wpull.) | 
| 339 | 4 |  |  |  |  | 9 | foreach my $name (@$axis_files) { | 
| 340 | 6 |  |  |  |  | 8 | my $previousEffort = $Effort; | 
| 341 | 6 |  |  |  |  | 17 | my $volume = mount WARC::Volume ($name); | 
| 342 | 6 |  |  |  |  | 136 | (undef, @records) = _scan_volume $volume, 0, undef, | 
| 343 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id], | 
| 344 |  |  |  |  |  |  | [WARC_Record_ID => $origin_id]; | 
| 345 | 6 |  |  |  |  | 14 | push @clues, @records; $name = undef; | 
|  | 6 |  |  |  |  | 9 |  | 
| 346 | 6 |  |  |  |  | 13 | foreach my $record (@records) { | 
| 347 | 3 | 100 |  |  |  | 8 | return $record, @clues | 
| 348 |  |  |  |  |  |  | if $record->field('WARC-Record-ID') eq $origin_id; | 
| 349 |  |  |  |  |  |  | } | 
| 350 | 5 |  |  |  |  | 10 | $Patience += ($Effort - $previousEffort) * scalar @records; | 
| 351 | 5 | 100 |  |  |  | 25 | return undef, @clues if $Effort > $Patience; | 
| 352 |  |  |  |  |  |  | } | 
| 353 |  |  |  |  |  |  | } | 
| 354 |  |  |  |  |  |  | } | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | # If we get this far, the first segment is in another volume and multiple | 
| 357 |  |  |  |  |  |  | # numbers must change to find that other volume.  Assume that timestamps | 
| 358 |  |  |  |  |  |  | # are in use in the file names, confounding the simple sequence search. | 
| 359 |  |  |  |  |  |  | { | 
| 360 | 6 |  |  |  |  | 11 | my @nearby = _find_similar_files $initial->volume->filename; | 
|  | 4 |  |  |  |  | 9 |  | 
|  | 4 |  |  |  |  | 12 |  | 
| 361 |  |  |  |  |  |  |  | 
| 362 | 4 | 100 |  |  |  | 13 | push @clues, [files_from_dir => @nearby] if scalar @nearby; | 
| 363 |  |  |  |  |  |  | # Work forwards on the assumption that sequence numbers are nearer to | 
| 364 |  |  |  |  |  |  | # the beginning of the filename.  (Correct in Internet Archive samples.) | 
| 365 | 4 |  |  |  |  | 9 | foreach my $fname ((map {reverse @{$_->[0]}} @nearby), | 
|  | 8 |  |  |  |  | 11 |  | 
|  | 8 |  |  |  |  | 14 |  | 
| 366 |  |  |  |  |  |  | # work backwards within the "before" list on each axis | 
| 367 |  |  |  |  |  |  | # ... and forwards within the "after" list on each axis | 
| 368 | 8 |  |  |  |  | 10 | (map {@{$_->[1]}} @nearby)) { | 
|  | 8 |  |  |  |  | 16 |  | 
| 369 | 9 |  |  |  |  | 13 | my $previousEffort = $Effort; | 
| 370 | 9 |  |  |  |  | 28 | my $volume = mount WARC::Volume ($fname); | 
| 371 | 9 |  |  |  |  | 141 | (undef, @records) = _scan_volume $volume, 0, undef, | 
| 372 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id], | 
| 373 |  |  |  |  |  |  | [WARC_Record_ID => $origin_id]; | 
| 374 | 9 |  |  |  |  | 20 | push @clues, @records; $fname = undef; | 
|  | 9 |  |  |  |  | 11 |  | 
| 375 | 9 |  |  |  |  | 16 | foreach my $record (@records) { | 
| 376 | 4 | 100 |  |  |  | 9 | return $record, @clues | 
| 377 |  |  |  |  |  |  | if $record->field('WARC-Record-ID') eq $origin_id; | 
| 378 |  |  |  |  |  |  | } | 
| 379 | 7 |  |  |  |  | 13 | $Patience += ($Effort - $previousEffort) * scalar @records; | 
| 380 | 7 | 100 |  |  |  | 23 | return undef, @clues if $Effort > $Patience; | 
| 381 |  |  |  |  |  |  | } | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  | # If we get this far, we have run out of places to look and the user will | 
| 385 |  |  |  |  |  |  | # need to build an index instead of relying on heuristics. | 
| 386 | 1 |  |  |  |  | 6 | return undef, @clues; | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  | =item ( @segments ) = find_continuation( $first_segment, @clues ) | 
| 390 |  |  |  |  |  |  |  | 
| 391 |  |  |  |  |  |  | Attempt to locate the continuation segments of a logical record without | 
| 392 |  |  |  |  |  |  | using indexes.  Uses the clues returned from C to aid | 
| 393 |  |  |  |  |  |  | in the search and returns a list of continuation records found that appear | 
| 394 |  |  |  |  |  |  | to be part of the same logical record as the given first segment. | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | =cut | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | sub _add_segments (\$\@\@) { | 
| 399 | 78 |  |  | 78 |  | 97 | my $total_segment_count_ref = shift; | 
| 400 | 78 |  |  |  |  | 92 | my $have_segments_ref = shift; | 
| 401 | 78 |  |  |  |  | 90 | my $new_segments_ref = shift; | 
| 402 |  |  |  |  |  |  |  | 
| 403 | 78 |  |  |  |  | 148 | foreach (@$new_segments_ref) { | 
| 404 | 38 |  |  |  |  | 98 | $have_segments_ref->[$_->field('WARC-Segment-Number')]++; | 
| 405 | 38 | 100 |  |  |  | 76 | $$total_segment_count_ref = $_->field('WARC-Segment-Number') | 
| 406 |  |  |  |  |  |  | if defined $_->field('WARC-Segment-Total-Length'); | 
| 407 |  |  |  |  |  |  | } | 
| 408 |  |  |  |  |  |  | } | 
| 409 |  |  |  |  |  |  | sub _have_all_segments_p ($@) { | 
| 410 | 86 |  |  | 86 |  | 104 | my $total_segment_count = shift; | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | # We cannot have all segments if we have not seen the last segment yet. | 
| 413 | 86 | 100 |  |  |  | 333 | return 0 unless defined $total_segment_count; | 
| 414 |  |  |  |  |  |  |  | 
| 415 |  |  |  |  |  |  | # We have seen the last segment, do we have all of the others? | 
| 416 | 24 | 100 |  |  |  | 42 | for (my $i = 2; $i < $total_segment_count; $i++) { return 0 unless $_[$i] } | 
|  | 59 |  |  |  |  | 142 |  | 
| 417 |  |  |  |  |  |  | #  Start the search at 2 because offsets 0 and 1 are not used here. | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 9 |  |  |  |  | 62 | return 1; | 
| 420 |  |  |  |  |  |  | } | 
| 421 |  |  |  |  |  |  |  | 
| 422 |  |  |  |  |  |  | sub find_continuation { | 
| 423 | 16 |  |  | 16 | 1 | 4769 | local $Patience = $Patience; | 
| 424 | 16 |  |  |  |  | 28 | local $Effort = 0; | 
| 425 |  |  |  |  |  |  |  | 
| 426 | 16 |  |  |  |  | 30 | my $first_segment = shift; my $origin_id = $first_segment->id; | 
|  | 16 |  |  |  |  | 46 |  | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | # First we unpack the clues and check if all segments were found while | 
| 429 |  |  |  |  |  |  | # searching for the first segment. | 
| 430 | 16 |  |  |  |  | 32 | my @segments = (); my @nearby_volume_files = (); | 
|  | 16 |  |  |  |  | 25 |  | 
| 431 | 16 |  |  |  |  | 25 | my $have_tail = 0; my $point = undef; | 
|  | 16 |  |  |  |  | 23 |  | 
| 432 | 16 |  |  |  |  | 21 | my @similar_volume_files_before = (); my @similar_volume_files_after = (); | 
|  | 16 |  |  |  |  | 22 |  | 
| 433 | 16 |  |  |  |  | 36 | foreach my $clue (@_) { | 
| 434 | 35 | 100 |  |  |  | 95 | if (UNIVERSAL::isa($clue, 'WARC::Record')) { | 
|  |  | 100 |  |  |  |  |  | 
| 435 | 21 | 100 |  |  |  | 58 | push @segments, $clue unless $clue == $first_segment; | 
| 436 |  |  |  |  |  |  | } elsif (ref $clue eq 'ARRAY') { | 
| 437 | 13 |  |  |  |  | 23 | my $tag = shift @$clue; | 
| 438 | 13 | 100 |  |  |  | 33 | if ($tag eq 'tail') { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
| 439 | 7 |  |  |  |  | 10 | $have_tail = 1; | 
| 440 | 7 |  |  |  |  | 14 | $point = shift @$clue; | 
| 441 |  |  |  |  |  |  | } elsif ($tag eq 'files_on_axes') { | 
| 442 | 3 |  |  |  |  | 32 | push @nearby_volume_files, map {[grep defined, @$_]} @$clue; | 
|  | 6 |  |  |  |  | 23 |  | 
| 443 |  |  |  |  |  |  | } elsif ($tag eq 'files_from_dir') { | 
| 444 | 2 |  |  |  |  | 5 | foreach (@$clue) { | 
| 445 | 6 |  |  |  |  | 8 | push @similar_volume_files_before, [grep defined, @{$_->[0]}]; | 
|  | 6 |  |  |  |  | 14 |  | 
| 446 | 6 |  |  |  |  | 7 | push @similar_volume_files_after,  [grep defined, @{$_->[1]}]; | 
|  | 6 |  |  |  |  | 13 |  | 
| 447 |  |  |  |  |  |  | } | 
| 448 | 1 |  |  |  |  | 9 | } else { die "unrecognized hint tag: $tag" } | 
| 449 | 1 |  |  |  |  | 8 | } else { die "unrecognized clue" } | 
| 450 | 33 |  |  |  |  | 65 | $clue = undef; | 
| 451 |  |  |  |  |  |  | } | 
| 452 | 14 |  |  |  |  | 62 | @similar_volume_files_before = grep {scalar @$_} @similar_volume_files_before; | 
|  | 6 |  |  |  |  | 12 |  | 
| 453 | 14 |  |  |  |  | 20 | @similar_volume_files_after  = grep {scalar @$_} @similar_volume_files_after; | 
|  | 6 |  |  |  |  | 10 |  | 
| 454 |  |  |  |  |  |  |  | 
| 455 | 14 |  |  |  |  | 19 | my @have_segments = (); my $total_segment_count = undef; | 
|  | 14 |  |  |  |  | 23 |  | 
| 456 | 14 |  |  |  |  | 38 | _add_segments $total_segment_count, @have_segments, @segments; | 
| 457 |  |  |  |  |  |  |  | 
| 458 | 14 | 100 |  |  |  | 27 | return @segments if _have_all_segments_p $total_segment_count, @have_segments; | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | # If we get to here, at least one segment was not found while searching | 
| 461 |  |  |  |  |  |  | # for the first segment, so we will need to search too. | 
| 462 | 12 |  |  |  |  | 20 | my @records = (); | 
| 463 |  |  |  |  |  |  |  | 
| 464 |  |  |  |  |  |  | # Pick up where find_first_segment left off... | 
| 465 | 12 | 100 |  |  |  | 33 | if ($point) { | 
|  |  | 100 |  |  |  |  |  | 
| 466 | 4 |  |  |  |  | 12 | (undef, @records) = _scan_volume $point->volume, $point->offset, undef, | 
| 467 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id]; | 
| 468 | 4 |  |  |  |  | 14 | _add_segments $total_segment_count, @have_segments, @records; | 
| 469 | 4 |  |  |  |  | 8 | push @segments, @records; | 
| 470 |  |  |  |  |  |  | } elsif (!$have_tail) { | 
| 471 |  |  |  |  |  |  | # The search may have begun with the first segment directly; ensure | 
| 472 |  |  |  |  |  |  | # that we scan the entire volume containing the first segment later. | 
| 473 | 7 |  |  |  |  | 18 | push @nearby_volume_files, [$first_segment->volume->filename]; | 
| 474 |  |  |  |  |  |  | } | 
| 475 | 12 |  |  |  |  | 110 | $Patience += $Effort * scalar @records; | 
| 476 |  |  |  |  |  |  | return @segments | 
| 477 | 12 | 100 | 100 |  |  | 22 | if (_have_all_segments_p $total_segment_count, @have_segments | 
| 478 |  |  |  |  |  |  | or $Effort > $Patience); | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | # Search for more volumes in a simple sequence... | 
| 481 |  |  |  |  |  |  | { | 
| 482 | 10 |  |  |  |  | 19 | my @simple_axes = _split_digit_spans $first_segment->volume->filename; | 
| 483 | 10 |  |  |  |  | 27 | my @nearby = _find_nearby_files 1, @simple_axes; | 
| 484 |  |  |  |  |  |  |  | 
| 485 |  |  |  |  |  |  | # Were more volumes found in the simple sequence search now or previously? | 
| 486 | 10 |  |  |  |  | 32 | foreach my $axis_files ((reverse @nearby), | 
| 487 |  |  |  |  |  |  | (reverse @nearby_volume_files)) { | 
| 488 |  |  |  |  |  |  | # Work backwards on the assumption that sequence numbers are nearer | 
| 489 |  |  |  |  |  |  | # to the end of the filename.  (Correct for Wget and Wpull.) | 
| 490 | 17 |  |  |  |  | 30 | foreach my $name (@$axis_files) { | 
| 491 | 27 |  |  |  |  | 33 | my $previousEffort = $Effort; | 
| 492 | 27 |  |  |  |  | 69 | my $volume = mount WARC::Volume ($name); | 
| 493 | 27 |  |  |  |  | 424 | (undef, @records) = _scan_volume $volume, 0, undef, | 
| 494 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id]; | 
| 495 | 27 |  |  |  |  | 74 | _add_segments $total_segment_count, @have_segments, @records; | 
| 496 | 27 |  |  |  |  | 41 | push @segments, @records; | 
| 497 | 27 |  |  |  |  | 43 | $Patience += ($Effort - $previousEffort) * scalar @records; | 
| 498 |  |  |  |  |  |  | return @segments | 
| 499 | 27 | 100 | 100 |  |  | 51 | if (_have_all_segments_p $total_segment_count, @have_segments | 
| 500 |  |  |  |  |  |  | or $Effort > $Patience); | 
| 501 |  |  |  |  |  |  | } | 
| 502 |  |  |  |  |  |  | } | 
| 503 |  |  |  |  |  |  | } | 
| 504 |  |  |  |  |  |  |  | 
| 505 |  |  |  |  |  |  | # Search for more volumes by directory scan... | 
| 506 |  |  |  |  |  |  | { | 
| 507 | 10 | 100 |  |  |  | 16 | unless (@similar_volume_files_before + @similar_volume_files_after) { | 
|  | 6 |  |  |  |  | 12 |  | 
|  | 6 |  |  |  |  | 15 |  | 
| 508 |  |  |  |  |  |  | # Unlike the simple sequence search, the directory scan finds files | 
| 509 |  |  |  |  |  |  | # in both directions from the starting point on all axes, but it may | 
| 510 |  |  |  |  |  |  | # not have been needed to find the first segment.  Do it now if not. | 
| 511 | 5 |  |  |  |  | 13 | my @nearby = _find_similar_files $first_segment->volume->filename; | 
| 512 | 5 |  |  |  |  | 13 | foreach (@nearby) { | 
| 513 | 13 |  |  |  |  | 17 | push @similar_volume_files_before, $_->[0]; | 
| 514 | 13 |  |  |  |  | 21 | push @similar_volume_files_after,  $_->[1]; | 
| 515 |  |  |  |  |  |  | } | 
| 516 |  |  |  |  |  |  | } | 
| 517 |  |  |  |  |  |  | # Any interesting records in volumes before the volume containing the | 
| 518 |  |  |  |  |  |  | # initial record were probably found while locating the first segment. | 
| 519 | 6 |  |  |  |  | 12 | foreach my $axis_files (@similar_volume_files_after, | 
| 520 |  |  |  |  |  |  | reverse @similar_volume_files_before) { | 
| 521 |  |  |  |  |  |  | # Work forwards on the assumption that sequence numbers are nearer to | 
| 522 |  |  |  |  |  |  | # the beginning of the filename.  (Correct in Internet Archive samples.) | 
| 523 | 23 |  |  |  |  | 35 | foreach my $fname (@$axis_files) { | 
| 524 | 33 |  |  |  |  | 42 | my $previousEffort = $Effort; | 
| 525 | 33 |  |  |  |  | 84 | my $volume = mount WARC::Volume ($fname); | 
| 526 | 33 |  |  |  |  | 486 | (undef, @records) = _scan_volume $volume, 0, undef, | 
| 527 |  |  |  |  |  |  | [WARC_Segment_Origin_ID => $origin_id]; | 
| 528 | 33 |  |  |  |  | 87 | _add_segments $total_segment_count, @have_segments, @records; | 
| 529 | 33 |  |  |  |  | 55 | push @segments, @records; | 
| 530 | 33 |  |  |  |  | 51 | $Patience += ($Effort - $previousEffort) * scalar @records; | 
| 531 |  |  |  |  |  |  | return @segments | 
| 532 | 33 | 100 | 100 |  |  | 60 | if (_have_all_segments_p $total_segment_count, @have_segments | 
| 533 |  |  |  |  |  |  | or $Effort > $Patience); | 
| 534 |  |  |  |  |  |  | } | 
| 535 |  |  |  |  |  |  | } | 
| 536 |  |  |  |  |  |  | } | 
| 537 |  |  |  |  |  |  |  | 
| 538 |  |  |  |  |  |  | # If we get to here, we have run out of places to look and the user will | 
| 539 |  |  |  |  |  |  | # need to build an index instead of relying on heuristics. | 
| 540 | 2 |  |  |  |  | 17 | return @segments; | 
| 541 |  |  |  |  |  |  | } | 
| 542 |  |  |  |  |  |  |  | 
| 543 |  |  |  |  |  |  | =back | 
| 544 |  |  |  |  |  |  |  | 
| 545 |  |  |  |  |  |  | =cut | 
| 546 |  |  |  |  |  |  |  | 
| 547 |  |  |  |  |  |  | 1; | 
| 548 |  |  |  |  |  |  | __END__ |