File Coverage

blib/lib/WARC/Index/File/CDX.pm
Criterion Covered Total %
statement 124 124 100.0
branch 53 54 98.1
condition 11 12 100.0
subroutine 15 15 100.0
pod 5 5 100.0
total 208 210 99.5


line stmt bran cond sub pod time code
1             package WARC::Index::File::CDX; # -*- CPerl -*-
2              
3 1     1   61498 use strict;
  1         10  
  1         26  
4 1     1   4 use warnings;
  1         2  
  1         62  
5              
6             require WARC::Index;
7             our @ISA = qw(WARC::Index);
8              
9             require WARC; *WARC::Index::File::CDX::VERSION = \$WARC::VERSION;
10              
11 1     1   7 use Carp;
  1         1  
  1         48  
12 1     1   13 use File::Spec;
  1         2  
  1         28  
13 1     1   5 use Fcntl 'SEEK_SET';
  1         2  
  1         1605  
14             require File::Spec::Unix;
15              
16             require WARC::Date;
17             require WARC::Index::File::CDX::Entry;
18             require WARC::Volume;
19              
20             WARC::Index::register(filename => qr/[.]cdx$/);
21              
22             our %CDX_Field_Index_Key_Map =
23             (a => 'url', b => 'time', u => 'record_id');
24              
25             our %CDX_Import_Map =
26             (time => sub {
27             my $cdx_date = shift;
28             croak "invalid CDX datestamp"
29             unless $cdx_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/;
30             return WARC::Date->from_string(sprintf('%04d-%02d-%02dT%02d:%02d:%02dZ',
31             $1, $2, $3, $4, $5, $6))},
32             );
33              
34             # This implementation uses a hash as the underlying structure.
35             # Keys defined by this class:
36             #
37             # cdx_file
38             # file name of CDX file
39             # delimiter
40             # delimiter character used in CDX file
41             # fields
42             # array of CDX field codes, in order
43             # field_index
44             # hash mapping CDX field code letter => position
45             # key_index
46             # hash mapping WARC::Index search keys => position
47             # volumes
48             # hash mapping WARC file names => WARC::Volume objects
49              
50             sub _dbg_dump {
51 6     6   42 my $self = shift;
52              
53 6         31 my $out = __PACKAGE__ . "\n in " . $self->{cdx_file} . "\n";
54             $out .= sprintf ' delimiter ASCII %d/%d ',
55 6         52 (ord $self->{delimiter})/16, (ord $self->{delimiter})%16;
56 6         14 $out .= ' '.join(' ', 'CDX', @{$self->{fields}})."\n";
  6         23  
57              
58 6         31 return $out;
59             }
60              
61             sub attach {
62 13     13 1 5370 my $class = shift;
63 13         75 my $cdx_file = shift;
64              
65 13         48 local $/ = "\012"; # ASCII 0/10 LF
66 13 100       724 open my $cdx, '<', $cdx_file or croak "$cdx_file: $!";
67 12         78 binmode $cdx, ':raw';
68              
69 12         167 my $header = <$cdx>;
70 12 100       225 croak "could not read CDX header line from $cdx_file: $!"
71             unless defined $header;
72              
73 11         22 chomp $header;
74 11         30 my $delimiter = substr $header, 0, 1;
75 11 100 100     403 croak "no CDX marker found in $cdx_file"
76             unless 'CDX' eq substr $header, 1, 3
77             and $delimiter eq substr $header, 4, 1;
78              
79 9         76 my @fields = split /\Q$delimiter/, substr $header, 5;
80              
81 9         33 my %field_index = map {$fields[$_] => $_} 0 .. $#fields;
  120         233  
82             my %key_index =
83 10         26 map {$CDX_Field_Index_Key_Map{$fields[$_]} => $_}
84 9         66 grep defined $CDX_Field_Index_Key_Map{$fields[$_]}, 0 .. $#fields;
85              
86             croak "CDX file $cdx_file does not index WARC file name"
87 9 100       192 unless defined $field_index{g};
88             croak "CDX file $cdx_file does not index record offset"
89 8 100 100     224 unless defined $field_index{v} or defined $field_index{V};
90              
91 7         166 bless {cdx_file => $cdx_file, delimiter => $delimiter, fields => \@fields,
92             field_index => \%field_index, key_index => \%key_index}, $class
93             }
94              
95             sub _get_volume {
96 83     83   97 my $self = shift;
97 83         93 my $name = shift;
98              
99 83 100       340 return $self->{volumes}{$name} if defined $self->{volumes}{$name};
100              
101             # otherwise...
102 6         116 my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{cdx_file});
103 6         43 my @cdx_dirs = File::Spec->splitdir($cdx_dirs);
104 6         51 my ($wvol, $warc_dirs, $warc_file) = File::Spec::Unix->splitpath($name);
105 6         23 my @warc_dirs = File::Spec::Unix->splitdir($warc_dirs);
106 6         97 my $warcfilename =
107             File::Spec->catpath($vol, File::Spec->catdir(@cdx_dirs,
108             @warc_dirs), $warc_file);
109 6         41 return $self->{volumes}{$name} = mount WARC::Volume ($warcfilename);
110             }
111              
112             sub _parse_cdx_entry {
113 95     95   150 my $self = shift;
114 95         117 my $pos = shift;
115 95         133 my $entry = shift;
116             # uncoverable condition right
117 95   66     268 my $entry_length = $entry && length $entry;
118              
119 95 100       217 return undef unless $entry_length; # as occurs at end-of-file
120              
121 83         118 chomp $entry;
122 83         329 my @fields = split /\Q$self->{delimiter}/, $entry;
123 83         192 my $volname = $fields[$self->{field_index}{g}];
124             my %entry =
125             ( _index => $self, _entry_offset => $pos, _entry_length => $entry_length,
126             _g__volume => $self->_get_volume($volname),
127             _Vv__record_offset =>
128             $fields[$self->{field_index}{($volname =~ m/[.]w?arc$/ ? 'v' : 'V')}],
129             map {$_ => (defined $CDX_Import_Map{$_}
130             ? $CDX_Import_Map{$_}->($fields[$self->{key_index}{$_}])
131 83 100       185 : $fields[$self->{key_index}{$_}])} keys %{$self->{key_index}}
  71 100       310  
  83         237  
132             );
133              
134 83         605 bless \%entry, (ref $self).'::Entry';
135             }
136              
137             sub searchable {
138 16     16 1 1062 my $self = shift;
139 16         23 my $key = shift;
140              
141 16 100       47 return defined $self->{key_index}{url} if $key eq 'url_prefix';
142 12         52 return defined $self->{key_index}{$key};
143             }
144              
145             sub _search_all {
146 14     14   21 my $self = shift;
147              
148 14         50 local $/ = "\012"; # ASCII 0/10 LF
149 14         20 my @results = ();
150              
151 14 100       592 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
152 13         81 binmode $cdx, ':raw';
153              
154 13         138 my $header = <$cdx>; # skip header to reach first entry
155 13         33 my $offset = tell $cdx;
156 13         48 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
157 13 100       46 return () unless defined $entry->distance(@_);
158              
159 7         17 while (defined $entry) {
160 28 100       56 push @results, $entry unless 0 > $entry->distance(@_);
161 28         42 $offset = tell $cdx; # ... and advance ...
162 28         127 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
163             }
164              
165 7         109 return @results;
166             }
167              
168             sub _search_best_match {
169 18     18   23 my $self = shift;
170              
171 18         57 local $/ = "\012"; # ASCII 0/10 LF
172 18         23 my $result = undef;
173 18         20 my $result_distance = -1;
174              
175 18 100       783 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
176 17         99 binmode $cdx, ':raw';
177              
178 17         174 my $header = <$cdx>; # skip header to reach first entry
179 17         49 my $offset = tell $cdx;
180 17         56 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
181 17 100       51 return undef unless defined $entry->distance(@_);
182              
183 11         21 while (defined $entry) {
184 31         68 my $distance = $entry->distance(@_);
185 31 100       55 unless (0 > $distance) {
186 19 100 100     53 if ($result_distance < 0 # first match found
187             or $distance < $result_distance) # or better match found
188 16         27 { $result = $entry; $result_distance = $distance }
  16         19  
189             }
190 31 100       209 return $result if $result_distance == 0; # no better match possible
191 22         32 $offset = tell $cdx; # ... and advance ...
192 22         81 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
193             }
194              
195 2         32 return $result;
196             }
197              
198             sub search {
199 35     35 1 5418 my $self = shift;
200              
201 35 100       74 unless (defined wantarray)
202 1         157 { carp "calling 'search' method in void context"; return }
  1         65  
203              
204 34 100       161 croak "no arguments given to 'search' method"
205             unless scalar @_;
206 33 100       158 croak "odd number of arguments given to 'search' method"
207             if scalar @_ % 2;
208              
209 32 100       60 if (wantarray) { return $self->_search_all(@_) }
  14         29  
210 18         36 else { return $self->_search_best_match(@_) }
211             }
212              
213             sub first_entry {
214 4     4 1 394 my $self = shift;
215              
216 4         16 local $/ = "\012"; # ASCII 0/10 LF
217              
218 4 100       227 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
219 3         20 binmode $cdx, ':raw';
220              
221 3         31 my $header = <$cdx>; # skip header to reach first entry
222 3         10 my $offset = tell $cdx;
223 3         13 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
224             }
225              
226             sub entry_at {
227 15     15 1 937 my $self = shift;
228 15         17 my $offset = shift;
229              
230 15         57 local $/ = "\012"; # ASCII 0/10 LF
231              
232 15 100       753 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
233 14         91 binmode $cdx, ':raw';
234              
235             # one octet before requested position must be an end-of-line marker
236 14 100       231 seek $cdx, $offset - 1, SEEK_SET or croak "seek $self->{cdx_file}: $!";
237 13 50       28 my $eol; defined(read $cdx, $eol, 1) or croak "read $self->{cdx_file}: $!";
  13         186  
238 13 100       236 croak "offset $offset in $self->{cdx_file} not a record boundary"
239             unless $eol eq $/;
240              
241 12         77 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
242             }
243              
244             1;
245             __END__