File Coverage

blib/lib/WARC/Index/File/CDX.pm
Criterion Covered Total %
statement 124 124 100.0
branch 53 54 98.1
condition 11 12 100.0
subroutine 15 15 100.0
pod 5 5 100.0
total 208 210 99.5


line stmt bran cond sub pod time code
1             package WARC::Index::File::CDX; # -*- CPerl -*-
2              
3 1     1   70375 use strict;
  1         13  
  1         32  
4 1     1   5 use warnings;
  1         2  
  1         68  
5              
6             require WARC::Index;
7             our @ISA = qw(WARC::Index);
8              
9             require WARC; *WARC::Index::File::CDX::VERSION = \$WARC::VERSION;
10              
11 1     1   6 use Carp;
  1         1  
  1         69  
12 1     1   7 use File::Spec;
  1         2  
  1         25  
13 1     1   21 use Fcntl 'SEEK_SET';
  1         3  
  1         2479  
14             require File::Spec::Unix;
15              
16             require WARC::Date;
17             require WARC::Index::File::CDX::Entry;
18             require WARC::Volume;
19              
20             WARC::Index::register(filename => qr/[.]cdx$/);
21              
22             our %CDX_Field_Index_Key_Map =
23             (a => 'url', b => 'time', u => 'record_id');
24              
25             our %CDX_Import_Map =
26             (time => sub {
27             my $cdx_date = shift;
28             croak "invalid CDX datestamp"
29             unless $cdx_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/;
30             return WARC::Date->from_string(sprintf('%04d-%02d-%02dT%02d:%02d:%02dZ',
31             $1, $2, $3, $4, $5, $6))},
32             );
33              
34             # This implementation uses a hash as the underlying structure.
35             # Keys defined by this class:
36             #
37             # cdx_file
38             # file name of CDX file
39             # delimiter
40             # delimiter character used in CDX file
41             # fields
42             # array of CDX field codes, in order
43             # field_index
44             # hash mapping CDX field code letter => position
45             # key_index
46             # hash mapping WARC::Index search keys => position
47             # volumes
48             # hash mapping WARC file names => WARC::Volume objects
49              
50             sub _dbg_dump {
51 6     6   44 my $self = shift;
52              
53 6         30 my $out = __PACKAGE__ . "\n in " . $self->{cdx_file} . "\n";
54             $out .= sprintf ' delimiter ASCII %d/%d ',
55 6         53 (ord $self->{delimiter})/16, (ord $self->{delimiter})%16;
56 6         16 $out .= ' '.join(' ', 'CDX', @{$self->{fields}})."\n";
  6         21  
57              
58 6         32 return $out;
59             }
60              
61             sub attach {
62 13     13 1 7271 my $class = shift;
63 13         26 my $cdx_file = shift;
64              
65 13         50 local $/ = "\012"; # ASCII 0/10 LF
66 13 100       782 open my $cdx, '<', $cdx_file or croak "$cdx_file: $!";
67 12         84 binmode $cdx, ':raw';
68              
69 12         190 my $header = <$cdx>;
70 12 100       224 croak "could not read CDX header line from $cdx_file: $!"
71             unless defined $header;
72              
73 11         85 chomp $header;
74 11         46 my $delimiter = substr $header, 0, 1;
75 11 100 100     513 croak "no CDX marker found in $cdx_file"
76             unless 'CDX' eq substr $header, 1, 3
77             and $delimiter eq substr $header, 4, 1;
78              
79 9         79 my @fields = split /\Q$delimiter/, substr $header, 5;
80              
81 9         32 my %field_index = map {$fields[$_] => $_} 0 .. $#fields;
  120         254  
82             my %key_index =
83 10         31 map {$CDX_Field_Index_Key_Map{$fields[$_]} => $_}
84 9         72 grep defined $CDX_Field_Index_Key_Map{$fields[$_]}, 0 .. $#fields;
85              
86             croak "CDX file $cdx_file does not index WARC file name"
87 9 100       216 unless defined $field_index{g};
88             croak "CDX file $cdx_file does not index record offset"
89 8 100 100     237 unless defined $field_index{v} or defined $field_index{V};
90              
91 7         217 bless {cdx_file => $cdx_file, delimiter => $delimiter, fields => \@fields,
92             field_index => \%field_index, key_index => \%key_index}, $class
93             }
94              
95             sub _get_volume {
96 83     83   118 my $self = shift;
97 83         123 my $name = shift;
98              
99 83 100       394 return $self->{volumes}{$name} if defined $self->{volumes}{$name};
100              
101             # otherwise...
102 6         125 my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{cdx_file});
103 6         48 my @cdx_dirs = File::Spec->splitdir($cdx_dirs);
104 6         61 my ($wvol, $warc_dirs, $warc_file) = File::Spec::Unix->splitpath($name);
105 6         24 my @warc_dirs = File::Spec::Unix->splitdir($warc_dirs);
106 6         90 my $warcfilename =
107             File::Spec->catpath($vol, File::Spec->catdir(@cdx_dirs,
108             @warc_dirs), $warc_file);
109 6         39 return $self->{volumes}{$name} = mount WARC::Volume ($warcfilename);
110             }
111              
112             sub _parse_cdx_entry {
113 95     95   187 my $self = shift;
114 95         149 my $pos = shift;
115 95         164 my $entry = shift;
116             # uncoverable condition right
117 95   66     311 my $entry_length = $entry && length $entry;
118              
119 95 100       273 return undef unless $entry_length; # as occurs at end-of-file
120              
121 83         129 chomp $entry;
122 83         377 my @fields = split /\Q$self->{delimiter}/, $entry;
123 83         195 my $volname = $fields[$self->{field_index}{g}];
124             my %entry =
125             ( _index => $self, _entry_offset => $pos, _entry_length => $entry_length,
126             _g__volume => $self->_get_volume($volname),
127             _Vv__record_offset =>
128             $fields[$self->{field_index}{($volname =~ m/[.]w?arc$/ ? 'v' : 'V')}],
129             map {$_ => (defined $CDX_Import_Map{$_}
130             ? $CDX_Import_Map{$_}->($fields[$self->{key_index}{$_}])
131 83 100       181 : $fields[$self->{key_index}{$_}])} keys %{$self->{key_index}}
  71 100       366  
  83         280  
132             );
133              
134 83         684 bless \%entry, (ref $self).'::Entry';
135             }
136              
137             sub searchable {
138 16     16 1 1260 my $self = shift;
139 16         27 my $key = shift;
140              
141 16 100       60 return defined $self->{key_index}{url} if $key eq 'url_prefix';
142 12         61 return defined $self->{key_index}{$key};
143             }
144              
145             sub _search_all {
146 14     14   24 my $self = shift;
147              
148 14         51 local $/ = "\012"; # ASCII 0/10 LF
149 14         25 my @results = ();
150              
151 14 100       645 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
152 13         89 binmode $cdx, ':raw';
153              
154 13         149 my $header = <$cdx>; # skip header to reach first entry
155 13         42 my $offset = tell $cdx;
156 13         56 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
157 13 100       58 return () unless defined $entry->distance(@_);
158              
159 7         13 while (defined $entry) {
160 28 100       67 push @results, $entry unless 0 > $entry->distance(@_);
161 28         48 $offset = tell $cdx; # ... and advance ...
162 28         145 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
163             }
164              
165 7         125 return @results;
166             }
167              
168             sub _search_best_match {
169 18     18   30 my $self = shift;
170              
171 18         64 local $/ = "\012"; # ASCII 0/10 LF
172 18         28 my $result = undef;
173 18         28 my $result_distance = -1;
174              
175 18 100       876 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
176 17         110 binmode $cdx, ':raw';
177              
178 17         206 my $header = <$cdx>; # skip header to reach first entry
179 17         53 my $offset = tell $cdx;
180 17         69 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
181 17 100       65 return undef unless defined $entry->distance(@_);
182              
183 11         22 while (defined $entry) {
184 31         79 my $distance = $entry->distance(@_);
185 31 100       65 unless (0 > $distance) {
186 19 100 100     57 if ($result_distance < 0 # first match found
187             or $distance < $result_distance) # or better match found
188 16         39 { $result = $entry; $result_distance = $distance }
  16         22  
189             }
190 31 100       231 return $result if $result_distance == 0; # no better match possible
191 22         38 $offset = tell $cdx; # ... and advance ...
192 22         90 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
193             }
194              
195 2         34 return $result;
196             }
197              
198             sub search {
199 35     35 1 6547 my $self = shift;
200              
201 35 100       83 unless (defined wantarray)
202 1         207 { carp "calling 'search' method in void context"; return }
  1         82  
203              
204 34 100       192 croak "no arguments given to 'search' method"
205             unless scalar @_;
206 33 100       187 croak "odd number of arguments given to 'search' method"
207             if scalar @_ % 2;
208              
209 32 100       66 if (wantarray) { return $self->_search_all(@_) }
  14         34  
210 18         46 else { return $self->_search_best_match(@_) }
211             }
212              
213             sub first_entry {
214 4     4 1 431 my $self = shift;
215              
216 4         18 local $/ = "\012"; # ASCII 0/10 LF
217              
218 4 100       255 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
219 3         20 binmode $cdx, ':raw';
220              
221 3         37 my $header = <$cdx>; # skip header to reach first entry
222 3         11 my $offset = tell $cdx;
223 3         17 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
224             }
225              
226             sub entry_at {
227 15     15 1 1105 my $self = shift;
228 15         21 my $offset = shift;
229              
230 15         57 local $/ = "\012"; # ASCII 0/10 LF
231              
232 15 100       701 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
233 14         91 binmode $cdx, ':raw';
234              
235             # one octet before requested position must be an end-of-line marker
236 14 100       242 seek $cdx, $offset - 1, SEEK_SET or croak "seek $self->{cdx_file}: $!";
237 13 50       26 my $eol; defined(read $cdx, $eol, 1) or croak "read $self->{cdx_file}: $!";
  13         188  
238 13 100       246 croak "offset $offset in $self->{cdx_file} not a record boundary"
239             unless $eol eq $/;
240              
241 12         77 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
242             }
243              
244             1;
245             __END__