File Coverage

blib/lib/WARC/Index.pm
Criterion Covered Total %
statement 54 54 100.0
branch 21 22 95.4
condition 3 3 100.0
subroutine 10 10 100.0
pod 6 6 100.0
total 94 95 98.9


line stmt bran cond sub pod time code
1             package WARC::Index; # -*- CPerl -*-
2              
3 29     29   58200 use strict;
  29         64  
  29         792  
4 29     29   126 use warnings;
  29         50  
  29         644  
5              
6 29     29   142 use Carp;
  29         47  
  29         8832  
7              
8             our @ISA = qw();
9              
10             require WARC; *WARC::Index::VERSION = \$WARC::VERSION;
11              
12             =head1 NAME
13              
14             WARC::Index - base class for WARC index classes
15              
16             =head1 SYNOPSIS
17              
18             use WARC::Index::File::CDX; # or ...
19             use WARC::Index::File::SDBM;
20             # or some other WARC::Index::File::* implementation
21              
22             $index = attach WARC::Index::File::CDX (...); # or ...
23             $index = attach WARC::Index::File::SDBM (...);
24              
25             $record = $index->search(url => $url, time => $when);
26             @records = $index->search(url => $url, time => $when);
27              
28             build WARC::Index::File::CDX (...); # or ...
29             build WARC::Index::File::SDBM (...);
30              
31             =head1 DESCRIPTION
32              
33             C<WARC::Index> is an abstract base class for indexes on WARC files and
34             WARC-alike files. This class establishes the expected interface and
35             provides a simple interface for building indexes.
36              
37             =head2 Methods
38              
39             =over
40              
41             =item $index = attach WARC::Index::File::* (...)
42              
43             Construct an index object using the indicated technology and whatever
44             parameters the index implementation needs.
45              
46             Typically, indexes are file-based and a single parameter is the name of an
47             index file which in turn contains the names of the indexed WARC files.
48              
49             =cut
50              
51             sub attach {
52 1     1 1 81 die __PACKAGE__." is an abstract base class and "
53             .(shift)." must override the 'attach' method"
54             }
55              
56             =item $yes_or_no = $index-E<gt>searchable( $key )
57              
58             Return true or false to reflect if the index can search for the requested
59             key. Indexes may be able to search for keys that are not present in
60             entries returned from those indexes.
61              
62             See the L<"Search Keys" section|WARC::Collection/"Search Keys"> of the
63             C<WARC::Collection> page for details on the implemented search keys.
64              
65             =cut
66              
67             sub searchable {
68 1     1 1 667 die __PACKAGE__." is an abstract base class and "
69             .(ref shift)." must override the 'searchable' method"
70             }
71              
72             =item $record = $index-E<gt>search( ... )
73              
74             =item @records = $index-E<gt>search( ... )
75              
76             Search an index for records matching parameters. The C<WARC::Collection>
77             class uses this method to search each index in a collection.
78              
79             If none of the requested search keys are searchable, returns an
80             undefined value in scalar context and the empty list in list context.
81              
82             The details of the parameters for this method are documented in the
83             L<"Search Keys" section|WARC::Collection/"Search Keys"> of the
84             C<WARC::Collection> page.
85              
86             =cut
87              
88             sub search {
89 1     1 1 357 die __PACKAGE__." is an abstract base class and "
90             .(ref shift)." must override the 'search' method"
91             }
92              
93             =item build WARC::Index::File::* (into =E<gt> $dest, from =E<gt> ...)
94              
95             =item build WARC::Index::File::* (from =E<gt> [...], into =E<gt> $dest)
96              
97             The C<WARC::Index> base class B<does> provide this method, however. The
98             C<build> method works by loading the corresponding index builder class and
99             driving the process or simply returning the newly-constructed object.
100              
101             The C<build> method itself handles the C<from> key for specifying the files
102             to index. The C<from> key can be given an array reference, after which
103             more key =E<gt> value pairs may follow, or can simply use the rest of the
104             argument list as its value.
105              
106             If the C<from> key is given, the C<build> method will read the indicated
107             files, construct an index, and return nothing. If the C<from> key is not
108             given, the C<build> method will construct and return an index builder.
109              
110             All index builders accept at least the C<into> key for specifying where to
111             store the index. See the documentation for WARC::Index::File::*::Builder
112             for more information.
113              
114             =cut
115              
116             sub build {
117 7     7 1 5928 my $class = shift;
118              
119 7 100       118 croak "'build' is a class method"
120             if ref $class;
121 6 100       120 croak "no arguments given to 'build' class method"
122             unless scalar @_;
123              
124 5         7 my @args = (); my $from = undef;
  5         6  
125 5         12 while (@_) {
126 11         13 my $key = shift;
127 11 100       19 if ($key eq 'from') {
128 3 100       10 if (UNIVERSAL::isa($_[0], 'ARRAY')) { $from = shift }
  1         2  
129 2         8 else { $from = [splice @_] }
130 8         14 } else { push @args, $key, shift }
131             }
132              
133 5 100 100     132 croak "empty list of index sources given"
134             if defined $from && scalar @$from == 0;
135              
136 4         6 my $bclass = $class . q{::Builder};
137             {
138 29     29   206 no strict 'refs';
  29         44  
  29         9542  
  4         7  
139 4 100       5 unless (exists ${$class.'::'}{'Builder::'})
  4         17  
140 1 50       49 { eval q{require }.$bclass; die $@ if $@ }
  1         10  
141             }
142              
143 3         10 my $ob = _new $bclass (@args);
144              
145 3 100       27 return $ob unless defined $from;
146              
147 2         5 $ob->add(@$from);
148 2         14 return ();
149             }
150              
151             =back
152              
153             =head2 Optional Methods
154              
155             Some index systems may also provide these methods:
156              
157             =over
158              
159             =item $entry = $index-Efirst_entry
160              
161             An index that has a sequential ordering may provide this method to obtain
162             the first entry in the index. Indexes that do not have a meaningful
163             sequence amongst their entries do not provide this method.
164              
165             =item $entry = $index-Eentry_at( $position )
166              
167             An index that has a sequential ordering may provide this method to obtain
168             an entry at a specified position in the index. The exact format of the
169             position parameter is not specified in general, but should be a value
170             previously obtained from the C method on an entry from the
171             same index. Valid positions may be sparse.
172              
173             =back
174              
175             =head2 Index system registration
176              
177             The C package also provides a registry of loaded index
178             support. The C function adds the calling package to the list.
179              
180             =cut
181              
182             # Array of arrays listing index implementations and filename patterns.
183             # Each element: [ Package => qr/pattern1/, qr/pattern2/, ... ]
184             our @Index_Handlers = ();
185              
186             =over
187              
188             =item WARC::Index::register( filename =E $filename_re )
189              
190             Add the calling package to an internal list of available index handlers.
191             The calling package must be a subclass of C or this function
192             will croak().
193              
194             The C key indicates that the calling package expects to handle
195             index files with names matching the provided regex.
196              
197             =cut
198              
199             sub register {
200 13     13 1 2669 my %opt = @_;
201 13         31 my $caller = scalar caller;
202              
203 13 100       326 croak "WARC::Index implementations must subclass WARC::Index"
204             unless $caller->isa('WARC::Index');
205              
206             croak "WARC::Index implementations must handle a filename pattern"
207 12 100       242 unless $opt{filename};
208              
209 10         27 foreach my $row (grep {$_->[0] eq $caller} @Index_Handlers) {
  8         19  
210 2         4 push @$row, $opt{filename}; # add pattern to existing row
211             return # ensure that there will be at most one row per package
212 2         5 }
213 8         22 push @Index_Handlers, [$caller => $opt{filename}];
214              
215             return # nothing
216 8         23 }
217              
218             =item WARC::Index::find_handler( $filename )
219              
220             Return the registered handler for $filename or undef if none match. If
221             multiple handlers match, which one is returned is unspecified.
222              
223             =cut
224              
225             sub find_handler {
226 20     20 1 697 my $filename = shift;
227 20         40 my @match = grep {grep {$filename =~ $_} @$_[1..$#$_]} @Index_Handlers;
  43         87  
  55         243  
228 20 100       57 return undef unless @match;
229 18         58 return $match[0][0];
230             }
231              
232             =back
233              
234             =cut
235              
236             1;
237             __END__