File Coverage

blib/lib/WARC/Index.pm
Criterion Covered Total %
statement 28 28 100.0
branch 6 6 100.0
condition n/a
subroutine 8 9 88.8
pod 6 6 100.0
total 48 49 97.9


line stmt bran cond sub pod time code
1             package WARC::Index; # -*- CPerl -*-
2              
3 27     27   66162 use strict;
  27         57  
  27         786  
4 27     27   127 use warnings;
  27         44  
  27         652  
5              
6 27     27   187 use Carp;
  27         58  
  27         11421  
7              
8             our @ISA = qw();
9              
10             require WARC; *WARC::Index::VERSION = \$WARC::VERSION;
11              
12             =head1 NAME
13              
14             WARC::Index - base class for WARC index classes
15              
16             =head1 SYNOPSIS
17              
18             use WARC::Index::File::CDX; # or ...
19             use WARC::Index::File::SDBM;
20             # or some other WARC::Index::File::* implementation
21              
22             $index = attach WARC::Index::File::CDX (...); # or ...
23             $index = attach WARC::Index::File::SDBM (...);
24              
25             $record = $index->search(url => $url, time => $when);
26             @records = $index->search(url => $url, time => $when);
27              
28             build WARC::Index::File::CDX (...); # or ...
29             build WARC::Index::File::SDBM (...);
30              
31             =head1 DESCRIPTION
32              
33             C<WARC::Index> is an abstract base class for indexes on WARC files and
34             WARC-alike files. This class establishes the expected interface and
35             provides a simple interface for building indexes.
36              
37             =head2 Methods
38              
39             =over
40              
41             =item $index = attach WARC::Index::File::* (...)
42              
43             Construct an index object using the indicated technology and whatever
44             parameters the index implementation needs.
45              
46             Typically, indexes are file-based and a single parameter is the name of an
47             index file which in turn contains the names of the indexed WARC files.
48              
49             =cut
50              
51             sub attach {
52 1     1 1 98 die __PACKAGE__." is an abstract base class and "
53             .(shift)." must override the 'attach' method"
54             }
55              
56             =item $yes_or_no = $index-E<gt>searchable( $key )
57              
58             Return true or false to reflect if the index can search for the requested
59             key. Indexes may be able to search for keys that are not present in
60             entries returned from those indexes.
61              
62             See the L<"Search Keys" section|WARC::Collection/"Search Keys"> of the
63             C<WARC::Collection> page for details on the implemented search keys.
64              
65             =cut
66              
67             sub searchable {
68 1     1 1 777 die __PACKAGE__." is an abstract base class and "
69             .(ref shift)." must override the 'searchable' method"
70             }
71              
72             =item $record = $index-E<gt>search( ... )
73              
74             =item @records = $index-E<gt>search( ... )
75              
76             Search an index for records matching parameters. The C<WARC::Collection>
77             class uses this method to search each index in a collection.
78              
79             If none of the requested search keys are searchable, returns an
80             undefined value in scalar context and the empty list in list context.
81              
82             The details of the parameters for this method are documented in the
83             L<"Search Keys" section|WARC::Collection/"Search Keys"> of the
84             C<WARC::Collection> page.
85              
86             =cut
87              
88             sub search {
89 1     1 1 429 die __PACKAGE__." is an abstract base class and "
90             .(ref shift)." must override the 'search' method"
91             }
92              
93             =item build WARC::Index::File::* (into =E<gt> $dest, from =E<gt> ...)
94              
95             =item build WARC::Index::File::* (from =E<gt> [...], into =E<gt> $dest)
96              
97             The C<WARC::Index> base class B<does> provide this method, however. The
98             C<build> method works by loading the corresponding index builder class and
99             driving the process or simply returning the newly-constructed object.
100              
101             The C<build> method itself handles the C<from> key for specifying the files
102             to index. The C<from> key can be given an array reference, after which
103             more key =E<gt> value pairs may follow, or can simply use the rest of the
104             argument list as its value.
105              
106             If the C<from> key is given, the C<build> method will read the indicated
107             files, construct an index, and return nothing. If the C<from> key is not
108             given, the C<build> method will construct and return an index builder.
109              
110             All index builders accept at least the C<into> key for specifying where to
111             store the index. See the documentation for WARC::Index::File::*::Builder
112             for more information.
113              
114             =cut
115              
116       0 1   sub build {
117             }
118              
119             =back
120              
121             =head2 Optional Methods
122              
123             Some index systems may also provide these methods:
124              
125             =over
126              
127             =item $entry = $index-E<gt>first_entry
128              
129             An index that has a sequential ordering may provide this method to obtain
130             the first entry in the index. Indexes that do not have a meaningful
131             sequence amongst their entries do not provide this method.
132              
133             =item $entry = $index-E<gt>entry_at( $position )
134              
135             An index that has a sequential ordering may provide this method to obtain
136             an entry at a specified position in the index. The exact format of the
137             position parameter is not specified in general, but should be a value
138             previously obtained from the C method on an entry from the
139             same index. Valid positions may be sparse.
140              
141             =back
142              
143             =head2 Index system registration
144              
145             The C<WARC::Index> package also provides a registry of loaded index
146             support. The C<register> function adds the calling package to the list.
147              
148             =cut
149              
150             # Array of arrays listing index implementations and filename patterns.
151             # Each element: [ Package => qr/pattern1/, qr/pattern2/, ... ]
152             our @Index_Handlers = ();
153              
154             =over
155              
156             =item WARC::Index::register( filename =E<gt> $filename_re )
157              
158             Add the calling package to an internal list of available index handlers.
159             The calling package must be a subclass of C<WARC::Index> or this function
160             will croak().
161              
162             The C<filename> key indicates that the calling package expects to handle
163             index files with names matching the provided regex.
164              
165             =cut
166              
167             sub register {
168 13     13 1 3241 my %opt = @_;
169 13         37 my $caller = scalar caller;
170              
171 13 100       382 croak "WARC::Index implementations must subclass WARC::Index"
172             unless $caller->isa('WARC::Index');
173              
174             croak "WARC::Index implementations must handle a filename pattern"
175 12 100       290 unless $opt{filename};
176              
177 10         28 foreach my $row (grep {$_->[0] eq $caller} @Index_Handlers) {
  8         23  
178 2         4 push @$row, $opt{filename}; # add pattern to existing row
179             return # ensure that there will be at most one row per package
180 2         6 }
181 8         25 push @Index_Handlers, [$caller => $opt{filename}];
182              
183             return # nothing
184 8         25 }
185              
186             =item WARC::Index::find_handler( $filename )
187              
188             Return the registered handler for $filename or undef if none match. If
189             multiple handlers match, which one is returned is unspecified.
190              
191             =cut
192              
193             sub find_handler {
194 20     20 1 948 my $filename = shift;
195 20         43 my @match = grep {grep {$filename =~ $_} @$_[1..$#$_]} @Index_Handlers;
  43         96  
  55         272  
196 20 100       64 return undef unless @match;
197 18         68 return $match[0][0];
198             }
199              
200             =back
201              
202             =cut
203              
204             1;
205             __END__