File Coverage

blib/lib/WARC/Collection.pm
Criterion Covered Total %
statement 70 70 100.0
branch 24 24 100.0
condition 6 6 100.0
subroutine 6 6 100.0
pod 3 3 100.0
total 109 109 100.0


line stmt bran cond sub pod time code
1             package WARC::Collection; # -*- CPerl -*-
2              
3 27     27   66963 use strict;
  27         60  
  27         795  
4 27     27   127 use warnings;
  27         51  
  27         1532  
5              
6             our @ISA = qw();
7              
8             require WARC; *WARC::Collection::VERSION = \$WARC::VERSION;
9              
10 27     27   185 use Carp;
  27         49  
  27         20457  
11              
12             require WARC::Index;
13             require WARC::Index::Entries;
14              
15             =head1 NAME
16              
17             WARC::Collection - Interface to a group of WARC files
18              
19             =head1 SYNOPSIS
20              
21             use WARC::Collection;
22              
23             $collection = assemble WARC::Collection ($index_1, $index_2, ...);
24             $collection = assemble WARC::Collection from => ($index_1, ...);
25              
26             $yes_or_no = $collection->searchable( $key );
27              
28             $record = $collection->search(url => $url, time => $when);
29             @records = $collection->search(url => $url, time => $when);
30              
31             =cut
32              
33             # This implementation uses a hash as the underlying structure.
34             # Keys defined by this class:
35             #
36             # indexes
37             # Array of indexes used for this collection.
38              
39             =head1 DESCRIPTION
40              
41             The C class is the primary means by which user code is
42             expected to use the WARC library. This class uses indexes to efficiently
43             search for records in one or more WARC files.
44              
45             =head2 Search Keys
46              
47             The C method accepts a list of parameters as I =E I
48             pairs with each pair narrowing the search, sorting the results, or both,
49             indicated in the following list with S<"C<[N ]>">, S<"C<[ S]>">, or "C<[NS]>",
50             respectively.
51              
52             Supplying an array reference as a I indicates a search where any of
53             the values in the array are acceptable. This does not affect sorting.
54              
55             The same search keys documented here are used for searching indexes, since
56             C is a wrapper around one or more indexes, but index
57             support modules do not sort their results. Only C sorts
58             the returned entries, so keys listed below as "sort-only" are ignored by
59             the index support modules.
60              
61             The keys supported are:
62              
63             =over
64              
65             =item C<[N ]> url
66              
67             An exact match for a URL.
68              
69             =item C<[NS]> url_prefix
70              
71             A prefix match for a URL. Prefers records with shorter URLs.
72              
73             =item C<[ S]> time
74              
75             Prefer records collected nearer to the requested time.
76              
77             =item C<[N ]> record_id
78              
79             An exact match for a (presumably unique) WARC-Record-ID.
80              
81             =item C<[N ]> segment_origin_id
82              
83             Exact match for continuation records for a WARC-Record-ID that identifies a
84             logical record stored using WARC record segmentation. Searching on this
85             key returns only the continuation records.
86              
87             =back
88              
89             =for comment
90             Matching these keys is implemented in WARC::Index::Entry::_distance_for_item
91             via %_distance_value_map and in various index support modules.
92              
93             =head2 Methods
94              
95             =over
96              
97             =item $collection = assemble WARC::Collection ($index_1, $index_2, ...);
98              
99             =item $collection = assemble WARC::Collection from =E ($index_1, ...);
100              
101             Assemble a collection of WARC files from one index or multiple indexes,
102             specified either as objects derived from C or filenames.
103              
104             While multiple indexes can be used in a collection, note that searching a
105             collection requires individually searching every index in the collection.
106              
107             =cut
108              
109             sub assemble {
110 20     20 1 7214 my $class = shift;
111 20 100 100     126 shift if scalar @_ and $_[0] eq 'from'; # discard optional noise word
112              
113 20 100       377 carp "assembling empty collection" unless scalar @_;
114              
115 20         52 my @indexes = ();
116              
117 20         45 while (@_) {
118 23         94 my $index = shift;
119              
120 23 100       111 if (UNIVERSAL::isa($index, 'WARC::Index'))
121 10         29 { push @indexes, $index } # add index object to list
122             else { # or assume filename and find an index handler
123 13         37 my $isys = WARC::Index::find_handler($index);
124 13 100       174 croak "no known handler for index '$index'" unless $isys;
125 12         38 push @indexes, (attach $isys $index);
126             }
127             }
128              
129 19         222 bless { indexes => \@indexes }, $class
130             }
131              
132             =item $yes_or_no = $collection-Esearchable( $key )
133              
134             Return true or false to reflect if any index in the collection can search
135             for the requested key.
136              
137             =cut
138              
139             sub searchable {
140 15     15 1 387 my $self = shift;
141 15         20 my $key = shift;
142              
143 15         24 foreach my $index (@{$self->{indexes}})
  15         44  
144 18 100       58 { return 1 if $index->searchable($key) }
145              
146             # none of the indexes recognize $key
147 4         29 return 0;
148             }
149              
150             =item $record = $collection-Esearch( ... )
151              
152             =item @records = $collection-Esearch( ... )
153              
154             Search the indexes for records matching the parameters and return the best
155             match in scalar context or a list of all matches in list context. The
156             returned values are C objects.
157              
158             See L for more information about the parameters.
159              
160             =cut
161              
162             sub search {
163 66     66 1 13981 my $self = shift;
164              
165 66 100       159 unless (defined wantarray)
166 1         120 { carp "calling 'search' method in void context"; return }
  1         62  
167              
168 65 100       227 croak "no arguments given to 'search' method"
169             unless scalar @_;
170 64 100       244 croak "odd number of arguments given to 'search' method"
171             if scalar @_ % 2;
172              
173             # collect all matches from all indexes
174 63         101 my %results = (); # map: tag => array of index entries for record
175 63 100       449 if (grep UNIVERSAL::isa($_, 'ARRAY'), @_) {
176             # at least one parameter is an arrayref; perform nested loop join
177 19         89 my @step = @_; my @state = (); my @varpos = ();
  19         35  
  19         28  
178 19         66 for (my $i = 1; $i <= $#step; $i += 2) {
179 29 100       89 next unless UNIVERSAL::isa($step[$i], 'ARRAY');
180 20         34 push @state, 0;
181 20         39 push @varpos, $i;
182 20         95 $step[$i] = $step[$i]->[0];
183             }
184             # search indexes with all combinations from the input
185 19         35 while ($state[0] <= $#{$_[$varpos[0]]}) {
  54         137  
186 35         51 foreach my $index (@{$self->{indexes}})
  35         71  
187 35         90 { foreach my $entry ($index->search(@step))
188 38         317 { push @{$results{$entry->tag}}, $entry } }
  38         79  
189             } continue {
190             # count in variable base in @state using the input arrayrefs
191 35         278 my $i = $#state;
192 35   100     91 $i-- while $i > 0 && $state[$i] >= $#{$_[$varpos[$i]]};
  8         30  
193 35         104 $step[$varpos[$i]] = $_[$varpos[$i]]->[++$state[$i]];
194 35         113 $step[$varpos[$i]] = $_[$varpos[$i]]->[$state[$i] = 0]
195             while ++$i <= $#state;
196             }
197             } else {
198             # simple case with single values; only one scan needed
199 44         69 foreach my $index (@{$self->{indexes}})
  44         109  
200 66         323 { foreach my $entry ($index->search(@_))
201 70         423 { push @{$results{$entry->tag}}, $entry } }
  70         172  
202             }
203              
204             # coalesce and sort the collected index entries
205             my @results =
206 63         299 map {coalesce WARC::Index::Entries ($results{$_})} keys %results;
  108         460  
207             @results = # sort by distance using Schwartzian transform
208 108         216 (map { $_->[0] } sort { $a->[1] <=> $b->[1] }
  87         140  
209 63         182 map { [$_, scalar $_->distance(@_)] } @results);
  108         256  
210              
211             # return either the entire sorted list or the best match
212 63 100       160 if (wantarray) { return map {$_->record(collection => $self)} @results }
  31         59  
  67         1103  
213             else { return (scalar @results
214 32 100       141 ? $results[0]->record(collection => $self) : undef) }
215             }
216              
217             =back
218              
219             =cut
220              
221             1;
222             __END__