File Coverage

blib/lib/WARC/Index/Entry.pm
Criterion Covered Total %
statement 66 66 100.0
branch 36 36 100.0
condition 6 6 100.0
subroutine 13 13 100.0
pod 7 7 100.0
total 128 128 100.0


line stmt bran cond sub pod time code
1             package WARC::Index::Entry; # -*- CPerl -*-
2              
3 27     27   68390 use strict;
  27         67  
  27         776  
4 27     27   140 use warnings;
  27         47  
  27         1485  
5              
6             our @ISA = qw();
7              
8             require WARC; *WARC::Index::Entry::VERSION = \$WARC::VERSION;
9              
10 27     27   161 use Carp;
  27         47  
  27         22948  
11              
12             require WARC::Record::Stub;
13              
14             =head1 NAME
15              
16             WARC::Index::Entry - abstract base class for WARC::Index entries
17              
18             =head1 SYNOPSIS
19              
20             use WARC; # or ...
21             use WARC::Index;
22              
23             # WARC::Index::Entry objects are returned from directly searching an index
24              
25             # match search criteria against entry
26             $distance = $entry->distance( ... );
27             @report = $entry->distance( ... );
28              
29             $index = $entry->index; # get index containing entry
30             $volume = $entry->volume; # get WARC::Volume containing record
31             $record = $entry->record; # get WARC record
32              
33             =head1 DESCRIPTION
34              
35             =head2 Common Methods
36              
37             Entries from all index systems support these methods:
38              
39             =over
40              
41             =item @report = $entry-Edistance( ... )
42              
43             =item $distance = $entry-Edistance( ... )
44              
45             In list context, return a detailed report mapping each search I to a
46             distance value. In scalar context, return an overall summary distance,
47             such that sorting entries by the return values of this method in ascending
48             order will place the closest matches at the top of the list.
49              
50             A valid distance is non-negative. Negative distance values indicate that
51             the record does not match the criteria at all. An undefined value
52             indicates that the entry is from an index that does not store the
53             information needed to evaluate distance for that search key. Undefined
54             values are ignored when computing the summarized distance, but the
55             summarized distance will be negative if any keys do not match at all and
56             itself undefined if none of the requested keys can be evaluated.
57              
58             For details on available search keys, see the L<"Search Keys"
59             section|WARC::Collection/"Search Keys"> of the C page.
60             If multiple values are given in an arrayref, the best match is reported.
61              
62             =cut
63              
64             sub distance {
65 698     698 1 44334 my $self = shift;
66              
67 698 100       1315 unless (defined wantarray)
68 1         216 { carp "calling 'distance' method in void context"; return }
  1         68  
69              
70 697 100       1273 croak "no arguments given to 'distance' method"
71             unless scalar @_;
72 696 100       1428 croak "odd number of arguments given to 'distance' method"
73             if scalar @_ % 2;
74              
75 695 100       1097 if (wantarray) { return $self->_distance_report(@_) }
  20         45  
76 675         1165 else { return $self->_distance_summary(@_) }
77             }
78              
79             sub _distance_report {
80 20     20   27 my $self = shift;
81 20         30 my @report = ();
82              
83 20         42 for (my $i = 0; $i < @_; $i += 2)
84 24         61 { push @report, $_[$i] => $self->_distance_for_item($_[$i] => $_[1+$i]) }
85              
86 20         64 return @report;
87             }
88              
89             sub _distance_summary {
90 675     675   846 my $self = shift;
91              
92 675         854 my $summary = 0;
93 675         772 my $match = 1;
94 675         810 my $seen = 0;
95              
96 675         1145 while (@_) {
97 936         1719 my $distance = $self->_distance_for_item(splice @_, 0, 2);
98 934 100       1629 next unless defined $distance;
99 920         1130 $seen++;
100 920 100       1463 if ($distance < 0) { $match = 0 }
  475         899  
101 445         845 else { $summary += $distance }
102             }
103              
104 673 100       1498 return undef unless $seen;
105 660 100       1888 return $match ? $summary : -(1+$summary);
106             }
107              
108             # Single Point of Truth for index key definitions
109             our %_distance_value_map =
110             ( time => [numeric => 'time'],
111             record_id => [exact => 'record_id'],
112             segment_origin_id => [exact => 'segment_origin_id'],
113             url => [exact => 'url'],
114             url_prefix => [prefix => 'url'],
115             );
116              
117             sub _distance_for_item {
118 960     960   1223 my $self = shift;
119 960         1200 my $item = shift;
120 960         1533 my @sought = (scalar shift);
121 960 100       2855 @sought = @{$sought[0]} if UNIVERSAL::isa($sought[0], 'ARRAY');
  55         126  
122              
123             croak "index distance requested for unknown item $item"
124 960 100       1957 unless defined $_distance_value_map{$item};
125 959         1967 my $actual = $self->value($_distance_value_map{$item}[1]);
126              
127 959 100       2620 return undef unless defined $actual;
128              
129 943         1315 my $mode = $_distance_value_map{$item}[0];
130              
131 943         1226 my $distance = -1;
132 943 100       1698 if ($mode eq 'exact') {
    100          
    100          
133 725         1070 foreach my $sought (@sought) {
134 758 100       1566 $distance = 0 if $sought eq $actual;
135             }
136             } elsif ($mode eq 'numeric') {
137 124         177 foreach my $sought (@sought) {
138 128         390 my $here = abs($actual - $sought);
139 128 100 100     2119 $distance = $here if $distance < 0 || $here < $distance;
140             }
141             } elsif ($mode eq 'prefix') {
142 93         142 foreach my $sought (@sought) {
143 97 100       238 next unless $sought eq substr $actual, 0, length $sought;
144 56         87 my $here = length($actual) - length($sought);
145 56 100 100     146 $distance = $here if $distance < 0 || $here < $distance;
146             }
147 1         11 } else { die "unknown mode '$mode' for item '$item'" }
148 942         1748 return $distance;
149             }
150              
151             =item $index = $entry-Eindex
152              
153             Return the C containing this entry.
154              
155             =cut
156              
157             sub index {
158 1     1 1 930 die __PACKAGE__." is an abstract base class and "
159             .(ref shift)." must override the 'index' method"
160             }
161              
162             =item $volume = $entry-Evolume
163              
164             Return the C object representing the file in which this index
165             entry's record is located.
166              
167             =cut
168              
169             sub volume {
170 1     1 1 528 die __PACKAGE__." is an abstract base class and "
171             .(ref shift)." must override the 'volume' method"
172             }
173              
174             =item $record = $entry-Erecord( ... )
175              
176             Return the C this index entry represents. Arguments if given
177             are additional key =E value pairs for the record object.
178              
179             =cut
180              
181             sub record {
182 235     235 1 8813 my $self = shift;
183 235         526 return new WARC::Record::Stub ($self->volume, $self->record_offset, @_);
184             }
185              
186             =item $record_offset = $entry-Erecord_offset
187              
188             Return the file offset at which this index entry's record is located.
189              
190             =cut
191              
192             sub record_offset {
193 1     1 1 420 die __PACKAGE__." is an abstract base class and "
194             .(ref shift)." must override the 'offset' method"
195             }
196              
197             =item $value = $entry-Evalue( $key )
198              
199             Return the value this index entry holds for a given search key.
200              
201             =cut
202              
203             sub value {
204 1     1 1 418 die __PACKAGE__." is an abstract base class and "
205             .(ref shift)." must override the 'value' method"
206             }
207              
208             =item $tag = $entry-Etag
209              
210             Return a tag for this index entry. The exact format of the tag is
211             unspecified and platform-dependent. Two index entries that refer to
212             different records are guaranteed (if the underlying system software behaves
213             correctly) to have different tag values, while two entries that refer to
214             the same record in the same volume will normally have the same tag value,
215             except in edge cases.
216              
217             =cut
218              
219             sub tag {
220 44     44 1 856 my $self = shift;
221 44         92 return (($self->volume->_file_tag).':'.($self->record_offset));
222             }
223              
224             =back
225              
226             =head2 Optional Methods
227              
228             Some index entries may additionally support any of these methods:
229              
230             =over
231              
232             =item $next_entry = $entry-Enext
233              
234             Indexes with an inherent sequence of entries may provide a method to obtain
235             the next entry in the index. Some index systems have this, while others do
236             not have a meaningful order amongst their entries.
237              
238             =item $position = $entry-Eentry_position
239              
240             Indexes with an inherent sequence of entries may provide a method to obtain
241             some kind of index-specific entry number or location parameter. This is
242             most useful for metaindexes to record the location of an index entry.
243              
244             =back
245              
246             =cut
247              
248             1;
249             __END__