File Coverage

blib/lib/WARC/Index/Entry.pm
Criterion Covered Total %
statement 66 66 100.0
branch 36 36 100.0
condition 6 6 100.0
subroutine 13 13 100.0
pod 7 7 100.0
total 128 128 100.0


line stmt bran cond sub pod time code
1             package WARC::Index::Entry; # -*- CPerl -*-
2              
3 29     29   70734 use strict;
  29         58  
  29         832  
4 29     29   134 use warnings;
  29         50  
  29         1510  
5              
6             our @ISA = qw();
7              
8             require WARC; *WARC::Index::Entry::VERSION = \$WARC::VERSION;
9              
10 29     29   176 use Carp;
  29         62  
  29         24145  
11              
12             require WARC::Record::Stub;
13              
14             =head1 NAME
15              
16             WARC::Index::Entry - abstract base class for WARC::Index entries
17              
18             =head1 SYNOPSIS
19              
20             use WARC; # or ...
21             use WARC::Index;
22              
23             # WARC::Index::Entry objects are returned from directly searching an index
24              
25             # match search criteria against entry
26             $distance = $entry->distance( ... );
27             @report = $entry->distance( ... );
28              
29             $index = $entry->index; # get index containing entry
30             $volume = $entry->volume; # get WARC::Volume containing record
31             $record = $entry->record; # get WARC record
32              
33             =head1 DESCRIPTION
34              
35             =head2 Common Methods
36              
37             Entries from all index systems support these methods:
38              
39             =over
40              
41             =item @report = $entry-Edistance( ... )
42              
43             =item $distance = $entry-Edistance( ... )
44              
45             In list context, return a detailed report mapping each search I to a
46             distance value. In scalar context, return an overall summary distance,
47             such that sorting entries by the return values of this method in ascending
48             order will place the closest matches at the top of the list.
49              
50             A valid distance is non-negative. Negative distance values indicate that
51             the record does not match the criteria at all. An undefined value
52             indicates that the entry is from an index that does not store the
53             information needed to evaluate distance for that search key. Undefined
54             values are ignored when computing the summarized distance, but the
55             summarized distance will be negative if any keys do not match at all and
56             itself undefined if none of the requested keys can be evaluated.
57              
58             For details on available search keys, see the L<"Search Keys"
59             section|WARC::Collection/"Search Keys"> of the C page.
60             If multiple values are given in an arrayref, the best match is reported.
61              
62             =cut
63              
64             sub distance {
65 698     698 1 38983 my $self = shift;
66              
67 698 100       1210 unless (defined wantarray)
68 1         202 { carp "calling 'distance' method in void context"; return }
  1         64  
69              
70 697 100       1217 croak "no arguments given to 'distance' method"
71             unless scalar @_;
72 696 100       1260 croak "odd number of arguments given to 'distance' method"
73             if scalar @_ % 2;
74              
75 695 100       1011 if (wantarray) { return $self->_distance_report(@_) }
  20         43  
76 675         1048 else { return $self->_distance_summary(@_) }
77             }
78              
79             sub _distance_report {
80 20     20   25 my $self = shift;
81 20         29 my @report = ();
82              
83 20         47 for (my $i = 0; $i < @_; $i += 2)
84 24         57 { push @report, $_[$i] => $self->_distance_for_item($_[$i] => $_[1+$i]) }
85              
86 20         62 return @report;
87             }
88              
89             sub _distance_summary {
90 675     675   829 my $self = shift;
91              
92 675         770 my $summary = 0;
93 675         768 my $match = 1;
94 675         738 my $seen = 0;
95              
96 675         1055 while (@_) {
97 936         1598 my $distance = $self->_distance_for_item(splice @_, 0, 2);
98 934 100       1540 next unless defined $distance;
99 920         1050 $seen++;
100 920 100       1306 if ($distance < 0) { $match = 0 }
  475         833  
101 445         748 else { $summary += $distance }
102             }
103              
104 673 100       1239 return undef unless $seen;
105 660 100       1771 return $match ? $summary : -(1+$summary);
106             }
107              
108             # Single Point of Truth for index key definitions
109             our %_distance_value_map =
110             ( time => [numeric => 'time'],
111             record_id => [exact => 'record_id'],
112             segment_origin_id => [exact => 'segment_origin_id'],
113             url => [exact => 'url'],
114             url_prefix => [prefix => 'url'],
115             );
116              
117             sub _distance_for_item {
118 960     960   1150 my $self = shift;
119 960         1144 my $item = shift;
120 960         1452 my @sought = (scalar shift);
121 960 100       2582 @sought = @{$sought[0]} if UNIVERSAL::isa($sought[0], 'ARRAY');
  55         116  
122              
123             croak "index distance requested for unknown item $item"
124 960 100       1818 unless defined $_distance_value_map{$item};
125 959         1874 my $actual = $self->value($_distance_value_map{$item}[1]);
126              
127 959 100       2604 return undef unless defined $actual;
128              
129 943         1278 my $mode = $_distance_value_map{$item}[0];
130              
131 943         1071 my $distance = -1;
132 943 100       1540 if ($mode eq 'exact') {
    100          
    100          
133 725         1064 foreach my $sought (@sought) {
134 758 100       1440 $distance = 0 if $sought eq $actual;
135             }
136             } elsif ($mode eq 'numeric') {
137 124         171 foreach my $sought (@sought) {
138 128         307 my $here = abs($actual - $sought);
139 128 100 100     1987 $distance = $here if $distance < 0 || $here < $distance;
140             }
141             } elsif ($mode eq 'prefix') {
142 93         228 foreach my $sought (@sought) {
143 97 100       231 next unless $sought eq substr $actual, 0, length $sought;
144 56         113 my $here = length($actual) - length($sought);
145 56 100 100     139 $distance = $here if $distance < 0 || $here < $distance;
146             }
147 1         11 } else { die "unknown mode '$mode' for item '$item'" }
148 942         1971 return $distance;
149             }
150              
151             =item $index = $entry-Eindex
152              
153             Return the C containing this entry.
154              
155             =cut
156              
157             sub index {
158 1     1 1 582 die __PACKAGE__." is an abstract base class and "
159             .(ref shift)." must override the 'index' method"
160             }
161              
162             =item $volume = $entry-Evolume
163              
164             Return the C object representing the file in which this index
165             entry's record is located.
166              
167             =cut
168              
169             sub volume {
170 1     1 1 348 die __PACKAGE__." is an abstract base class and "
171             .(ref shift)." must override the 'volume' method"
172             }
173              
174             =item $record = $entry-Erecord( ... )
175              
176             Return the C this index entry represents. Arguments if given
177             are additional key =E value pairs for the record object.
178              
179             =cut
180              
181             sub record {
182 235     235 1 7748 my $self = shift;
183 235         459 return new WARC::Record::Stub ($self->volume, $self->record_offset, @_);
184             }
185              
186             =item $record_offset = $entry-Erecord_offset
187              
188             Return the file offset at which this index entry's record is located.
189              
190             =cut
191              
192             sub record_offset {
193 1     1 1 307 die __PACKAGE__." is an abstract base class and "
194             .(ref shift)." must override the 'offset' method"
195             }
196              
197             =item $value = $entry-Evalue( $key )
198              
199             Return the value this index entry holds for a given search key.
200              
201             =cut
202              
203             sub value {
204 1     1 1 302 die __PACKAGE__." is an abstract base class and "
205             .(ref shift)." must override the 'value' method"
206             }
207              
208             =item $tag = $entry-Etag
209              
210             Return a tag for this index entry. The exact format of the tag is
211             unspecified and platform-dependent. Two index entries that refer to
212             different records are guaranteed (if the underlying system software behaves
213             correctly) to have different tag values, while two entries that refer to
214             the same record in the same volume will normally have the same tag value,
215             except in edge cases.
216              
217             =cut
218              
219             sub tag {
220 44     44 1 647 my $self = shift;
221 44         85 return (($self->volume->_file_tag).':'.($self->record_offset));
222             }
223              
224             =back
225              
226             =head2 Optional Methods
227              
228             Some index entries may additionally support any of these methods:
229              
230             =over
231              
232             =item $next_entry = $entry-Enext
233              
234             Indexes with an inherent sequence of entries may provide a method to obtain
235             the next entry in the index. Some index systems have this, while others do
236             not have a meaningful order amongst their entries.
237              
238             =item $position = $entry-Eentry_position
239              
240             Indexes with an inherent sequence of entries may provide a method to obtain
241             some kind of index-specific entry number or location parameter. This is
242             most useful for metaindexes to record the location of an index entry.
243              
244             =back
245              
246             =cut
247              
248             1;
249             __END__