File Coverage

blib/lib/WARC/Record.pm
Criterion Covered Total %
statement 70 70 100.0
branch 17 18 94.4
condition n/a
subroutine 27 27 100.0
pod 19 19 100.0
total 133 134 99.2


line stmt bran cond sub pod time code
1             package WARC::Record; # -*- CPerl -*-
2              
3 28     28   70596 use strict;
  28         57  
  28         752  
4 28     28   128 use warnings;
  28         45  
  28         588  
5              
6 28     28   112 use Carp;
  28         58  
  28         1294  
7 28     28   178 use Scalar::Util;
  28         54  
  28         2526  
8              
9             our @ISA = qw();
10              
11             require WARC; *WARC::Record::VERSION = \$WARC::VERSION;
12             require WARC::Date;
13              
14             =head1 NAME
15              
16             WARC::Record - one record from a WARC file
17              
18             =head1 SYNOPSIS
19              
20             use WARC; # or ...
21             use WARC::Volume; # or ...
22             use WARC::Collection;
23              
24             # WARC::Record objects are returned from ->record_at and ->search methods
25              
26             # Construct a record, as when preparing a WARC file
27             $warcinfo = new WARC::Record (type => 'warcinfo');
28              
29             # Accessors
30              
31             $value = $record->field($name);
32              
33             $version = $record->protocol; # analogous to HTTP::Message::protocol
34             $volume = $record->volume;
35             $offset = $record->offset;
36             $record = $record->next;
37              
38             $fields = $record->fields;
39              
40             # Supply a data block for an in-memory record
41             $warcinfo->block(new WARC::Fields ( ... ));
42              
43             =cut
44              
45 28     28   25024 use overload '<=>' => 'compareTo', 'cmp' => 'compareTo';
  28         19852  
  28         154  
46 28     28   2249 use overload fallback => 1;
  28         64  
  28         122  
47              
48             # This implementation uses a hash as the underlying object.
49             # Keys defined by this class:
50             #
51             # fields
52             # Embedded WARC::Fields object
53             #
54             # Keys defined by this class but overridden and unused in subclasses:
55             #
56             # block
57             # Data block for this record
58              
59             sub _dbg_dump {
60 24     24   40 my $self = shift;
61              
62 24         29 my $out = "WARC record [dumping as base class]\n";
63 24         35 my @out = map {s/^/ /gm; $_} $self->fields->as_string;
  24         146  
  24         78  
64 24         79 $out .= join("\n", @out);
65              
66 24         76 return $out;
67             }
68              
69             =head1 DESCRIPTION
70              
71             C objects come in two flavors with a common interface.
72             Records read from WARC files are read-only and have meaningful return
73             values from the methods listed in "Methods on records from WARC files".
74             Records constructed in memory can be updated and those same methods all
75             return undef.
76              
77             =head2 Common Methods
78              
79             =over
80              
81             =item $record-Efields
82              
83             Get the internal C object that contains WARC record headers.
84              
85             =cut
86              
87 3357     3357 1 7482 sub fields { (shift)->{fields} }
88              
89             =item $record-Efield( $name )
90              
91             Get the value of the WARC header named $name from the internal
92             C object.
93              
94             =cut
95              
96             sub field {
97 5272     5272 1 6991 my $self = shift;
98 5272         7686 return $self->fields->field(shift);
99             }
100              
101             =item $record E=E $other_record
102              
103             =item $record-EcompareTo( $other_record )
104              
105             Compare two C objects according to a simple total order:
106             ordering by starting offset for two records in the same file, and by
107             filename of the containing C objects for records in different
108             files. Constructed C objects are assumed to come from a
109             volume named "" (the empty string) for this purpose, and are ordered in an
110             arbitrary but stable manner amongst themselves. Distinct constructed
111             C objects never compare as equal.
112              
113             Perl constructs a C<==> operator using this method, so WARC record objects
114             will compare as equal iff they refer to the same physical record.
115              
116             =cut
117              
118             sub compareTo {
119 14     14 1 4445 my $a = shift;
120 14         24 my $b = shift;
121 14         20 my $swap = shift;
122              
123             # sort in-memory-only records ahead of on-disk records
124 14 100       30 return $swap ? 1 : -1 if defined $b->volume;
    100          
125              
126             # neither record is from a WARC volume
127 7         20 my $cmp = (Scalar::Util::refaddr $a) <=> (Scalar::Util::refaddr $b);
128              
129 7 50       51 return $swap ? 0-$cmp : 0+$cmp;
130             }
131              
132             =back
133              
134             =head3 Convenience getters
135              
136             =over
137              
138             =item $record-Etype
139              
140             Alias for C<$record-Efield('WARC-Type')>.
141              
142             =cut
143              
144 302     302 1 708 sub type { (shift)->field('WARC-Type') }
145              
146             =item $record-Eid
147              
148             Alias for C<$record-Efield('WARC-Record-ID')>.
149              
150             =cut
151              
152 659     659 1 103510 sub id { (shift)->field('WARC-Record-ID') }
153              
154             =item $record-Econtent_length
155              
156             Alias for C<$record-Efield('Content-Length')>.
157              
158             =cut
159              
160 4     4 1 1313 sub content_length { (shift)->field('Content-Length') }
161              
162             =item $record-Edate
163              
164             Return the C<'WARC-Date'> field as a C object.
165              
166             =cut
167              
168 262     262 1 3477 sub date { WARC::Date->from_string((shift)->field('WARC-Date')) }
169              
170             =back
171              
172             =head2 Methods on records from WARC files
173              
174             These methods all return undef if called on a C object that
175             does not represent a record in a WARC file.
176              
177             =over
178              
179             =item $record-Eprotocol
180              
181             Return the format and version tag for this record. For WARC 1.0, this
182             method returns 'WARC/1.0'.
183              
184             =cut
185              
186 1     1 1 5 sub protocol { return undef }
187              
188             =item $record-Evolume
189              
190             Return the C object representing the file in which this
191             record is located.
192              
193             =cut
194              
195 11     11 1 50 sub volume { return undef }
196              
197             =item $record-Eoffset
198              
199             Return the file offset at which this record can be found.
200              
201             =cut
202              
203 1     1 1 5 sub offset { return undef }
204              
205             =item $record-Elogical
206              
207             Return the logical record object for this record. Logical records
208             reassemble WARC continuation segments. Records recorded without using WARC
209             segmentation are their own logical records. Reassembled logical records
210             are also their own logical records.
211              
212             =cut
213              
214 1     1 1 4 sub logical { return undef }
215              
216             =item $record-Esegments
217              
218             Return a list of segments for this record. A record recorded without using
219             WARC segmentation, including a segment of a larger logical record, is
220             considered its own only segment. A constructed record is considered to
221             have no segments at all.
222              
223             This method exists on all records to allow
224             C<$record-Elogical-Esegments> to work.
225              
226             =cut
227              
228 1     1 1 6 sub segments { return () }
229              
230             =item $record-Enext
231              
232             Return the next C in the WARC file that contains this record.
233             Returns an undefined value if called on the last record in a file.
234              
235             =cut
236              
237 1     1 1 4 sub next { return undef }
238              
239             =item $record-Eopen_block
240              
241             Return a tied filehandle that reads the WARC record block.
242              
243             The WARC record block is the content of a WARC record, analogous to the
244             entity body in an C.
245              
246             =cut
247              
248 5     5 1 12 sub open_block { return undef }
249              
250             =item $record-Eopen_continued
251              
252             Return a tied filehandle that reads the logical WARC record block.
253              
254             For records that do not use WARC segmentation, this is effectively an alias
255             for C<$record-Eopen_block>. For records that span multiple segments,
256             this is an alias for C<$record-Elogical-Eopen_block>.
257              
258             =cut
259              
260 1     1 1 4 sub open_continued { return undef }
261              
262             =item $record-Ereplay
263              
264             =item $record-Ereplay( as =E $type )
265              
266             Return a protocol-specific object representing the record contents.
267              
268             This method returns undef if the library does not recognize the protocol
269             message stored in the record and croaks if a requested conversion is not
270             supported.
271              
272             A record with Content-Type "application/http" with an appropriate "msgtype"
273             parameter produces an C or C object. The
274             returned object may be a subclass to support deferred loading of entity
275             bodies.
276              
277             A request to replay a record "as =E http" attempts to convert whatever
278             is stored in the record to an HTTP exchange, analogous to the "everything
279             is HTTP" interface that C provides.
280              
281             =cut
282              
283 1     1 1 5 sub replay { return undef }
284              
285             =item $record-Eopen_payload
286              
287             Return a tied filehandle that reads the WARC record payload.
288              
289             The WARC record payload is defined as the decoded content of the protocol
290             response or other resource stored in the record. This method returns undef
291             if called on a WARC record that has no payload or that has content that we
292             do not recognize.
293              
294             =cut
295              
296 1     1 1 4 sub open_payload { return undef }
297              
298             =back
299              
300             =head2 Methods on fresh WARC records
301              
302             =over
303              
304             =item $record = new WARC::Record (I =E I, ...)
305              
306             Construct a fresh WARC record, suitable for use with C.
307              
308             =cut
309              
310             sub new {
311 9     9 1 26153 my $class = shift;
312 9         28 my %opt = @_;
313              
314 9         23 foreach my $name (qw/type/)
315 9 100       290 { croak "required field '$name' not specified" unless $opt{$name} }
316              
317 8         41 my $fields = new WARC::Fields ('WARC-Type' => $opt{type});
318              
319 8         69 bless { fields => $fields }, $class;
320             }
321              
322             =item $record-Eblock
323              
324             =item $record-Eblock( $new_value )
325              
326             Get or set the block contents of an in-memory record. This method returns
327             undef if called on a WARC record from a volume and croaks if setting the
328             contents is attempted on a record from a volume.
329              
330             =cut
331              
332             sub block {
333 10     10 1 718 my $self = shift;
334 10         18 my $block = $self->{block};
335              
336 10 100       27 if (@_) {
337 6         10 my $new = shift;
338              
339 6 100       15 if (ref $new) {
340 4 100       56 if ($new->can('as_block'))
    100          
    100          
341 1         4 { $self->{block} = $new->as_block }
342             elsif ($new->isa('HTTP::Message'))
343 1         6 { $self->{block} = $new->as_string(WARC::CRLF()) }
344             elsif ($new->can('as_string'))
345 1         4 { $self->{block} = $new->as_string }
346             else
347 1         165 { croak "unrecognized object submitted as WARC record block" }
348 2         6 } else { $self->{block} = $new }
349              
350 28     28   33306 use bytes;
  28         343  
  28         144  
351 5         366 $self->{fields}->field('Content-Length', length $self->{block});
352             }
353              
354 9         31 return $block;
355             }
356              
357             =back
358              
359             =cut
360              
361             1;
362             __END__