File Coverage

blib/lib/WARC/Builder.pm
Criterion Covered Total %
statement 9 9 100.0
branch n/a
condition n/a
subroutine 3 5 60.0
pod 2 2 100.0
total 14 16 87.5


line stmt bran cond sub pod time code
1             package WARC::Builder; # -*- CPerl -*-
2              
3 1     1   70665 use strict;
  1         11  
  1         30  
4 1     1   4 use warnings;
  1         2  
  1         26  
5              
6 1     1   5 use Carp;
  1         1  
  1         187  
7              
8             our @ISA = qw();
9              
10             require WARC; *WARC::Builder::VERSION = \$WARC::VERSION;
11              
12             require WARC::Record;
13              
14             =head1 NAME
15              
16             WARC::Builder - Web ARChive construction support for Perl
17              
18             =head1 SYNOPSIS
19              
20             use WARC::Builder;
21              
22             $warcinfo_data = new WARC::Fields (software => 'MyWebCrawler/1.2.3 ...',
23             format => 'WARC File Format 1.0',
24             # other fields omitted ...
25             );
26              
27             $warcinfo = new WARC::Record (type => 'warcinfo',
28             content => $warcinfo_data);
29              
30             # for a small-scale crawl
31             $build = new WARC::Builder (warcinfo => $warcinfo,
32             filename => $warcfilename);
33              
34             # for a large-scale crawl
35             $index1 = build WARC::Index::File::CDX (into => $indexprefix.'.cdx');
36             $index2 = build WARC::Index::File::SDBM (into => $indexprefix.'.sdbm');
37             $build = new WARC::Builder (warcinfo => $warcinfo,
38             filename_template =>
39             $warcprefix.'-%s-%05d-'.$hostname.'.warc.gz',
40             index => [$index1, $index2]);
41              
42             # for each collected object
43             $build->append(@records); # or ...
44             $build->append($record1, $record2, ... );
45              
46             =head1 DESCRIPTION
47              
48             The C<WARC::Builder> class is the high-level interface for writing WARC
49             archives. It is a very simple interface, because, at this level, WARC is a
50             very simple format: a simple sequence of WARC records, which
51             C<WARC::Builder> accepts as C<WARC::Record> objects to append to the
52             in-progress WARC file.
53              
54             WARC file size limits are handled automatically if configured.
55              
56             =head2 Methods
57              
58             =over
59              
60             =item $build = new WARC::Builder (I<key> =E<gt> I<value>, ...)
61              
62             Construct a C<WARC::Builder> object. The following keys are supported:
63              
64             =over
65              
66             =item index =E<gt> [$index]
67              
68             =item index =E<gt> [$index1, $index2, ...]
69              
70             If set, must be an array reference of a list of index builder objects.
71             Each newly-added WARC::Record will be presented to all index builder
72             objects in this list.
73              
74             =item filename =E<gt> $warcfilename
75              
76             If set, create a single WARC file with the given file name. The file name
77             must match m/\.warc(?:\.gz)?$/. The presence of a final ".gz" indicates
78             that the WARC file should be written with per-record gzip compression.
79              
80             This option is mutually exclusive with the C<filename_template> option.
81              
82             Using this option inhibits starting a new WARC file and causes the
83             C<max_file_size> option to be ignored. A warning is emitted in this case.
84              
85             =item filename_template =E<gt> $warcprefix.'-%s-%05d-'.$hostname.'.warc.gz'
86              
87             Establish an sprintf format string to construct file names. The file name
88             produced by the template string must match m/\.warc(?:\.gz)?$/. The
89             presence of a final ".gz" indicates that the WARC file should be written
90             with per-record gzip compression.
91              
92             The C<filename_template> option gives the format string, while
93             C<filename_template_vars> gives an array reference of named parameters to
94             be used with the format.
95              
96             If constructing file names in accordance with the IIPC WARC implementation
97             guidelines, this string should be of the form
98             'PREFIX-%s-%05d-HOSTNAME.warc.gz' where PREFIX is any chosen prefix to name
99             the crawl and HOSTNAME is the name or other identifier for the machine
100             writing the file.
101              
102             This option is mutually exclusive with the C<filename> option.
103              
104             =item filename_template_vars =E<gt> [qw/timestamp serial/]
105              
106             Provide the list of parameters to the sprintf call used to produce a WARC
107             filename from the C<filename_template> option.
108              
109             The available variables are:
110              
111             =over
112              
113             =item serial
114              
115             A number, incremented each time adding a record causes a new WARC file to
116             be started.
117              
118             =item timestamp
119              
120             A 14-digit timestamp in the YYYYmmddHHMMSS format recommended in the IIPC
121             WARC implementation guidelines. The timestamp is always in UTC. The time
122             used is the time at which the C<WARC::Builder> object was constructed and
123             is constant between WARC files. This should be substituted as a string.
124              
125             =back
126              
127             Default [qw/timestamp serial/] in accordance with IIPC guidelines.
128              
129             =item first_serial =E<gt> $count
130              
131             The initial value of the C<serial> filename variable for this object.
132             Default 0.
133              
134             =item max_file_size =E<gt> $size
135              
136             Maximum size of a WARC file. A new WARC file is started if appending a
137             record would cause the current file to exceed this length.
138              
139             The limit can be specified as an exact number of bytes, or a number
140             followed by a size suffix m/[KMG]i?/. The "K", "M", and "G" suffixes
141             indicate base-10 multiples (10**(3*n)), while the "Ki", "Mi", and "Gi"
142             suffixes indicate base-2 multiples (2**(10*n)) widely used in computing.
143              
144             Default "1G" == 1_000_000_000.
145              
146             =item warcinfo =E<gt> $warcinfo_record
147              
148             A C<WARC::Record> object of type "warcinfo" that will be written at the
149             start of each WARC file. This record will be cloned and written with a
150             distinct "WARC-Record-ID" as the first record in each WARC file, including
151             the first. As a consequence, it does not require a "WARC-Record-ID" header
152             and any "WARC-Record-ID" given is silently ignored.
153              
154             Each clone of this record will also have the "WARC-Filename" header added.
155              
156             Each clone of this record will also have the "WARC-Date" header set to the
157             time at which the C<WARC::Builder> object was constructed.
158              
159             =item warcversion =E<gt> 'WARC/1.0'
160              
161             Set the version of the WARC format to be written. This string is the first
162             line of each WARC record. It must begin with the prefix 'WARC/' and should
163             be the version from the WARC specification that the crawler follows.
164              
165             Default "WARC/1.0".
166              
167             =back
168              
169             =cut
170              
171       0 1   sub new {
172             }
173              
174             =item $build-E<gt>append( $record1, ... )
175              
176             Add any number of C<WARC::Record> objects to the growing WARC file. If
177             WARC file size limits are configured, and a record would cause the current
178             WARC file to exceed the configured size limits, a new WARC file is opened
179             automatically.
180              
181             All records passed to a single C<append> call are added to the same WARC
182             file. If a new WARC file is to be started, it will be started B<before>
183             any records are written.
184              
185             All records passed to a single C<append> call are considered "concurrent"
186             and all subsequent records will have a "WARC-Concurrent-To" header added
187             referencing the first record, if they do not already have a
188             "WARC-Concurrent-To" header. This is a convenience feature for simpler
189             crawlers and is inhibited if any record already has a "WARC-Concurrent-To"
190             header when C<append> is called.
191              
192             If a C<WARC::Record> passed to this method lacks a "WARC-Record-ID" header,
193             a warning will be emitted using carp(), a UUID will be generated, and a
194             record ID of the form "urn:uuid:UUID" will be assigned. If the record
195             object is read-only, this method will croak() instead.
196              
197             If a C<WARC::Record> passed to this method lacks any of the "WARC-Date",
198             "WARC-Type", or "Content-Length" headers, this method will croak().
199              
200             =cut
201              
202       0 1   sub append {
203             }
204              
205             =back
206              
207             =cut
208              
209             1;
210             __END__