File Coverage

blib/lib/WARC/Index/File/CDX/Builder.pm
Criterion Covered Total %
statement 65 65 100.0
branch 17 20 85.0
condition 3 3 100.0
subroutine 12 12 100.0
pod 1 1 100.0
total 98 101 97.0


line stmt bran cond sub pod time code
1             package WARC::Index::File::CDX::Builder; # -*- CPerl -*-
2              
3 1     1   71687 use strict;
  1         11  
  1         29  
4 1     1   5 use warnings;
  1         1  
  1         65  
5              
6             require WARC::Index::Builder;
7             our @ISA = qw(WARC::Index::Builder);
8              
9             require WARC; *WARC::Index::File::CDX::Builder::VERSION = \$WARC::VERSION;
10              
11 1     1   591 use URI;
  1         7123  
  1         36  
12 1     1   7 use Carp;
  1         3  
  1         77  
13 1     1   6 use Cwd qw//;
  1         3  
  1         16  
14 1     1   5 use File::Spec;
  1         2  
  1         23  
15 1     1   5 use Fcntl qw/:seek/;
  1         2  
  1         1423  
16             require File::Spec::Unix;
17              
18             our %Record_Field_Handlers =
19             # each handler is called with WARC::Record and index builder objects and
20             # returns the text value for that field or undef, which is written as '-'
21             (a => sub { (shift)->field('WARC-Target-URI') },
22             k => sub { (shift)->field('WARC-Payload-Digest') },
23             u => sub { (shift)->id },
24              
25             b => sub { my $date = (shift)->date;
26             $date =~ y/-T:Z//d; substr $date, 0, 14 },
27             N => sub { my $uri = (shift)->field('WARC-Target-URI');
28             return undef unless $uri;
29             $uri = new URI ($uri);
30             return undef unless $uri->can('host') && $uri->can('path');
31             my $surt_host = join ',', reverse split /[.]/, $uri->host;
32             return $surt_host.')'.$uri->path
33             },
34              
35             g => sub { my $record = shift; my $builder = shift;
36             return $builder->_get_relvolname($record->volume) },
37             S => sub { (shift)->{sl_packed_size} },
38             v => sub { my $record = shift;
39             return undef if $record->volume->filename !~ m/[.]warc\z/;
40             return $record->offset },
41             V => sub { my $record = shift;
42             return undef if $record->volume->filename =~ m/[.]warc\z/;
43             return $record->offset },
44              
45             # HTTP responses only
46             m => sub { my $response = (shift)->replay;
47             return undef unless UNIVERSAL::can($response, 'headers');
48             $response->headers->content_type },
49             r => sub { my $response = (shift)->replay;
50             return undef unless UNIVERSAL::can($response, 'headers');
51             $response->headers->header('Location') },
52             s => sub { my $response = (shift)->replay;
53             return undef unless UNIVERSAL::can($response, 'code');
54             $response->code },
55             );
56              
57             # This implementation uses a hash as the underlying structure.
58              
59             # Keys defined by this class:
60             #
61             # file_name
62             # Name of CDX file where records will be appended.
63             # file
64             # Handle opened for writing/appending on that file.
65             # fields
66             # CDX field letters to be written.
67             # fieldgen
68             # Array of handlers to call to produce field values.
69             # delimiter
70             # Field delimiter used in CDX file. Default is space; cannot be set
71             # as an option but can be read from an existing CDX file header.
72             # volnames
73             # Hash mapping volume names to relative paths from the CDX file.
74              
75             sub _get_relvolname {
76 37     37   53 my $self = shift;
77 37         78 my $name = (shift)->filename;
78              
79 37 100       143 return $self->{volnames}->{$name} if defined $self->{volnames}{$name};
80              
81             # otherwise ...
82 4         74 my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{file_name});
83 4         310 my $relname = File::Spec->abs2rel
84             ($name, File::Spec->catpath($vol, $cdx_dirs, undef));
85 4         39 my ($rvol, $rel_dirs, $rel_file) = File::Spec->splitpath($relname);
86 4         16 my @rel_dirs = File::Spec->splitdir($rel_dirs);
87 4         30 my $warcfilename = File::Spec::Unix->catpath
88             ($rvol, File::Spec::Unix->catdir(@rel_dirs), $rel_file);
89 4         24 return $self->{volnames}{$name} = $warcfilename;
90             }
91              
92             sub _new {
93 7     7   5040 my $class = shift;
94 7         26 my %args = @_;
95              
96 7 100       252 croak "required parameter 'into' missing" unless $args{into};
97              
98             my $ob = { delimiter => ' ', fields => [qw/N b a m s k r M S V g u/],
99 6         354 file_name => Cwd::abs_path($args{into}) };
100              
101 6 100       31 $ob->{fields} = $args{fields} if $args{fields};
102              
103 6 50       307 open my $fh, '+>>', $args{into} or croak $args{into}.': '.$!;
104             {
105 6         18 local $/ = "\012";
  6         32  
106 6 50       53 seek $fh, 0, SEEK_SET or croak 'seek '.$args{into}.': '.$!;
107 6         87 my $header = <$fh>;
108 6 100       22 if ($header) {
109             $header =~ m/^(.)CDX((?:\1[[:alpha:]])+)/
110 3 100       438 or croak $args{into}.' exists but lacks CDX header';
111 1         4 $ob->{delimiter} = $1;
112 1         18 $ob->{fields} = [split /\Q$1/, $2];
113 1         3 shift @{$ob->{fields}}; # remove leading empty field
  1         3  
114             } else {
115             # write CDX header
116 3         10 print $fh ' CDX ', join(' ', @{$ob->{fields}}), "\012";
  3         21  
117             }
118 4 50       112 seek $fh, 0, SEEK_END or croak 'seek '.$args{into}.': '.$!;
119             }
120 4         15 $ob->{file} = $fh;
121              
122             $ob->{fieldgen} =
123 4 100   29   7 [map { $Record_Field_Handlers{$_} or sub { undef } } @{$ob->{fields}}];
  30         94  
  29         57  
  4         12  
124              
125 4         92 bless $ob, $class
126             }
127              
128             # inherit add
129              
130             sub _add_record {
131 37     37   67 my $self = shift;
132 37         53 my $record = shift;
133              
134 356 100 100     1036 my $line = join $self->{delimiter}, map { defined $_ && $_ ne '' ? $_ : '-' }
135 37         75 map { $_->($record, $self) } @{$self->{fieldgen}};
  372         2562  
  37         89  
136              
137 37         101 print {$self->{file}} $line, "\012";
  37         224  
138             }
139              
140             sub flush {
141 4     4 1 28 my $self = shift;
142              
143 4         125 seek $self->{file}, 0, SEEK_END
144             }
145              
146             1;
147             __END__