File Coverage

blib/lib/WARC/Volume.pm
Criterion Covered Total %
statement 52 52 100.0
branch 3 4 75.0
condition 3 8 37.5
subroutine 17 17 100.0
pod 5 5 100.0
total 80 86 93.0


line stmt bran cond sub pod time code
1             package WARC::Volume; # -*- CPerl -*-
2              
3 27     27   71412 use strict;
  27         59  
  27         766  
4 27     27   127 use warnings;
  27         54  
  27         622  
5              
6 27     27   116 use Carp;
  27         43  
  27         1392  
7 27     27   189 use Cwd qw//;
  27         46  
  27         948  
8              
9             our @ISA = qw();
10              
11 27     27   549 use WARC; *WARC::Volume::VERSION = \$WARC::VERSION;
  27         55  
  27         1151  
12              
13 27     27   10094 use WARC::Record;
  27         56  
  27         816  
14 27     27   10477 use WARC::Record::FromVolume;
  27         75  
  27         1072  
15              
16             =head1 NAME
17              
18             WARC::Volume - Web ARChive file access for Perl
19              
20             =head1 SYNOPSIS
21              
22             use WARC::Volume;
23              
24             $volume = mount WARC::Volume ($filename);
25              
26             $filename = $volume->filename;
27              
28             $handle = $volume->open;
29              
30             $record = $volume->first_record;
31              
32             $record = $volume->record_at($offset);
33              
34             =cut
35              
36 27     27   166 use overload '""' => 'filename';
  27         53  
  27         152  
37 27     27   1710 use overload fallback => 1;
  27         60  
  27         113  
38              
39             # This implementation is almost laughably simple, needing to store only a
40             # single data value: the absolute filename of the WARC file. As such, the
41             # underlying implementation, is, in fact, a blessed string.
42              
43             =head1 DESCRIPTION
44              
45             A C object represents a WARC file in the filesystem and
46             provides access to the WARC records within as C objects.
47              
48             =head2 Methods
49              
50             =over
51              
52             =item $volume = mount WARC::Volume ($filename)
53              
54             Construct a C object. The parameter is the name of an
55             existing WARC file. An exception is raised if the first record does not
56             have a valid WARC header.
57              
58             =cut
59              
60             sub mount {
61 48     48 1 34177 my $class = shift;
62 48         80 my $filename = shift;
63              
64 48         2357 my $fullfilename = Cwd::abs_path($filename);
65 48         165 my $ob = bless \$fullfilename, $class;
66              
67 48         163 $ob->first_record;
68              
69 48         729 return $ob;
70             }
71              
72             =item $volume-Efilename
73              
74             Return the filename for this volume.
75              
76             =cut
77              
78 1000     1000 1 10610 sub filename { ${(shift)} }
  1000         12658  
79              
80             =item $volume-Eopen
81              
82             Return a readable and seekable file handle for this volume. The returned
83             value may be a tied handle. Do not assume that it is an C.
84              
85             =cut
86              
87             sub open {
88 857     857 1 1088 my $self = shift;
89 857         1365 my $filename = $$self;
90              
91 857 100       28834 open my $fh, '<', $filename or die "$filename: $!";
92 856         5062 binmode $fh, ':raw'; # WARC files contain binary data and UTF-8 headers
93 856         2553 return $fh;
94             }
95              
96             =item $volume-Efirst_record
97              
98             Construct and return a C object representing the first WARC
99             record in $volume. This should be a "warcinfo" record, but it is not
100             required to be so.
101              
102             =cut
103              
104 87     87 1 6825 sub first_record { (shift)->record_at(0) }
105              
106             =item $volume-Erecord_at( $offset )
107              
108             Construct and return a C object representing the WARC record
109             beginning at $offset within $volume. An exception is raised if an
110             appropriate magic number is not found at $offset.
111              
112             =cut
113              
114 106     106 1 6503 sub record_at { _read WARC::Record::FromVolume @_ }
115              
116             =back
117              
118             =cut
119              
120             # $volume->_file_tag
121             #
122             # Return a "file tag" for this volume.
123             #
124             # This is an internal procedure. The exact definition of "file tag" is
125             # platform-dependent, but it will be the same value if both file names can
126             # be proven to be the same underlying file.
127              
128             BEGIN {
129 27     27   5576 use constant ();
  27         57  
  27         1752  
130              
131 27     27   79 my $have_valid_inodes = 0;
132              
133             # We accept DEV:INO as valid if two files in the same directory have the
134             # same DEV and different INO values. We use two modules from this
135             # library for this test and retrieve their actual locations from %INC.
136 27         607 my @stat_record = stat $INC{'WARC/Record.pm'};
137 27         404 my @stat_volume = stat $INC{'WARC/Volume.pm'};
138              
139 27 50 50     407 $have_valid_inodes = 1
      33        
      33        
140             if (scalar @stat_record && scalar @stat_volume # both stat calls worked
141             && $stat_record[0] == $stat_volume[0] # both have same DEV
142             && $stat_record[1] != $stat_volume[1]); # different INO values
143              
144 27         3014 constant->import(HAVE_VALID_INODES => $have_valid_inodes);
145             }
146             sub _file_tag {
147 521     521   646 if (HAVE_VALID_INODES) {
148             # Two modules have been found to have distinct inode numbers, therefore
149             # we are probably running in a POSIX environment. Use the dev:ino
150             # pair from the stat builtin as file tag.
151              
152             # POSIX requires that this be sufficient to distinguish files, although
153             # there are situations, particularly in complex network environments,
154             # where two different dev:ino pairs may correspond to the same file.
155             # Such situations can be avoided with careful administration.
156 521         1081 return join ':', ((stat shift)[0, 1])
157             } else {
158             # Use the absolute filename and assume no links on other platforms
159              
160             # The file name stored in the WARC::Volume object is already absolute.
161             return (shift)->filename
162             }
163             }
164              
165             1;
166             __END__