File Coverage

blib/lib/WARC/Fields.pm
Criterion Covered Total %
statement 720 720 100.0
branch 250 252 100.0
condition 32 32 100.0
subroutine 99 101 98.0
pod 7 7 100.0
total 1108 1112 99.8


line stmt bran cond sub pod time code
1             package WARC::Fields; # -*- CPerl -*-
2              
3 28     28   73498 use strict;
  28         70  
  28         840  
4 28     28   137 use warnings;
  28         53  
  28         687  
5              
6 28     28   129 use Carp;
  28         60  
  28         1474  
7 28     28   14208 use Encode;
  28         363300  
  28         2067  
8 28     28   221 use Scalar::Util;
  28         73  
  28         1483  
9              
10             our @ISA = qw();
11              
12 28     28   623 use WARC; *WARC::Fields::VERSION = \$WARC::VERSION;
  28         53  
  28         1641  
13              
14             =head1 NAME
15              
16             WARC::Fields - WARC record headers and application/warc-fields
17              
18             =head1 SYNOPSIS
19              
20             require WARC::Fields;
21              
22             $f = new WARC::Fields;
23             $f = $record->fields; # get WARC record headers
24             $g = $f->clone; # make writable copy
25              
26             $g->set_readonly; # make read-only
27              
28             $f->field('WARC-Type' => 'metadata'); # set
29             $value = $f->field('WARC-Type'); # get
30              
31             $fields_text = $f->as_string; # get WARC header lines for display
32             $fields_block = $f->as_block; # format for WARC file
33              
34             tie @field_names, ref $f, $f; # bind ordered list of field names
35              
36             tie %fields, ref $f, $f; # bind hash of field names => values
37              
38             $entry = $f->[$num]; # tie an anonymous array and access it
39             $value = $f->{$name}; # likewise with an anonymous tied hash
40              
41             $name = "$entry"; # tied array returns objects
42             $value = $entry->value; # one specific value
43             $offset = $entry->offset; # N of M with same name
44              
45             foreach (keys %{$f}) { ... } # iterate over names, in order
46              
47             =cut
48              
49 28     28   2349 use overload '@{}' => \&_as_tied_array, '%{}' => \&_as_tied_hash;
  28         1808  
  28         261  
50 28     28   2024 use overload fallback => 1;
  28         52  
  28         95  
51              
52             # This implementation uses column-oriented storage, with an array as the
53             # underlying object and constants to select array offsets.
54             #
55             # The NAMES and VALUES columns are always valid, but the MVOFF and INDEX
56             # positions may be undefined and are lazily rebuilt when needed.
57              
58 28         2915 use constant { NAMES => 0, VALUES => 1, MVOFF => 2,
59 28     28   1634 INDEX => 3, IS_RO => 4, C_TA => 5, C_TH => 6 };
  28         51  
60 28     28   174 use constant OBJECT_INDEX => qw/NAMES VALUES MVOFF INDEX IS_RO/;
  28         60  
  28         1607  
61 28     28   181 use constant OBJECT_INIT => undef, undef, undef, undef, 0, undef, undef;
  28         57  
  28         83024  
62              
63 744     744   7484 sub DESTROY { my $ob = shift;
64 744 100       1846 untie @{$$ob->[C_TA]} if defined $$ob->[C_TA];
  3         23  
65 744 100       1394 untie %{$$ob->[C_TH]} if defined $$ob->[C_TH];
  63         167  
66 744         861 our $_total_destroyed; $_total_destroyed++ }
  744         6833  
67              
68             # NAMES: array of field names, exactly as written
69             # VALUES: array of field values
70             # MVOFF: array of offsets for multiple-valued fields
71             # INDEX: hash of case-folded field names to array of row numbers
72             # IS_RO: boolean: TRUE if this object is read-only
73              
74             # C_TA: cache: tied array for array dereference
75             # C_TH: cache: tide hash for hash dereference
76              
77             sub _rebuild_INDEX {
78 30     30   50 my $self = shift;
79 30         50 my %idx = ();
80              
81 30         89 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  113         261  
82 83         106 push @{$idx{lc $$self->[NAMES][$i]}}, $i;
  83         227  
83             }
84              
85 30         75 $$self->[INDEX] = \%idx;
86             }
87              
88             sub _update_INDEX {
89 75     75   100 my $self = shift; # INDEX slot must be valid
90 75         120 my $base = shift; # row number where an insertion or removal was made
91 75         92 my $count = shift; # how many rows were inserted (+) or removed (-)
92              
93 75         118 my %done = ();
94              
95 75 100       168 for (my $i = ($base < 0) ? 0 : $base; $i < @{$$self->[NAMES]}; $i++) {
  189         460  
96 114         200 my $key = lc $$self->[NAMES][$i];
97 114 100       216 next if $done{$key};
98 72         176 @{$$self->[INDEX]{$key}} =
99 72 100       88 map { $_ + ($_ > $base ? $count : 0) } @{$$self->[INDEX]{$key}};
  118         249  
  72         146  
100 72         174 $done{$key}++;
101             }
102             }
103              
104             sub _rebuild_MVOFF {
105 6     6   17 my $self = shift;
106              
107 6         18 my @mvoff = ();#(undef) x scalar @{$$self->[NAMES]};
108 6         14 my %cidx = (); # counted index; references to unique entries
109              
110 6         12 foreach my $name (@{$$self->[NAMES]}) {
  6         20  
111 10028         12282 my $key = lc $name;
112 10028 100       17835 if (not defined $cidx{$key}) {
    100          
113             # first time this key is seen
114 26         39 push @mvoff, undef;
115 26         68 $cidx{$key} = \$mvoff[$#mvoff];
116             } elsif (ref $cidx{$key} eq 'SCALAR') {
117             # second time this key is seen
118 4         10 push @mvoff, 1;
119 4         9 ${$cidx{$key}} = 0; # replace undefined value
  4         10  
120 4         10 $cidx{$key} = 2; # prepare counter
121             } else {
122             # third or later time this key is seen
123 9998         14268 push @mvoff, $cidx{$key}++;
124             }
125             }
126              
127 6         30 $$self->[MVOFF] = \@mvoff;
128             }
129              
130             sub _dbg_dump {
131 51     51   5429 my $self = shift;
132              
133 51         91 my @mvoff = (' ') x @{$$self->[NAMES]};
  51         2512  
134 20016 100       29449 @mvoff = map { defined $_ ? $_ : 'U' }
135 51 100       176 @{$$self->[MVOFF]}[0 .. $#{$$self->[NAMES]}]
  5         217  
  5         341  
136             if defined $$self->[MVOFF];
137 51         565 my @widths = map {length} qw/ROW MVO NAME/;
  153         275  
138              
139 51         92 foreach my $row (0 .. $#{$$self->[NAMES]}) {
  51         192  
140 20233 100       31361 $widths[0] = length $row if length $row > $widths[0];
141 20233 100       33715 $widths[1] = length $mvoff[$row] if length $mvoff[$row] > $widths[1];
142 20233 100       35795 $widths[2] = length $$self->[NAMES][$row]
143             if length $$self->[NAMES][$row] > $widths[2];
144             }
145              
146 51         308 my $out = sprintf ' %4$*1$s %5$*2$s %6$*3$s %7$s', @widths,
147             qw/ ROW MVO NAME VALUE /;
148 51         201 $out .= "\n".('=' x length $out)."\n";
149             $out .= join "\n", map
150 20233 100       82271 { sprintf ' %4$*1$d %5$*2$s %6$*3$s%7$1s %8$s', @widths,
    100          
151             ($_, $mvoff[$_], $$self->[NAMES][$_],
152             (defined $$self->[VALUES][$_] ? ':' : ' '),
153             (defined $$self->[VALUES][$_] ? $$self->[VALUES][$_] : '*deleted*')) }
154 51         87 0 .. $#{$$self->[NAMES]};
  51         357  
155              
156 51         2446 return $out;
157             }
158              
159             # From RFC2616:
160             # CTL =
161             # (octets 0 - 31) and DEL (127)>
162             # LWS = [CRLF] 1*( SP | HT )
163             # separators = "(" | ")" | "<" | ">" | "@"
164             # | "," | ";" | ":" | "\" | <">
165             # | "/" | "[" | "]" | "?" | "="
166             # | "{" | "}" | SP | HT
167             my $PARSE_RE__LWS = qr/(?:\015\012)?[ \t]+/;
168             my $PARSE_RE__separator = qr[[][)(><}{@,;:/"\\?=[:space:]]];
169             my $PARSE_RE__not_separator = qr[[^][)(><}{@,;:/"\\?=[:space:]]];
170              
171             # From WARC specification:
172             # field-name = token
173             # token = 1*
174             # except CTLs or separators>
175             my $PARSE_RE__token = qr/[!#$%'*+-.0-9A-Z^_`a-z|~]+/;
176              
177             =head1 DESCRIPTION
178              
179             The C class encapsulates information in the
180             "application/warc-fields" format used for WARC record headers. This is a
181             simple key-value format closely analogous to HTTP headers, however
182             differences are significant enough that the C class cannot
183             be reliably reused for WARC fields.
184              
185             Instances of this class are usually created as member variables of the
186             C class, but can also be returned as the content of WARC
187             records with Content-Type "application/warc-fields".
188              
189             Instances of C retrieved from WARC files are read-only and
190             will croak() if any attempt is made to change their contents.
191              
192             This class strives to faithfully represent the contents of a WARC file,
193             while providing a simple interface to answer simple questions.
194              
195             =head2 Multiple Values
196              
197             Most WARC headers may only appear once and with a single value in valid
198             WARC records, with the notable exception of the WARC-Concurrent-To header.
199             C neither attempts to enforce nor relies upon this
200             constraint. Headers that appear multiple times are considered to have
201             multiple values. When iterating a tied hash, all values of a recurring
202             header are collected and returned with the B occurrence of its key.
203              
204             Multiple values are returned from the C method and tied hash
205             interface as array references, and are set by passing in an array
206             reference. Existing rows are reused where possible when updating a field
207             with multiple values. If the new array reference contains fewer items
208             (including the special case of replacing multiple values with a single
209             value) excess rows are deleted. If the new array reference requires
210             additional rows to be inserted, they are inserted immediately after the
211             last existing row for a field, with the same name case as that row.
212              
213             Precise control of the layout is available using the tied array interface,
214             but the ordering of the header rows is not constrained in the WARC
215             specification.
216              
217             =head2 Field Name Mangling
218              
219             As with C, the '_' character is converted to '-' in field
220             names unless the first character of the name is ':', which cannot itself
221             appear in a field name. Unlike C, the leading ':' is
222             stripped off immediately and the name stored otherwise exactly as given.
223             The C method and tied hash interface allow this convenience feature.
224             The field names exposed via the tied array interface are reported
225             B as they appear in the WARC file.
226              
227             Strictly, "X-Crazy-Header" and "X_Crazy_Header" are two B
228             headers that the above convenience mechanism conflates. The solution is
229             simple: if (and only if) a header field B with the B
230             name given, it is used, otherwise C occurs and the name is
231             rechecked for another exact match. If no match is found, case is folded
232             and a third check performed. If a match is found, the existing header is
233             updated, otherwise a new header is created with character case as given.
234              
235             The WARC specification specifically states that field names are
236             case-insensitive, accordingly, "X-Crazy-Header" and "X-CRAZY-HeAdEr" are
237             considered the same header for the C method and tied hash interface.
238             They will appear exactly as given in the tied array interface, however.
239              
240             =cut
241              
242             # This function handles two different canonicalizations:
243             # (1) case folding as required by the WARC specification
244             # (2) convenience translation s/_/-/g,
245             # (2a) suppressed if m/^:/, which is removed
246             # (2b) overridden by an exact match
247             # To make this work:
248             # --- all keys in INDEX are case-folded
249             # --- all keys in NAMES preserve case
250             # --- existing keys are case-folded by this function
251             # --- new keys translate s/_/-/g but preserve case
252             sub _find_key {
253 8608     8608   9249 my $self = shift; # INDEX slot must be valid
254 8608         9029 my $k = shift;
255 8608         8399 my $key; ($key = $k) =~ s/^://;
  8608         11261  
256 8608         9102 my $pad = $key;
257 8608         10196 my $is_quoted = ($k =~ m/^:/);
258              
259             # exact case-folded match?
260 8608 100       21715 return lc $key if defined $$self->[INDEX]{lc $key};
261              
262             # case-folded match after s/_/-/g?
263 3825         6434 $pad =~ s/_/-/g;
264 3825 100 100     8843 return lc $pad if defined $$self->[INDEX]{lc $pad} && !$is_quoted;
265              
266             # not found ==> a new key will be made
267 2836 100       4980 return $is_quoted ? $key : $pad;
268             }
269              
270             # called only if there is no or one current value
271             sub _set_single_value {
272 22     22   37 my $self = shift; # INDEX slot must be valid
273 22         32 my $key = shift; # as returned from _find_key
274 22         35 my $value = shift;
275              
276 22 100       276 croak "attempt to modify read-only object" if $$self->[IS_RO];
277              
278 21 100       86 unless (defined $$self->[INDEX]{lc $key}) {
279             # insert new key
280 12         19 push @{$$self->[NAMES]}, $key; # preserve original key
  12         31  
281 12         29 $key = lc $key; # fold key case
282 12         17 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  12         40  
  12         37  
283             }
284              
285 21         60 $$self->[VALUES][$$self->[INDEX]{$key}[0]] = $value;
286             }
287              
288             sub _key_multiple_value_p {
289 4723     4723   5327 my $self = shift; # INDEX slot must be valid
290 4723         5140 my $key = shift; # as returned from _find_key
291              
292             # For this to be true, the key must already exist, which means that
293             # _find_key has case-folded it already.
294             return (defined $$self->[INDEX]{$key}
295 4723   100     8989 && 1 < scalar @{$$self->[INDEX]{$key}})
296             }
297              
298             # called in all cases where multiple values are involved
299             sub _set_multiple_value {
300 59     59   90 my $self = shift; # INDEX slot must be valid
301 59         78 my $key = shift; # as returned from _find_key
302 59         78 my $value_aref = shift;
303              
304 59 100       354 croak "attempt to modify read-only object" if $$self->[IS_RO];
305              
306             my $cur_count = (defined $$self->[INDEX]{$key}
307 57   100     149 && scalar @{$$self->[INDEX]{$key}});
308 57         103 my $new_count = scalar @$value_aref;
309              
310 57 100       129 unless ($cur_count) {
311             # insert new key
312 16         31 push @{$$self->[NAMES]}, $key; # preserve original key
  16         43  
313 16         38 push @{$$self->[VALUES]}, undef; # prepare slot
  16         35  
314 16         30 $key = lc $key; # fold key case
315 16         24 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  16         56  
  16         39  
316 16         37 $cur_count = 1; # account for the added slot
317             }
318             # $key is always case-folded at this point
319              
320             # adjust table to accommodate new number of values
321 57 100       141 if ($cur_count > $new_count) {
    100          
322             # remove extra rows
323 30         46 foreach my $extra_row (reverse sort
324 30         130 splice @{$$self->[INDEX]{$key}}, $new_count) {
325 43         59 splice @{$$self->[NAMES]}, $extra_row, 1;
  43         82  
326 43         68 splice @{$$self->[VALUES]}, $extra_row, 1;
  43         71  
327 43         88 _update_INDEX($self, $extra_row, -1);
328             }
329             # special case: removing a field entirely
330 30 100       87 if ($new_count == 0) {
331             # This is here to catch a hypothetical bug before data is corrupted.
332             die "stray INDEX entries left after removing field"
333             # uncoverable branch true
334 22 50       30 unless scalar @{$$self->[INDEX]{$key}} == 0;
  22         66  
335 22         63 delete $$self->[INDEX]{$key};
336             }
337             } elsif ($cur_count < $new_count) {
338             # add more rows
339 15         36 my $last_row = $$self->[INDEX]{$key}[-1];
340 15         29 my $new_rows = $new_count - $cur_count;
341 15         44 _update_INDEX($self, $last_row, $new_rows);
342 15         26 splice @{$$self->[NAMES]}, 1+$last_row, 0,
  15         2561  
343             (($$self->[NAMES][$last_row]) x $new_rows);
344 15         29 splice @{$$self->[VALUES]}, 1+$last_row, 0, ((undef) x $new_rows);
  15         587  
345 15         24 push @{$$self->[INDEX]{$key}}, 1+$last_row .. $last_row+$new_rows;
  15         1322  
346             } # otherwise, $cur_count == $new_count
347 57 100       364 $$self->[MVOFF] = undef unless $cur_count == $new_count;
348             # there are always $new_count rows with $key at this point
349              
350 57         146 for (my $i = 0; $i < $new_count; $i++)
351 9968         21228 { $$self->[VALUES][$$self->[INDEX]{$key}[$i]] = $value_aref->[$i] }
352             }
353              
354             =head2 Methods
355              
356             =over
357              
358             =item $f = WARC::Fields-Enew
359              
360             Construct a new C object. Initial contents can be passed as
361             key-value pairs to this constructor and will be added in the given order.
362              
363             Repeating a key or supplying an array reference as a value assigns multiple
364             values to a key. To reduce the risk of confusion, only quoting with a
365             leading ':' overrides the convenience feature of applying C when
366             constructing a C object. The exact match rules used when
367             setting values on an existing object do not apply here.
368              
369             Field names given when constructing a WARC::Fields object are otherwise
370             stored exactly as given, with case preserved, even when other names that
371             fold to the same string have been given earlier in the argument list.
372              
373             =cut
374              
375             sub new {
376 224     224 1 13885 my $class = shift;
377 224         409 my $ob = [OBJECT_INIT];
378 224         365 my $k; my $v;
379              
380             # explicitly initialize NAMES and VALUES to allow as_string and as_block
381             # methods to be called on empty objects
382 224         284 $ob->[NAMES] = [];
383 224         320 $ob->[VALUES] = [];
384              
385 224         523 while (($k, $v) = splice @_, 0, 2) {
386 797 100       1556 croak "key without value" unless defined $v;
387              
388 796 100       1190 if ($k =~ m/^:/) { $k =~ s/^:// } else { $k =~ s/_/-/g }
  7         17  
  789         1540  
389              
390 796 100       1935 croak "reference to field with no name" unless $k =~ m/./;
391 794 100       1964 croak "reference to invalid field name" if $k !~ m/^$PARSE_RE__token$/o;
392              
393 792 100       1115 if (ref $v eq 'ARRAY') {
394 3         8 foreach my $value (@$v) {
395 8         10 push @{$ob->[NAMES]}, $k;
  8         19  
396 8         13 push @{$ob->[VALUES]}, $value;
  8         13  
397 8         12 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  8         15  
  8         23  
398             }
399             } else {
400 789         803 push @{$ob->[NAMES]}, $k;
  789         1262  
401 789         833 push @{$ob->[VALUES]}, $v;
  789         1028  
402 789         785 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  789         1471  
  789         2207  
403             }
404             }
405              
406 219         262 {our $_total_newly_constructed; $_total_newly_constructed++}
  219         223  
  219         231  
407 219         918 bless \ $ob, $class;
408             }
409              
410             =item $f-Eclone
411              
412             Copy a C object. A copy of a read-only object is writable.
413              
414             =cut
415              
416             sub clone {
417 8     8 1 15 my $self = shift;
418 8         18 my $new = [OBJECT_INIT];
419              
420 8         12 $new->[NAMES] = [@{$$self->[NAMES]}];
  8         686  
421 8         15 $new->[VALUES] = [@{$$self->[VALUES]}];
  8         1401  
422 8 100       34 $new->[MVOFF] = [@{$$self->[MVOFF]}] if defined $$self->[MVOFF];
  1         338  
423 30         33 $new->[INDEX] = {map {$_ => [@{$$self->[INDEX]{$_}}]}
  30         68  
424 8 100       25 keys %{$$self->[INDEX]}} if defined $$self->[INDEX];
  7         24  
425              
426 8         16 {our $_total_newly_cloned; $_total_newly_cloned++}
  8         11  
  8         14  
427 8         30 bless \ $new, ref $self;
428             }
429              
430             =item $f-Efield( $name )
431              
432             =item $f-Efield( $name =E $value )
433              
434             =item $f-Efield( $n1 =E $v1, $n2 =E $v2, ... )
435              
436             Get or set the value of one or more fields. The field name is not case
437             sensitive, but C will preserve its case if a new entry is
438             created.
439              
440             Setting a field to C effectively deletes that field, although it
441             remains visible in the tied array interface and will retain its position if
442             a new value is assigned. Setting a field to an empty array reference
443             removes that field entirely.
444              
445             =cut
446              
447             sub field {
448 7565     7565 1 30794 my $self = shift;
449              
450 7565 100       13531 _rebuild_INDEX($self) unless defined $$self->[INDEX];
451              
452 7565         8660 my $k; my $v; my $have_value_arg = scalar @_ > 1;
  7565         9290  
453 7565         14708 while (($k, $v) = splice @_, 0, 2) {
454 7565         10534 my $key = $self->_find_key($k);
455              
456 7565 100       17582 croak "reference to field with no name" unless $key =~ m/./;
457 7563 100       16262 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
458              
459 7560 100       10769 if (not $have_value_arg) {
    100          
    100          
460             # get a value
461 7485 100       16225 return undef unless defined $$self->[INDEX]{$key};
462 4699 100       6893 return $$self->[VALUES][$$self->[INDEX]{$key}[0]]
463             unless $self->_key_multiple_value_p($key);
464 78         286 return [grep {defined $_}
465 25         46 map {$$self->[VALUES][$_]} @{$$self->[INDEX]{$key}}];
  78         163  
  25         69  
466             } # otherwise set a value
467             elsif (UNIVERSAL::isa($v, 'ARRAY'))
468 51         107 { $self->_set_multiple_value($key, $v) }
469             elsif ($self->_key_multiple_value_p($key))
470             # has multiple values, but now only setting a single value
471 2         9 { $self->_set_multiple_value($key, [$v]) }
472             else
473 22         55 { $self->_set_single_value($key, $v) }
474 72         268 $have_value_arg = scalar @_ > 1;
475             }
476 72         243 return (); # return nothing
477             # Note that setting one or more fields and then getting a field is
478             # possible as a side-effect of this organization, but is explicitly NOT
479             # supported. That trick is NOT part of the stable API.
480             }
481              
482             =item $f = WARC::Fields-Eparse( $text )
483              
484             =item $f = WARC::Fields-Eparse( from =E $fh )
485              
486             =item $f = parse WARC::Fields from =E $fh
487              
488             Construct a new C object, reading initial contents from the
489             provided text string or filehandle.
490              
491             The C method throws an exception if it encounters input that it does
492             not understand.
493              
494             If the C method encounters a field name with a leading ':', which
495             implies an empty name and is not allowed, the leading ':' is silently
496             dropped from the line and parsing retried. If the line is not valid after
497             this change, the C method throws an exception. This feature is in
498             keeping with the general principle of "be liberal in what you accept" and
499             is a preemptive workaround for a predicted bug in other implementations.
500              
501             =cut
502              
503             sub parse {
504 642     642 1 1909 my $class = shift;
505 642         771 my $text = shift;
506 642         744 my $rd;
507              
508 642 100       1099 if ($text eq 'from') {
509 637         777 $rd = shift;
510             } else {
511             # This fails iff perl was built without PerlIO, which is non-default.
512             # uncoverable branch true
513 5 50       68 open $rd, '<', \$text or die "failure opening stream on variable: $!";
514             }
515              
516 642         945 my @names = ();
517 642         764 my @values = ();
518 642         808 my %idx = ();
519 642         818 my $at_end = 0;
520              
521 642         1387 local *_;
522 642         1764 while (<$rd>) {
523 4848         44680 s/[\015\012]+$//;
524 4848 100       14376 if (m/^:?($PARSE_RE__token):\s+(.*)$/o) # $1 -- name $2 -- value
    100          
    100          
525 4206         8748 { push @names, $1; push @values, $2; push @{$idx{lc $1}}, $#names }
  4206         6646  
  4206         4229  
  4206         15809  
526             elsif (m/^\s+(\S.*)$/) # $1 -- continued value
527 1         6 { $values[$#values] .= ' '.$1 }
528 639         921 elsif (m/^$/) { $at_end = 1; last }
  639         967  
529 2         365 else { croak "unrecognized input: $_" }
530             }
531              
532 640 100       1280 carp "end-of-input before end marker" unless $at_end;
533              
534 640         1300 @values = map {Encode::decode_utf8($_)} @values;
  4206         24834  
535              
536 640         3893 my $ob = [OBJECT_INIT];
537 640         1036 $ob->[NAMES] = \@names;
538 640         824 $ob->[VALUES] = \@values;
539 640         827 $ob->[INDEX] = \%idx;
540              
541 640         753 {our $_total_newly_parsed; $_total_newly_parsed++}
  640         684  
  640         777  
542 640         2349 bless \ $ob, $class;
543             }
544              
545             =item $f-Eas_block
546              
547             =item $f-Eas_string
548              
549             Return the contents as a formatted WARC header or application/warc-fields
550             block. The C method uses network line endings and UTF-8 as
551             specified for the WARC format, while the C method uses the local
552             line endings and does not perform encoding.
553              
554             =cut
555              
556             sub _as_text {
557 51     51   85 my $self = shift;
558 51         67 my $newline = shift;
559 51         74 my $out = '';
560              
561 51         82 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  267         615  
562 216 100       389 next unless defined $$self->[VALUES][$i];
563 209         435 $out .= $$self->[NAMES][$i] . ': ' . $$self->[VALUES][$i] . $newline;
564             }
565              
566 51         226 return $out;
567             }
568              
569 4     4 1 16 sub as_block { Encode::encode('UTF-8', _as_text(shift, WARC::CRLF)) }
570 47     47 1 4681 sub as_string { _as_text(shift, "\n") }
571              
572             =item $f-Eset_readonly
573              
574             Mark a C object read-only. All methods that modify the
575             object will croak() if called on a read-only object.
576              
577             =cut
578              
579             sub set_readonly {
580 766     766 1 1110 my $self = shift;
581              
582 766         1841 $$self->[IS_RO] = 1;
583             }
584              
585             =back
586              
587             =head2 Tied Array Access
588              
589             The order of fields can be fully controlled by tying an array to a
590             C object and manipulating the array using ordinary Perl
591             operations. The C and C functions are likely to be useful
592             for reordering array elements if desired.
593              
594             C will croak() if an attempt is made to set a field name with
595             a leading ':' using the tied array interface.
596              
597             =cut
598              
599             sub TIEARRAY {
600 4     4   17 my $class = shift;
601 4         9 my $ob = shift;
602              
603             # This method must ignore the given class to allow the "empty subclass"
604             # test to pass. If a subclass really wants, an override for TIEARRAY
605             # itself can call SUPER::TIEARRAY and re-bless the returned reference
606             # into the desired class.
607 4         9 $WARC::Fields::TiedArray::_total_tied++;
608 4         27 bless \ $ob, 'WARC::Fields::TiedArray';
609             }
610              
611             {
612             package WARC::Fields::TiedArray::Entry;
613              
614 28     28   283 use Carp;
  28         62  
  28         2523  
615              
616             BEGIN { $WARC::Fields::TiedArray::Entry::{$_} = $WARC::Fields::{$_}
617 28     28   1200 for WARC::Fields::OBJECT_INDEX; }
618              
619 28     28   212 use constant { NAME => 0, VALUE => 1, TABLE => 2, ROW => 3 };
  28         52  
  28         2473  
620              
621 28     28   186 use overload '""' => 'name', fallback => 1;
  28         67  
  28         204  
622              
623             =pod
624              
625             The tied array interface accepts simple string values but returns objects
626             with additional information. The returned object has an overloaded string
627             conversion that yields the name for that entry but additionally has
628             C and C methods.
629              
630             An entry object is bound to a slot in its parent C object,
631             but will be copied if it is assigned to another slot in the same or another
632             C object.
633              
634             Due to complex aliasing rules necessary for array slice assignment to work
635             for permuting rows in the table, entry objects must be short-lived.
636             Storing the object read from a tied array and attempting to use it after
637             modifying its parent C object produces unspecified results.
638              
639             =over
640              
641             =item $entry = $array[$n]
642              
643             =item $entry = $f-E[$n]
644              
645             The tied array C method returns a "entry object" instead of the name
646             itself.
647              
648             =cut
649              
650             sub _new {
651 223     223   305 my $class = shift;
652 223         308 my $table = shift;
653 223         266 my $row = shift;
654              
655 223         989 bless [$$table->[NAMES][$row], $$table->[VALUES][$row],
656             $table, $row], $class;
657             }
658              
659             =item $name = "$entry"
660              
661             =item $name = $entry-Ename
662              
663             =item $name = "$f-E[$n]"
664              
665             =item $name = $f-E[$n]-Ename
666              
667             The C method on a entry object returns the field name.
668             String conversion is overloaded to call this method.
669              
670             =cut
671              
672 62     62   10122 sub name { (shift)->[NAME] }
673              
674             =item $value = $entry-Evalue
675              
676             =item $value = $array[$n]-Evalue
677              
678             =item $value = $f-E[$n]-Evalue
679              
680             =item $entry-Evalue( $new_value )
681              
682             =item $array[$n]-Evalue( $new_value )
683              
684             =item $f-E[$n]-Evalue( $new_value )
685              
686             The C method on a entry object returns the field value for this
687             particular entry. Only a single scalar is returned, even if multiple
688             entries share the same name.
689              
690             If given an argument, the C method replaces the value for this
691             particular entry. The argument will be coerced to a string.
692              
693             =cut
694              
695             sub value {
696 154     154   219 my $self = shift;
697              
698 154 100       313 if (scalar @_ == 0) { # get this value
699 34         148 return $self->[VALUE];
700             } else { # update this value
701 120 100       142 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  120         430  
702 119         171 my $newval = shift;
703 119         194 ${$self->[TABLE]}->[VALUES]->[$self->[ROW]] = $self->[VALUE] = "$newval";
  119         227  
704 119         293 return (); # and return nothing
705             }
706             }
707              
708             =item $offset = $entry-Eoffset
709              
710             =item $offset = $array[$n]-Eoffset
711              
712             =item $offset = $f-E[$n]-Eoffset
713              
714             The C method on a entry object returns the position of this entry
715             amongst multiple entries with the same field name. These positions are
716             numbered from zero and are identical to the positions in the array
717             reference returned for this entry's field name from the C method or
718             the tied hash interface.
719              
720             =cut
721              
722             sub offset {
723 21     21   36 my $self = shift;
724 21 100       31 $self->[TABLE]->_rebuild_MVOFF unless defined ${$self->[TABLE]}->[MVOFF];
  21         90  
725 21         35 return ${$self->[TABLE]}->[MVOFF]->[$self->[ROW]];
  21         120  
726             }
727              
728             =back
729              
730             =cut
731              
732             }
733              
734             {
735             package WARC::Fields::TiedArray::LooseEntry;
736              
737 28     28   9569 use Carp;
  28         73  
  28         2304  
738              
739             BEGIN { $WARC::Fields::TiedArray::LooseEntry::{$_} = $WARC::Fields::{$_}
740 28     28   934 for WARC::Fields::OBJECT_INDEX; }
741              
742 28     28   203 use constant { NAME => 0, VALUE => 1 };
  28         62  
  28         2063  
743              
744 28     28   4854 BEGIN { our @ISA = qw(WARC::Fields::TiedArray::Entry) }
745              
746             # This is a special type of "entry object" that is not associated with a
747             # table row, returned from POP, SHIFT, and SPLICE when needed.
748              
749             sub _new {
750 24     24   400 my $class = shift;
751 24         35 my $name = shift;
752 24         36 my $value = shift;
753              
754 24         79 bless [$name, $value], $class;
755             }
756              
757 18     18   870 sub name { return (shift)->[NAME] }
758              
759             sub value {
760 17     17   29 my $self = shift;
761              
762 17 100       38 if (scalar @_ == 0) # get
763 16         40 { return $self->[VALUE] }
764             else # set
765 1         121 { croak "Loose array entries are read-only." }
766             }
767              
768 1     1   7 sub offset { return undef }
769             }
770              
771             {
772             package WARC::Fields::TiedArray;
773              
774 28     28   569 use Carp;
  28         83  
  28         2171  
775              
776             BEGIN { $WARC::Fields::TiedArray::{$_} = $WARC::Fields::{$_}
777 28     28   43962 for WARC::Fields::OBJECT_INDEX; }
778              
779             # The underlying object is a reference to a WARC::Fields object.
780              
781             sub FETCH {
782 223     223   9068 my $self = shift;
783 223         271 my $row = shift;
784 223         712 return (ref($self).'::Entry')->_new($$self, $row);
785             }
786              
787             sub STORE {
788 13     13   432 my $self = shift;
789 13         22 my $row = shift;
790 13         19 my $name = shift;
791              
792 13 100       151 croak "attempt to modify read-only object" if $$$self->[IS_RO];
793              
794 12 100       15 $self->STORESIZE($row + 1) if $#{$$$self->[NAMES]} < $row;
  12         41  
795              
796 12 100       61 if (UNIVERSAL::isa($name, ref($self).'::Entry')) {
797             # copy entry
798 8 100       20 croak "attempt to set invalid name"
799             if $name->name !~ m/^$PARSE_RE__token$/o;
800 7         19 $$$self->[NAMES]->[$row] = $name->name;
801 7         16 $$$self->[VALUES]->[$row] = $name->value;
802             } else {
803             # set name
804 4 100       390 croak "attempt to set invalid name"
805             if "$name" !~ m/^$PARSE_RE__token$/o;
806 2         10 $$$self->[NAMES]->[$row] = "$name";
807             }
808 9         16 $$$self->[MVOFF] = undef;
809 9         38 $$$self->[INDEX] = undef;
810             }
811              
812             sub FETCHSIZE {
813 38     38   4877 my $self = shift;
814 38         57 return scalar @{$$$self->[NAMES]};
  38         307  
815             }
816              
817             sub STORESIZE {
818 5     5   13 my $self = shift;
819 5         9 my $count = shift;
820              
821 5 100 100     28 croak "attempt to modify read-only object"
822             if $$$self->[IS_RO] && $count != $self->FETCHSIZE();
823              
824 4 100       14 if ($count > $self->FETCHSIZE()) {
    100          
825 2         7 my $needed = $count - $self->FETCHSIZE();
826 2         4 push @{$$$self->[NAMES]}, ('X-Undefined-Field-Name') x $needed;
  2         9  
827 2         3 push @{$$$self->[VALUES]}, (undef) x $needed;
  2         10  
828             } elsif ($count < $self->FETCHSIZE()) {
829 1         3 splice @{$$$self->[NAMES]}, $count;
  1         3  
830 1         3 splice @{$$$self->[VALUES]}, $count;
  1         3  
831 1         6 $$$self->[INDEX] = undef;
832 1         4 } else { return } # no actual change
833             }
834              
835       0     sub EXTEND {
836             # do nothing
837             }
838              
839             sub EXISTS {
840 14     14   1992 my $self = shift;
841 14         22 my $row = shift;
842 14         86 return defined $$$self->[VALUES]->[$row];
843             }
844              
845             sub DELETE {
846 2     2   6 my $self = shift;
847 2         6 my $row = shift;
848              
849 2 100       119 croak "attempt to modify read-only object" if $$$self->[IS_RO];
850              
851 1         3 my $old_value = $$$self->[VALUES]->[$row];
852 1         4 $$$self->[VALUES]->[$row] = undef;
853 1         3 $$$self->[MVOFF] = undef;
854 1         6 $$$self->[INDEX] = undef;
855 1         4 return $old_value;
856             }
857              
858             sub CLEAR {
859 3     3   11 my $self = shift;
860              
861 3 100       120 croak "attempt to modify read-only object" if $$$self->[IS_RO];
862              
863 2         9 $$$self->[NAMES] = [];
864 2         6 $$$self->[VALUES] = [];
865 2         6 $$$self->[MVOFF] = undef;
866 2         7 $$$self->[INDEX] = undef;
867 2         7 return undef;
868             }
869              
870             sub PUSH {
871 7     7   755 my $self = shift;
872              
873 7 100 100     145 croak "attempt to modify read-only object"
874             if $$$self->[IS_RO] && scalar @_;
875              
876 6         19 foreach my $item (@_) {
877 103         143 my $name; my $value;
878 103 100       330 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
879 2         7 { $name = $item->name; $value = $item->value }
  2         7  
880             else
881 101         152 { $name = "$item"; $value = undef }
  101         117  
882 103 100       603 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
883 101         167 push @{$$$self->[NAMES]}, $name;
  101         558  
884 101         127 push @{$$$self->[VALUES]}, $value;
  101         311  
885             }
886 4         10 $$$self->[MVOFF] = undef;
887 4         123 $$$self->[INDEX] = undef;
888 4         8 return scalar @{$$$self->[NAMES]};
  4         18  
889             }
890              
891             sub POP {
892 5     5   10 my $self = shift;
893              
894 5 100       134 croak "attempt to modify read-only object" if $$$self->[IS_RO];
895              
896             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
897 4         9 (pop @{$$$self->[NAMES]}, pop @{$$$self->[VALUES]});
  4         9  
  4         16  
898 4 100       16 pop @{$$$self->[MVOFF]} if defined $$$self->[MVOFF];
  2         5  
899 4         8 $$$self->[INDEX] = undef;
900              
901 4         14 return $ret;
902             }
903              
904             sub SHIFT {
905 2     2   5 my $self = shift;
906              
907 2 100       121 croak "attempt to modify read-only object" if $$$self->[IS_RO];
908              
909             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
910 1         3 (shift @{$$$self->[NAMES]}, shift @{$$$self->[VALUES]});
  1         4  
  1         5  
911 1         3 $$$self->[MVOFF] = undef;
912 1         5 $$$self->[INDEX] = undef;
913              
914 1         4 return $ret;
915             }
916              
917             sub UNSHIFT {
918 6     6   732 my $self = shift;
919              
920 6 100 100     135 croak "attempt to modify read-only object"
921             if $$$self->[IS_RO] && scalar @_;
922              
923 5         13 foreach my $item (@_) {
924 4         9 my $name; my $value;
925 4 100       23 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
926 2         5 { $name = $item->name; $value = $item->value }
  2         7  
927             else
928 2         4 { $name = "$item"; $value = undef }
  2         4  
929 4 100       295 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
930 2         5 unshift @{$$$self->[NAMES]}, $name;
  2         7  
931 2         5 unshift @{$$$self->[VALUES]}, $value;
  2         7  
932             }
933 3         7 $$$self->[MVOFF] = undef;
934 3         9 $$$self->[INDEX] = undef;
935 3         5 return scalar @{$$$self->[NAMES]};
  3         12  
936             }
937              
938             sub SPLICE {
939 10     10   757 my $self = shift;
940 10         17 my $offset = shift;
941 10         19 my $length = shift;
942              
943 10 100       24 $offset = 0 unless defined $offset;
944 10 100       38 $length = $self->FETCHSIZE() - $offset unless defined $length;
945              
946 10 100 100     49 return () unless ($length != 0 || scalar @_);
947              
948 9 100       246 croak "attempt to modify read-only object" if $$$self->[IS_RO];
949              
950 7         13 my @new_names = (); my @new_values = (); my @old_names; my @old_values;
  7         13  
  7         12  
951              
952 7         18 foreach my $item (@_) {
953 17 100       63 if (UNIVERSAL::isa($item, ref($self).'::Entry')) {
954 11         25 push @new_names, $item->name;
955 11         23 push @new_values, $item->value;
956             } else {
957 6         15 push @new_names, "$item";
958 6         13 push @new_values, undef;
959             }
960             }
961              
962             croak "attempt to set invalid name"
963 7 100       16 if grep { $_ !~ m/^$PARSE_RE__token$/o } @new_names;
  17         329  
964              
965 5         12 @old_names = splice @{$$$self->[NAMES]}, $offset, $length, @new_names;
  5         23  
966 5         10 @old_values = splice @{$$$self->[VALUES]}, $offset, $length, @new_values;
  5         16  
967              
968 5         9 my @ret = ();
969              
970 5         18 for (my $i = 0; $i < scalar @old_names; $i++)
971 18         41 { push @ret, WARC::Fields::TiedArray::LooseEntry->_new
972             ($old_names[$i], $old_values[$i]) }
973              
974 5         12 $$$self->[MVOFF] = undef;
975 5         14 $$$self->[INDEX] = undef;
976 5         27 return @ret;
977             }
978              
979 4     4   13 sub UNTIE { our $_total_untied; $_total_untied++ }
  4         21  
980              
981 4     4   7 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  4         18  
982             }
983              
984             =head2 Tied Hash Access
985              
986             The contents of a C object can be easily examined by tying a
987             hash to the object. Reading or setting a hash key is equivalent to the
988             C method, but the tied hash will iterate keys and values in the
989             order in which each key B appears in the internal table.
990              
991             Like the tied array interface, the tied hash interface returns magical
992             objects that internally refer back to the parent C object.
993             These objects remain valid if the underlying C object is
994             changed, but further use may produce surprising and unspecified results.
995              
996             The use of magical objects enables the values in a tied hash to B
997             be arrays, even for keys that do not exist (the array will have zero
998             elements) or that have only one value (the array will have a string
999             conversion that produces that one value). This allows a tied hash to
1000             support autovivification of an array value just as Perl's own hashes do.
1001              
1002             =cut
1003              
1004             sub TIEHASH {
1005 70     70   114 my $class = shift;
1006 70         97 my $ob = shift;
1007              
1008             # This method must ignore the given class to allow the "empty subclass"
1009             # test to pass. If a subclass really wants, an override for TIEHASH
1010             # itself can call SUPER::TIEHASH and re-bless the returned reference
1011             # into the desired class.
1012 70         87 $WARC::Fields::TiedHash::_total_tied++;
1013 70         338 bless \ $ob, 'WARC::Fields::TiedHash';
1014             }
1015              
1016             {
1017             package WARC::Fields::TiedHash::ValueArray;
1018              
1019 28     28   247 use Carp;
  28         68  
  28         2224  
1020              
1021             BEGIN { $WARC::Fields::TiedHash::ValueArray::{$_} = $WARC::Fields::{$_}
1022 28     28   1074 for WARC::Fields::OBJECT_INDEX; }
1023              
1024 28     28   170 use constant { TABLE => 0, KEY => 1, KEYc => 2 }; # KEYc -- canonical KEY
  28         71  
  28         53991  
1025              
1026             sub TIEARRAY {
1027 458     458   565 my $class = shift;
1028 458         519 my $table = shift;
1029 458 100       960 $table->_rebuild_INDEX unless defined $$table->[INDEX];
1030 458         716 my $key = $table->_find_key(shift); # needs INDEX
1031              
1032 458         636 { our $_total_tied; $_total_tied++ }
  458         496  
  458         548  
1033 458         1325 bless [$table, $key, lc $key], $class;
1034             }
1035              
1036             sub FETCH {
1037 764     764   1370 my $self = shift;
1038 764         759 my $offset = shift;
1039              
1040 764 100       778 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  764         1381  
1041              
1042 764         838 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  764         1171  
1043 764 100       1044 return defined $row ? ${$self->[TABLE]}->[VALUES][$row] : undef;
  748         2868  
1044             }
1045              
1046             sub STORE {
1047 10     10   556 my $self = shift;
1048 10         17 my $offset = shift;
1049 10         15 my $value = shift;
1050              
1051 10         17 my $T = $self->[TABLE];
1052              
1053 10 100       143 croak "attempt to modify read-only object" if $$T->[IS_RO];
1054              
1055 9 100       26 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1056              
1057             $self->STORESIZE($offset + 1)
1058             if not defined $$T->[INDEX]{$self->[KEYc]}
1059 9 100 100     49 or $#{$$T->[INDEX]{$self->[KEYc]}} < $offset;
  8         33  
1060              
1061 9         47 $$T->[VALUES][$$T->[INDEX]{$self->[KEYc]}[$offset]] = "$value";
1062             }
1063              
1064             sub FETCHSIZE {
1065 788     788   1540 my $self = shift;
1066              
1067 788 100       789 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  788         1559  
1068              
1069 778         808 return scalar @{${$self->[TABLE]}->[INDEX]{$self->[KEYc]}}
  778         2536  
1070 788 100       904 if defined ${$self->[TABLE]}->[INDEX]{$self->[KEYc]};
  788         1516  
1071 10         76 return 0; # otherwise: key does not exist
1072             }
1073              
1074             sub STORESIZE {
1075 8     8   14 my $self = shift;
1076 8         15 my $count = shift;
1077              
1078 8         13 my $T = $self->[TABLE];
1079              
1080 8 100 100     31 croak "attempt to modify read-only object"
1081             if $$T->[IS_RO] && $count != $self->FETCHSIZE();
1082              
1083 7 100       23 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1084              
1085 7         17 my @new = ();
1086 6         23 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  6         14  
1087 7 100       18 if defined $$T->[INDEX]{$self->[KEYc]};
1088 7 100       18 if ($count > $self->FETCHSIZE())
    100          
1089 4         15 { push @new, (undef) x ($count - $self->FETCHSIZE()) }
1090             elsif ($count < $self->FETCHSIZE())
1091 2         13 { @new = @new[0..($count-1)] }
1092 1         5 else { return } # no actual change
1093 6         21 $T->field($self->[KEY] => \@new);
1094             }
1095              
1096       0     sub EXTEND {
1097             # do nothing
1098             }
1099              
1100             sub EXISTS {
1101 5     5   541 my $self = shift;
1102 5         11 my $offset = shift;
1103              
1104 5 100       8 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  5         24  
1105              
1106 5         7 return exists ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  5         35  
1107             }
1108              
1109             sub DELETE {
1110 4     4   538 my $self = shift;
1111 4         9 my $offset = shift;
1112              
1113 4 100       6 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  4         123  
1114              
1115 3 100       6 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  3         14  
1116              
1117 3         6 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  3         11  
1118 3         7 my $old_value = ${$self->[TABLE]}->[VALUES][$row];
  3         7  
1119 3         6 ${$self->[TABLE]}->[VALUES][$row] = undef;
  3         6  
1120 3         13 return $old_value;
1121             }
1122              
1123             sub CLEAR {
1124 2     2   7 my $self = shift;
1125              
1126 2 100       4 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  2         117  
1127              
1128 1         58 $self->[TABLE]->field($self->[KEY] => []);
1129             }
1130              
1131             sub PUSH {
1132 11     11   23 my $self = shift;
1133              
1134 11         17 my $T = $self->[TABLE];
1135              
1136 11 100 100     168 croak "attempt to modify read-only object"
1137             if $$T->[IS_RO] && scalar @_;
1138              
1139 10 100       30 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1140              
1141 10 100       31 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1142             # key exists ==> extend table efficiently
1143 5         14 my $last_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1144 5         11 my $new_rows = scalar @_;
1145 5         8 splice @{$$T->[NAMES]}, 1+$last_row, 0,
  5         21  
1146             (($$T->[NAMES][$last_row]) x $new_rows);
1147 5         10 splice @{$$T->[VALUES]}, 1+$last_row, 0, map {"$_"} @_;
  5         17  
  4         15  
1148 5         21 $T->_update_INDEX($last_row, $new_rows);
1149 5         9 push @{$$T->[INDEX]{$self->[KEYc]}}, 1+$last_row .. $last_row+$new_rows;
  5         18  
1150             } else {
1151             # make key ==> use existing setter
1152 5         31 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  9909         19624  
1153             }
1154 10         1389 $$T->[MVOFF] = undef;
1155             }
1156              
1157             sub POP {
1158 7     7   563 my $self = shift;
1159              
1160 7         13 my $T = $self->[TABLE];
1161              
1162 7 100       132 croak "attempt to modify read-only object" if $$T->[IS_RO];
1163              
1164 6 100       17 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1165              
1166 6 100       27 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1167              
1168 5         13 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1169 5         10 my $value = $$T->[VALUES][$rem_row];
1170              
1171 5         6 splice @{$$T->[NAMES]}, $rem_row, 1;
  5         14  
1172 5         9 splice @{$$T->[VALUES]}, $rem_row, 1;
  5         11  
1173 5 100       14 splice @{$$T->[MVOFF]}, $rem_row, 1 if defined $$T->[MVOFF];
  1         4  
1174 5         14 $T->_update_INDEX($rem_row, -1);
1175 5         8 pop @{$$T->[INDEX]{$self->[KEYc]}};
  5         12  
1176             # special case: popped last value
1177             delete $$T->[INDEX]{$self->[KEYc]}
1178 5 100       8 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  5         19  
1179              
1180 5         29 return $value;
1181             }
1182              
1183             sub SHIFT {
1184 6     6   534 my $self = shift;
1185              
1186 6         14 my $T = $self->[TABLE];
1187              
1188 6 100       133 croak "attempt to modify read-only object" if $$T->[IS_RO];
1189              
1190 5 100       16 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1191              
1192 5 100       20 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1193              
1194 4         9 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[0];
1195 4         9 my $value = $$T->[VALUES][$rem_row];
1196              
1197 4         5 splice @{$$T->[NAMES]}, $rem_row, 1;
  4         12  
1198 4         6 splice @{$$T->[VALUES]}, $rem_row, 1;
  4         12  
1199 4         8 $$T->[MVOFF] = undef;
1200 4         11 $T->_update_INDEX($rem_row, -1);
1201 4         8 shift @{$$T->[INDEX]{$self->[KEYc]}};
  4         10  
1202             # special case: shifted last value
1203             delete $$T->[INDEX]{$self->[KEYc]}
1204 4 100       7 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  4         15  
1205              
1206 4         27 return $value;
1207             }
1208              
1209             sub UNSHIFT {
1210 5     5   13 my $self = shift;
1211              
1212 5         11 my $T = $self->[TABLE];
1213              
1214 5 100 100     142 croak "attempt to modify read-only object"
1215             if $$T->[IS_RO] && scalar @_;
1216              
1217 4 100       14 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1218              
1219 4 100       19 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1220             # key exists ==> extend table efficiently
1221 3         9 my $first_row = $$T->[INDEX]{$self->[KEYc]}[0];
1222 3         7 my $new_rows = scalar @_;
1223 3         6 splice @{$$T->[NAMES]}, $first_row, 0,
  3         14  
1224             (($$T->[NAMES][$first_row]) x $new_rows);
1225 3         6 splice @{$$T->[VALUES]}, $first_row, 0, map {"$_"} @_;
  3         11  
  2         9  
1226 3         13 $T->_update_INDEX($first_row - 1, $new_rows);
1227 3         5 unshift @{$$T->[INDEX]{$self->[KEYc]}},
  3         13  
1228             $first_row .. $first_row-1+$new_rows;
1229             } else {
1230             # make key ==> use existing setter
1231 1         4 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  4         12  
1232             }
1233 4         16 $$T->[MVOFF] = undef;
1234             }
1235              
1236             sub SPLICE {
1237 9     9   18 my $self = shift;
1238 9         13 my $offset = shift;
1239 9         13 my $length = shift;
1240              
1241 9 100       21 $offset = 0 unless defined $offset;
1242 9 100       20 $length = $self->FETCHSIZE() - $offset unless defined $length;
1243              
1244 9 100 100     37 return () unless ($length != 0 || scalar @_);
1245              
1246 8         15 my $T = $self->[TABLE];
1247              
1248 8 100       239 croak "attempt to modify read-only object" if $$T->[IS_RO];
1249              
1250 6 100       15 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1251              
1252 6         10 my @new = ();
1253 5         19 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  5         12  
1254 6 100       18 if defined $$T->[INDEX]{$self->[KEYc]};
1255 6         18 my @old = splice @new, $offset, $length, map {"$_"} @_;
  3         12  
1256 6         22 $self->[TABLE]->field($self->[KEY] => \@new);
1257              
1258 6         44 return @old;
1259             }
1260              
1261 458     458   502 sub UNTIE { our $_total_untied; $_total_untied++ }
  458         778  
1262              
1263 458     458   468 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  458         1340  
1264             }
1265              
1266             {
1267             package WARC::Fields::TiedHash::Value;
1268              
1269             # This package is a magical array that appears to be a string if it has
1270             # only one value, otherwise string conversion gives the usual
1271             # nearly-useless debugging value.
1272              
1273             # The actual underlying array is a tied array that forwards mutating
1274             # operations to the original WARC::Fields object.
1275              
1276 28     28   267 use overload '""' => '_as_string', fallback => 1;
  28         173  
  28         201  
1277              
1278 28     28   2628 use Scalar::Util qw/refaddr reftype/;
  28         148  
  28         5523  
1279              
1280             sub _new {
1281 458     458   678 my $class = shift;
1282 458         510 my $parent = shift;
1283 458         554 my $key = shift;
1284              
1285 458         506 my @values;
1286 458         1285 tie @values, (ref($parent).'::ValueArray'), $$parent, $key;
1287              
1288 458         2307 bless \@values, $class;
1289             }
1290              
1291             sub _as_string {
1292 676     676   10414 my $self = shift;
1293              
1294 676 100       1542 return scalar @$self == 1
1295             ? $self->[0] : sprintf ('%s(0x%x)', reftype $self, refaddr $self);
1296             }
1297              
1298 458     458   2853 sub DESTROY { untie @{(shift)} }
  458         864  
1299             }
1300              
1301             {
1302             package WARC::Fields::TiedHash;
1303              
1304 28     28   203 use Carp;
  28         102  
  28         2243  
1305              
1306             BEGIN { $WARC::Fields::TiedHash::{$_} = $WARC::Fields::{$_}
1307 28     28   22756 for WARC::Fields::OBJECT_INDEX; }
1308              
1309             # The underlying object is a reference to a WARC::Fields object.
1310              
1311             sub FETCH {
1312 461     461   1804 my $self = shift;
1313 461         548 my $key = shift;
1314 461 100       2276 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
1315 458         1335 return (ref($self).'::Value')->_new($self, $key);
1316             }
1317              
1318             sub STORE {
1319 11     11   436 my $self = shift;
1320 11         21 my $key = shift;
1321 11         15 my $value = shift;
1322              
1323 11 100       258 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1324              
1325 9         28 $$self->field($key => $value);
1326             }
1327              
1328             sub DELETE {
1329 6     6   12 my $self = shift;
1330 6         12 my $key = shift;
1331              
1332 6 100       131 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1333              
1334 5         17 $$self->field($key => []);
1335             }
1336              
1337             sub CLEAR {
1338 2     2   6 my $self = shift;
1339              
1340 2 100       116 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1341              
1342 1         3 $$$self->[NAMES] = [];
1343 1         4 $$$self->[VALUES] = [];
1344 1         3 $$$self->[MVOFF] = undef;
1345 1         3 $$$self->[INDEX] = undef;
1346 1         3 return undef;
1347             }
1348              
1349             sub EXISTS {
1350 15     15   34 my $self = shift;
1351 15         29 my $key = shift;
1352              
1353 15 100       49 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1354 15         41 return exists $$$self->[INDEX]->{$$self->_find_key($key)};
1355             }
1356              
1357             sub FIRSTKEY {
1358 64     64   84 my $self = shift;
1359              
1360 64         207 return $$$self->[NAMES][0];
1361             }
1362              
1363             sub NEXTKEY {
1364 313     313   394 my $self = shift;
1365 313         335 my $from_key = shift;
1366              
1367 313 100       523 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1368              
1369 313         315 my $i;
1370 313   100     459 for ($i = $$$self->[INDEX]{$$self->_find_key($from_key)}[0] + 1;
1371             defined $$$self->[NAMES][$i] and
1372             $i != $$$self->[INDEX]{$$self->_find_key($$$self->[NAMES][$i])}[0];
1373             $i++) {}
1374 313         914 return $$$self->[NAMES][$i];
1375             }
1376              
1377             sub SCALAR {
1378 3     3   11 my $self = shift;
1379 3         5 return scalar @{$$$self->[NAMES]};
  3         19  
1380             }
1381              
1382 70     70   92 sub UNTIE { our $_total_untied; $_total_untied++ }
  70         150  
1383              
1384 70     70   78 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  70         124  
1385             }
1386              
1387             =head2 Overloaded Dereference Operators
1388              
1389             The C class provides overloaded dereference operators for
1390             array and hash dereferencing. The overloaded operators provide an
1391             anonymous tied array or hash as needed, allowing the object itself to be
1392             used as a reference to its tied array and hash interfaces. There is a
1393             caveat, however, so read on.
1394              
1395             =cut
1396              
1397             sub _as_tied_array {
1398             # To avoid confusing bugs due to typos producing overloaded dereferences
1399             # instead of intended accesses to the internal object, this feature
1400             # cannot be used from within this module.
1401 168 100   168   7589 if (scalar caller =~ m/^WARC::Fields/) {
1402 1         5 local $Carp::CarpLevel = 1;
1403 1         191 confess "overloaded array dereference in internal code"
1404             }
1405              
1406 167         254 my $self = shift;
1407              
1408 167 100       721 return $$self->[C_TA] if defined $$self->[C_TA];
1409              
1410 3         5 my @array; $$self->[C_TA] = \@array;
  3         16  
1411 3         7 Scalar::Util::weaken ${tie @array, ref $self, $self};
  3         15  
1412 3         16 return $$self->[C_TA];
1413             }
1414              
1415             sub _as_tied_hash {
1416             # To avoid confusing bugs due to typos producing overloaded dereferences
1417             # instead of intended accesses to the internal object, this feature
1418             # cannot be used from within this module.
1419 272 100   272   8052 if (scalar caller =~ m/^WARC::Fields/) {
1420 1         4 local $Carp::CarpLevel = 1;
1421 1         73 confess "overloaded hash dereference in internal code"
1422             }
1423              
1424 271         363 my $self = shift;
1425              
1426 271 100       1123 return $$self->[C_TH] if defined $$self->[C_TH];
1427              
1428 63         82 my %hash; $$self->[C_TH] = \%hash;
  63         101  
1429 63         81 Scalar::Util::weaken ${tie %hash, ref $self, $self};
  63         212  
1430 63         254 return $$self->[C_TH];
1431             }
1432              
1433             =head3 Reference Count Trickery with Overloaded Dereference Operators
1434              
1435             To avoid problems, the underlying tied object is a reference to the parent
1436             object. For ordinary use of C, this is a strong reference, however,
1437             the anonymous tied array and hash are cached in the object to avoid having
1438             to C a new object every time the dereference operators are used.
1439              
1440             To prevent memory leaks due to circular references, the overloaded
1441             dereference operators tie a I reference to the parent object. The
1442             tied aggregate always holds a strong reference to its object, but when the
1443             dereference operators are used, that inner object is a I reference to
1444             the actual C object.
1445              
1446             The caveat is thus: do not attempt to save a reference to the array or hash
1447             produced by dereferencing a C object. The parent
1448             C object must remain in scope for as long as any anonymous
1449             tied aggregates exist.
1450              
1451             =cut
1452              
1453             1;
1454             __END__