File Coverage

blib/lib/WARC/Fields.pm
Criterion Covered Total %
statement 720 720 100.0
branch 250 252 100.0
condition 32 32 100.0
subroutine 99 101 98.0
pod 7 7 100.0
total 1108 1112 99.8


line stmt bran cond sub pod time code
1             package WARC::Fields; # -*- CPerl -*-
2              
3 26     26   68687 use strict;
  26         61  
  26         776  
4 26     26   127 use warnings;
  26         48  
  26         619  
5              
6 26     26   122 use Carp;
  26         62  
  26         1311  
7 26     26   13732 use Encode;
  26         235637  
  26         1756  
8 26     26   191 use Scalar::Util;
  26         52  
  26         1198  
9              
10             our @ISA = qw();
11              
12 26     26   506 use WARC; *WARC::Fields::VERSION = \$WARC::VERSION;
  26         55  
  26         1310  
13              
14             =head1 NAME
15              
16             WARC::Fields - WARC record headers and application/warc-fields
17              
18             =head1 SYNOPSIS
19              
20             require WARC::Fields;
21              
22             $f = new WARC::Fields;
23             $f = $record->fields; # get WARC record headers
24             $g = $f->clone; # make writable copy
25              
26             $g->set_readonly; # make read-only
27              
28             $f->field('WARC-Type' => 'metadata'); # set
29             $value = $f->field('WARC-Type'); # get
30              
31             $fields_text = $f->as_string; # get WARC header lines for display
32             $fields_block = $f->as_block; # format for WARC file
33              
34             tie @field_names, ref $f, $f; # bind ordered list of field names
35              
36             tie %fields, ref $f, $f; # bind hash of field names => values
37              
38             $entry = $f->[$num]; # tie an anonymous array and access it
39             $value = $f->{$name}; # likewise with an anonymous tied hash
40              
41             $name = "$entry"; # tied array returns objects
42             $value = $entry->value; # one specific value
43             $offset = $entry->offset; # N of M with same name
44              
45             foreach (keys %{$f}) { ... } # iterate over names, in order
46              
47             =cut
48              
49 26     26   2170 use overload '@{}' => \&_as_tied_array, '%{}' => \&_as_tied_hash;
  26         1756  
  26         235  
50 26     26   1906 use overload fallback => 1;
  26         51  
  26         79  
51              
52             # This implementation uses column-oriented storage, with an array as the
53             # underlying object and constants to select array offsets.
54             #
55             # The NAMES and VALUES columns are always valid, but the MVOFF and INDEX
56             # positions may be undefined and are lazily rebuilt when needed.
57              
58 26         2862 use constant { NAMES => 0, VALUES => 1, MVOFF => 2,
59 26     26   1540 INDEX => 3, IS_RO => 4, C_TA => 5, C_TH => 6 };
  26         51  
60 26     26   173 use constant OBJECT_INDEX => qw/NAMES VALUES MVOFF INDEX IS_RO/;
  26         48  
  26         1639  
61 26     26   151 use constant OBJECT_INIT => undef, undef, undef, undef, 0, undef, undef;
  26         48  
  26         77460  
62              
63 654     654   7551 sub DESTROY { my $ob = shift;
64 654 100       1713 untie @{$$ob->[C_TA]} if defined $$ob->[C_TA];
  3         23  
65 654 100       1237 untie %{$$ob->[C_TH]} if defined $$ob->[C_TH];
  61         179  
66 654         837 our $_total_destroyed; $_total_destroyed++ }
  654         6481  
67              
68             # NAMES: array of field names, exactly as written
69             # VALUES: array of field values
70             # MVOFF: array of offsets for multiple-valued fields
71             # INDEX: hash of case-folded field names to array of row numbers
72             # IS_RO: boolean: TRUE if this object is read-only
73              
74             # C_TA: cache: tied array for array dereference
75             # C_TH: cache: tide hash for hash dereference
76              
77             sub _rebuild_INDEX {
78 30     30   45 my $self = shift;
79 30         52 my %idx = ();
80              
81 30         61 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  113         262  
82 83         99 push @{$idx{lc $$self->[NAMES][$i]}}, $i;
  83         222  
83             }
84              
85 30         71 $$self->[INDEX] = \%idx;
86             }
87              
88             sub _update_INDEX {
89 72     72   104 my $self = shift; # INDEX slot must be valid
90 72         93 my $base = shift; # row number where an insertion or removal was made
91 72         94 my $count = shift; # how many rows were inserted (+) or removed (-)
92              
93 72         115 my %done = ();
94              
95 72 100       162 for (my $i = ($base < 0) ? 0 : $base; $i < @{$$self->[NAMES]}; $i++) {
  186         457  
96 114         191 my $key = lc $$self->[NAMES][$i];
97 114 100       226 next if $done{$key};
98 72         154 @{$$self->[INDEX]{$key}} =
99 72 100       91 map { $_ + ($_ > $base ? $count : 0) } @{$$self->[INDEX]{$key}};
  118         255  
  72         159  
100 72         167 $done{$key}++;
101             }
102             }
103              
104             sub _rebuild_MVOFF {
105 6     6   11 my $self = shift;
106              
107 6         14 my @mvoff = ();#(undef) x scalar @{$$self->[NAMES]};
108 6         13 my %cidx = (); # counted index; references to unique entries
109              
110 6         10 foreach my $name (@{$$self->[NAMES]}) {
  6         15  
111 10028         13164 my $key = lc $name;
112 10028 100       18062 if (not defined $cidx{$key}) {
    100          
113             # first time this key is seen
114 26         36 push @mvoff, undef;
115 26         61 $cidx{$key} = \$mvoff[$#mvoff];
116             } elsif (ref $cidx{$key} eq 'SCALAR') {
117             # second time this key is seen
118 4         10 push @mvoff, 1;
119 4         7 ${$cidx{$key}} = 0; # replace undefined value
  4         9  
120 4         10 $cidx{$key} = 2; # prepare counter
121             } else {
122             # third or later time this key is seen
123 9998         14386 push @mvoff, $cidx{$key}++;
124             }
125             }
126              
127 6         23 $$self->[MVOFF] = \@mvoff;
128             }
129              
130             sub _dbg_dump {
131 51     51   5421 my $self = shift;
132              
133 51         75 my @mvoff = (' ') x @{$$self->[NAMES]};
  51         2527  
134 20016 100       29332 @mvoff = map { defined $_ ? $_ : 'U' }
135 51 100       168 @{$$self->[MVOFF]}[0 .. $#{$$self->[NAMES]}]
  5         210  
  5         279  
136             if defined $$self->[MVOFF];
137 51         508 my @widths = map {length} qw/ROW MVO NAME/;
  153         264  
138              
139 51         88 foreach my $row (0 .. $#{$$self->[NAMES]}) {
  51         172  
140 20233 100       30996 $widths[0] = length $row if length $row > $widths[0];
141 20233 100       33252 $widths[1] = length $mvoff[$row] if length $mvoff[$row] > $widths[1];
142 20233 100       35139 $widths[2] = length $$self->[NAMES][$row]
143             if length $$self->[NAMES][$row] > $widths[2];
144             }
145              
146 51         263 my $out = sprintf ' %4$*1$s %5$*2$s %6$*3$s %7$s', @widths,
147             qw/ ROW MVO NAME VALUE /;
148 51         181 $out .= "\n".('=' x length $out)."\n";
149             $out .= join "\n", map
150 20233 100       79413 { sprintf ' %4$*1$d %5$*2$s %6$*3$s%7$1s %8$s', @widths,
    100          
151             ($_, $mvoff[$_], $$self->[NAMES][$_],
152             (defined $$self->[VALUES][$_] ? ':' : ' '),
153             (defined $$self->[VALUES][$_] ? $$self->[VALUES][$_] : '*deleted*')) }
154 51         100 0 .. $#{$$self->[NAMES]};
  51         353  
155              
156 51         2401 return $out;
157             }
158              
159             # From RFC2616:
160             # CTL =
161             # (octets 0 - 31) and DEL (127)>
162             # LWS = [CRLF] 1*( SP | HT )
163             # separators = "(" | ")" | "<" | ">" | "@"
164             # | "," | ";" | ":" | "\" | <">
165             # | "/" | "[" | "]" | "?" | "="
166             # | "{" | "}" | SP | HT
167             my $PARSE_RE__LWS = qr/(?:\015\012)?[ \t]+/;
168             my $PARSE_RE__separator = qr[[][)(><}{@,;:/"\\?=[:space:]]];
169             my $PARSE_RE__not_separator = qr[[^][)(><}{@,;:/"\\?=[:space:]]];
170              
171             # From WARC specification:
172             # field-name = token
173             # token = 1*
174             # except CTLs or separators>
175             my $PARSE_RE__token = qr/[!#$%'*+-.0-9A-Z^_`a-z|~]+/;
176              
177             =head1 DESCRIPTION
178              
179             The C class encapsulates information in the
180             "application/warc-fields" format used for WARC record headers. This is a
181             simple key-value format closely analogous to HTTP headers, however
182             differences are significant enough that the C class cannot
183             be reliably reused for WARC fields.
184              
185             Instances of this class are usually created as member variables of the
186             C class, but can also be returned as the content of WARC
187             records with Content-Type "application/warc-fields".
188              
189             Instances of C retrieved from WARC files are read-only and
190             will croak() if any attempt is made to change their contents.
191              
192             This class strives to faithfully represent the contents of a WARC file,
193             while providing a simple interface to answer simple questions.
194              
195             =head2 Multiple Values
196              
197             Most WARC headers may only appear once and with a single value in valid
198             WARC records, with the notable exception of the WARC-Concurrent-To header.
199             C neither attempts to enforce nor relies upon this
200             constraint. Headers that appear multiple times are considered to have
201             multiple values. When iterating a tied hash, all values of a recurring
202             header are collected and returned with the B occurrence of its key.
203              
204             Multiple values are returned from the C method and tied hash
205             interface as array references, and are set by passing in an array
206             reference. Existing rows are reused where possible when updating a field
207             with multiple values. If the new array reference contains fewer items
208             (including the special case of replacing multiple values with a single
209             value) excess rows are deleted. If the new array reference requires
210             additional rows to be inserted, they are inserted immediately after the
211             last existing row for a field, with the same name case as that row.
212              
213             Precise control of the layout is available using the tied array interface,
214             but the ordering of the header rows is not constrained in the WARC
215             specification.
216              
217             =head2 Field Name Mangling
218              
219             As with C, the '_' character is converted to '-' in field
220             names unless the first character of the name is ':', which cannot itself
221             appear in a field name. Unlike C, the leading ':' is
222             stripped off immediately and the name stored otherwise exactly as given.
223             The C method and tied hash interface allow this convenience feature.
224             The field names exposed via the tied array interface are reported
225             B as they appear in the WARC file.
226              
227             Strictly, "X-Crazy-Header" and "X_Crazy_Header" are two B
228             headers that the above convenience mechanism conflates. The solution is
229             simple: if (and only if) a header field B with the B
230             name given, it is used, otherwise C occurs and the name is
231             rechecked for another exact match. If no match is found, case is folded
232             and a third check performed. If a match is found, the existing header is
233             updated, otherwise a new header is created with character case as given.
234              
235             The WARC specification specifically states that field names are
236             case-insensitive, accordingly, "X-Crazy-Header" and "X-CRAZY-HeAdEr" are
237             considered the same header for the C method and tied hash interface.
238             They will appear exactly as given in the tied array interface, however.
239              
240             =cut
241              
242             # This function handles two different canonicalizations:
243             # (1) case folding as required by the WARC specification
244             # (2) convenience translation s/_/-/g,
245             # (2a) suppressed if m/^:/, which is removed
246             # (2b) overridden by an exact match
247             # To make this work:
248             # --- all keys in INDEX are case-folded
249             # --- all keys in NAMES preserve case
250             # --- existing keys are case-folded by this function
251             # --- new keys translate s/_/-/g but preserve case
252             sub _find_key {
253 7553     7553   9385 my $self = shift; # INDEX slot must be valid
254 7553         9625 my $k = shift;
255 7553         8485 my $key; ($key = $k) =~ s/^://;
  7553         11430  
256 7553         9183 my $pad = $key;
257 7553         10453 my $is_quoted = ($k =~ m/^:/);
258              
259             # exact case-folded match?
260 7553 100       21340 return lc $key if defined $$self->[INDEX]{lc $key};
261              
262             # case-folded match after s/_/-/g?
263 3698         7522 $pad =~ s/_/-/g;
264 3698 100 100     10002 return lc $pad if defined $$self->[INDEX]{lc $pad} && !$is_quoted;
265              
266             # not found ==> a new key will be made
267 2709 100       5593 return $is_quoted ? $key : $pad;
268             }
269              
270             # called only if there is no or one current value
271             sub _set_single_value {
272 12     12   18 my $self = shift; # INDEX slot must be valid
273 12         17 my $key = shift; # as returned from _find_key
274 12         20 my $value = shift;
275              
276 12 100       217 croak "attempt to modify read-only object" if $$self->[IS_RO];
277              
278 11 100       32 unless (defined $$self->[INDEX]{lc $key}) {
279             # insert new key
280 6         9 push @{$$self->[NAMES]}, $key; # preserve original key
  6         16  
281 6         14 $key = lc $key; # fold key case
282 6         7 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  6         18  
  6         11  
283             }
284              
285 11         37 $$self->[VALUES][$$self->[INDEX]{$key}[0]] = $value;
286             }
287              
288             sub _key_multiple_value_p {
289 3801     3801   4758 my $self = shift; # INDEX slot must be valid
290 3801         4882 my $key = shift; # as returned from _find_key
291              
292             # For this to be true, the key must already exist, which means that
293             # _find_key has case-folded it already.
294             return (defined $$self->[INDEX]{$key}
295 3801   100     7733 && 1 < scalar @{$$self->[INDEX]{$key}})
296             }
297              
298             # called in all cases where multiple values are involved
299             sub _set_multiple_value {
300 56     56   82 my $self = shift; # INDEX slot must be valid
301 56         81 my $key = shift; # as returned from _find_key
302 56         74 my $value_aref = shift;
303              
304 56 100       351 croak "attempt to modify read-only object" if $$self->[IS_RO];
305              
306             my $cur_count = (defined $$self->[INDEX]{$key}
307 54   100     146 && scalar @{$$self->[INDEX]{$key}});
308 54         108 my $new_count = scalar @$value_aref;
309              
310 54 100       105 unless ($cur_count) {
311             # insert new key
312 16         26 push @{$$self->[NAMES]}, $key; # preserve original key
  16         44  
313 16         31 push @{$$self->[VALUES]}, undef; # prepare slot
  16         34  
314 16         30 $key = lc $key; # fold key case
315 16         26 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  16         51  
  16         40  
316 16         29 $cur_count = 1; # account for the added slot
317             }
318             # $key is always case-folded at this point
319              
320             # adjust table to accommodate new number of values
321 54 100       144 if ($cur_count > $new_count) {
    100          
322             # remove extra rows
323 27         38 foreach my $extra_row (reverse sort
324 27         136 splice @{$$self->[INDEX]{$key}}, $new_count) {
325 40         65 splice @{$$self->[NAMES]}, $extra_row, 1;
  40         79  
326 40         58 splice @{$$self->[VALUES]}, $extra_row, 1;
  40         66  
327 40         89 _update_INDEX($self, $extra_row, -1);
328             }
329             # special case: removing a field entirely
330 27 100       89 if ($new_count == 0) {
331             # This is here to catch a hypothetical bug before data is corrupted.
332             die "stray INDEX entries left after removing field"
333             # uncoverable branch true
334 19 50       30 unless scalar @{$$self->[INDEX]{$key}} == 0;
  19         63  
335 19         49 delete $$self->[INDEX]{$key};
336             }
337             } elsif ($cur_count < $new_count) {
338             # add more rows
339 15         30 my $last_row = $$self->[INDEX]{$key}[-1];
340 15         26 my $new_rows = $new_count - $cur_count;
341 15         46 _update_INDEX($self, $last_row, $new_rows);
342 15         39 splice @{$$self->[NAMES]}, 1+$last_row, 0,
  15         2428  
343             (($$self->[NAMES][$last_row]) x $new_rows);
344 15         22 splice @{$$self->[VALUES]}, 1+$last_row, 0, ((undef) x $new_rows);
  15         571  
345 15         27 push @{$$self->[INDEX]{$key}}, 1+$last_row .. $last_row+$new_rows;
  15         1257  
346             } # otherwise, $cur_count == $new_count
347 54 100       307 $$self->[MVOFF] = undef unless $cur_count == $new_count;
348             # there are always $new_count rows with $key at this point
349              
350 54         136 for (my $i = 0; $i < $new_count; $i++)
351 9968         21741 { $$self->[VALUES][$$self->[INDEX]{$key}[$i]] = $value_aref->[$i] }
352             }
353              
354             =head2 Methods
355              
356             =over
357              
358             =item $f = WARC::Fields-Enew
359              
360             Construct a new C object. Initial contents can be passed as
361             key-value pairs to this constructor and will be added in the given order.
362              
363             Repeating a key or supplying an array reference as a value assigns multiple
364             values to a key. To reduce the risk of confusion, only quoting with a
365             leading ':' overrides the convenience feature of applying C when
366             constructing a C object. The exact match rules used when
367             setting values on an existing object do not apply here.
368              
369             Field names given when constructing a WARC::Fields object are otherwise
370             stored exactly as given, with case preserved, even when other names that
371             fold to the same string have been given earlier in the argument list.
372              
373             =cut
374              
375             sub new {
376 220     220 1 15014 my $class = shift;
377 220         468 my $ob = [OBJECT_INIT];
378 220         352 my $k; my $v;
379              
380             # explicitly initialize NAMES and VALUES to allow as_string and as_block
381             # methods to be called on empty objects
382 220         322 $ob->[NAMES] = [];
383 220         371 $ob->[VALUES] = [];
384              
385 220         624 while (($k, $v) = splice @_, 0, 2) {
386 792 100       1665 croak "key without value" unless defined $v;
387              
388 791 100       1414 if ($k =~ m/^:/) { $k =~ s/^:// } else { $k =~ s/_/-/g }
  7         18  
  784         1860  
389              
390 791 100       2149 croak "reference to field with no name" unless $k =~ m/./;
391 789 100       2210 croak "reference to invalid field name" if $k !~ m/^$PARSE_RE__token$/o;
392              
393 787 100       1369 if (ref $v eq 'ARRAY') {
394 3         8 foreach my $value (@$v) {
395 8         11 push @{$ob->[NAMES]}, $k;
  8         14  
396 8         11 push @{$ob->[VALUES]}, $value;
  8         12  
397 8         11 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  8         17  
  8         24  
398             }
399             } else {
400 784         957 push @{$ob->[NAMES]}, $k;
  784         1468  
401 784         1029 push @{$ob->[VALUES]}, $v;
  784         1262  
402 784         964 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  784         1673  
  784         2587  
403             }
404             }
405              
406 215         296 {our $_total_newly_constructed; $_total_newly_constructed++}
  215         248  
  215         265  
407 215         1007 bless \ $ob, $class;
408             }
409              
410             =item $f-Eclone
411              
412             Copy a C object. A copy of a read-only object is writable.
413              
414             =cut
415              
416             sub clone {
417 8     8 1 14 my $self = shift;
418 8         25 my $new = [OBJECT_INIT];
419              
420 8         15 $new->[NAMES] = [@{$$self->[NAMES]}];
  8         701  
421 8         16 $new->[VALUES] = [@{$$self->[VALUES]}];
  8         1440  
422 8 100       35 $new->[MVOFF] = [@{$$self->[MVOFF]}] if defined $$self->[MVOFF];
  1         301  
423 30         38 $new->[INDEX] = {map {$_ => [@{$$self->[INDEX]{$_}}]}
  30         94  
424 8 100       29 keys %{$$self->[INDEX]}} if defined $$self->[INDEX];
  7         30  
425              
426 8         18 {our $_total_newly_cloned; $_total_newly_cloned++}
  8         11  
  8         16  
427 8         31 bless \ $new, ref $self;
428             }
429              
430             =item $f-Efield( $name )
431              
432             =item $f-Efield( $name =E $value )
433              
434             =item $f-Efield( $n1 =E $v1, $n2 =E $v2, ... )
435              
436             Get or set the value of one or more fields. The field name is not case
437             sensitive, but C will preserve its case if a new entry is
438             created.
439              
440             Setting a field to C effectively deletes that field, although it
441             remains visible in the tied array interface and will retain its position if
442             a new value is assigned. Setting a field to an empty array reference
443             removes that field entirely.
444              
445             =cut
446              
447             sub field {
448 6519     6519 1 36562 my $self = shift;
449              
450 6519 100       13106 _rebuild_INDEX($self) unless defined $$self->[INDEX];
451              
452 6519         8474 my $k; my $v; my $have_value_arg = scalar @_ > 1;
  6519         9708  
453 6519         14832 while (($k, $v) = splice @_, 0, 2) {
454 6519         11055 my $key = $self->_find_key($k);
455              
456 6519 100       17274 croak "reference to field with no name" unless $key =~ m/./;
457 6517 100       15323 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
458              
459 6514 100       10679 if (not $have_value_arg) {
    100          
    100          
460             # get a value
461 6452 100       15886 return undef unless defined $$self->[INDEX]{$key};
462 3787 100       6040 return $$self->[VALUES][$$self->[INDEX]{$key}[0]]
463             unless $self->_key_multiple_value_p($key);
464 78         278 return [grep {defined $_}
465 25         55 map {$$self->[VALUES][$_]} @{$$self->[INDEX]{$key}}];
  78         153  
  25         74  
466             } # otherwise set a value
467             elsif (UNIVERSAL::isa($v, 'ARRAY'))
468 48         104 { $self->_set_multiple_value($key, $v) }
469             elsif ($self->_key_multiple_value_p($key))
470             # has multiple values, but now only setting a single value
471 2         7 { $self->_set_multiple_value($key, [$v]) }
472             else
473 12         27 { $self->_set_single_value($key, $v) }
474 59         211 $have_value_arg = scalar @_ > 1;
475             }
476 59         155 return (); # return nothing
477             # Note that setting one or more fields and then getting a field is
478             # possible as a side-effect of this organization, but is explicitly NOT
479             # supported. That trick is NOT part of the stable API.
480             }
481              
482             =item $f = WARC::Fields-Eparse( $text )
483              
484             =item $f = WARC::Fields-Eparse( from =E $fh )
485              
486             =item $f = parse WARC::Fields from =E $fh
487              
488             Construct a new C object, reading initial contents from the
489             provided text string or filehandle.
490              
491             The C method throws an exception if it encounters input that it does
492             not understand.
493              
494             If the C method encounters a field name with a leading ':', which
495             implies an empty name and is not allowed, the leading ':' is silently
496             dropped from the line and parsing retried. If the line is not valid after
497             this change, the C method throws an exception. This feature is in
498             keeping with the general principle of "be liberal in what you accept" and
499             is a preemptive workaround for a predicted bug in other implementations.
500              
501             =cut
502              
503             sub parse {
504 556     556 1 1964 my $class = shift;
505 556         748 my $text = shift;
506 556         694 my $rd;
507              
508 556 100       1002 if ($text eq 'from') {
509 551         724 $rd = shift;
510             } else {
511             # This fails iff perl was built without PerlIO, which is non-default.
512             # uncoverable branch true
513 5 50       64 open $rd, '<', \$text or die "failure opening stream on variable: $!";
514             }
515              
516 556         941 my @names = ();
517 556         721 my @values = ();
518 556         776 my %idx = ();
519 556         671 my $at_end = 0;
520              
521 556         1236 local *_;
522 556         1629 while (<$rd>) {
523 4043         41979 s/[\015\012]+$//;
524 4043 100       13373 if (m/^:?($PARSE_RE__token):\s+(.*)$/o) # $1 -- name $2 -- value
    100          
    100          
525 3487         7675 { push @names, $1; push @values, $2; push @{$idx{lc $1}}, $#names }
  3487         5791  
  3487         4011  
  3487         14329  
526             elsif (m/^\s+(\S.*)$/) # $1 -- continued value
527 1         10 { $values[$#values] .= ' '.$1 }
528 553         831 elsif (m/^$/) { $at_end = 1; last }
  553         888  
529 2         362 else { croak "unrecognized input: $_" }
530             }
531              
532 554 100       1198 carp "end-of-input before end marker" unless $at_end;
533              
534 554         970 @values = map {Encode::decode_utf8($_)} @values;
  3487         44876  
535              
536 554         8820 my $ob = [OBJECT_INIT];
537 554         963 $ob->[NAMES] = \@names;
538 554         807 $ob->[VALUES] = \@values;
539 554         805 $ob->[INDEX] = \%idx;
540              
541 554         722 {our $_total_newly_parsed; $_total_newly_parsed++}
  554         698  
  554         743  
542 554         2127 bless \ $ob, $class;
543             }
544              
545             =item $f-Eas_block
546              
547             =item $f-Eas_string
548              
549             Return the contents as a formatted WARC header or application/warc-fields
550             block. The C method uses network line endings and UTF-8 as
551             specified for the WARC format, while the C method uses the local
552             line endings and does not perform encoding.
553              
554             =cut
555              
556             sub _as_text {
557 48     48   74 my $self = shift;
558 48         68 my $newline = shift;
559 48         63 my $out = '';
560              
561 48         81 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  258         630  
562 210 100       403 next unless defined $$self->[VALUES][$i];
563 203         451 $out .= $$self->[NAMES][$i] . ': ' . $$self->[VALUES][$i] . $newline;
564             }
565              
566 48         227 return $out;
567             }
568              
569 1     1 1 4 sub as_block { Encode::encode('UTF-8', _as_text(shift, WARC::CRLF)) }
570 47     47 1 4701 sub as_string { _as_text(shift, "\n") }
571              
572             =item $f-Eset_readonly
573              
574             Mark a C object read-only. All methods that modify the
575             object will croak() if called on a read-only object.
576              
577             =cut
578              
579             sub set_readonly {
580 680     680 1 1181 my $self = shift;
581              
582 680         1654 $$self->[IS_RO] = 1;
583             }
584              
585             =back
586              
587             =head2 Tied Array Access
588              
589             The order of fields can be fully controlled by tying an array to a
590             C object and manipulating the array using ordinary Perl
591             operations. The C and C functions are likely to be useful
592             for reordering array elements if desired.
593              
594             C will croak() if an attempt is made to set a field name with
595             a leading ':' using the tied array interface.
596              
597             =cut
598              
599             sub TIEARRAY {
600 4     4   15 my $class = shift;
601 4         13 my $ob = shift;
602              
603             # This method must ignore the given class to allow the "empty subclass"
604             # test to pass. If a subclass really wants, an override for TIEARRAY
605             # itself can call SUPER::TIEARRAY and re-bless the returned reference
606             # into the desired class.
607 4         7 $WARC::Fields::TiedArray::_total_tied++;
608 4         24 bless \ $ob, 'WARC::Fields::TiedArray';
609             }
610              
611             {
612             package WARC::Fields::TiedArray::Entry;
613              
614 26     26   240 use Carp;
  26         64  
  26         2251  
615              
616             BEGIN { $WARC::Fields::TiedArray::Entry::{$_} = $WARC::Fields::{$_}
617 26     26   1118 for WARC::Fields::OBJECT_INDEX; }
618              
619 26     26   198 use constant { NAME => 0, VALUE => 1, TABLE => 2, ROW => 3 };
  26         51  
  26         2565  
620              
621 26     26   174 use overload '""' => 'name', fallback => 1;
  26         49  
  26         186  
622              
623             =pod
624              
625             The tied array interface accepts simple string values but returns objects
626             with additional information. The returned object has an overloaded string
627             conversion that yields the name for that entry but additionally has
628             C and C methods.
629              
630             An entry object is bound to a slot in its parent C object,
631             but will be copied if it is assigned to another slot in the same or another
632             C object.
633              
634             Due to complex aliasing rules necessary for array slice assignment to work
635             for permuting rows in the table, entry objects must be short-lived.
636             Storing the object read from a tied array and attempting to use it after
637             modifying its parent C object produces unspecified results.
638              
639             =over
640              
641             =item $entry = $array[$n]
642              
643             =item $entry = $f-E[$n]
644              
645             The tied array C method returns a "entry object" instead of the name
646             itself.
647              
648             =cut
649              
650             sub _new {
651 223     223   311 my $class = shift;
652 223         292 my $table = shift;
653 223         260 my $row = shift;
654              
655 223         1018 bless [$$table->[NAMES][$row], $$table->[VALUES][$row],
656             $table, $row], $class;
657             }
658              
659             =item $name = "$entry"
660              
661             =item $name = $entry-Ename
662              
663             =item $name = "$f-E[$n]"
664              
665             =item $name = $f-E[$n]-Ename
666              
667             The C method on a entry object returns the field name.
668             String conversion is overloaded to call this method.
669              
670             =cut
671              
672 62     62   9094 sub name { (shift)->[NAME] }
673              
674             =item $value = $entry-Evalue
675              
676             =item $value = $array[$n]-Evalue
677              
678             =item $value = $f-E[$n]-Evalue
679              
680             =item $entry-Evalue( $new_value )
681              
682             =item $array[$n]-Evalue( $new_value )
683              
684             =item $f-E[$n]-Evalue( $new_value )
685              
686             The C method on a entry object returns the field value for this
687             particular entry. Only a single scalar is returned, even if multiple
688             entries share the same name.
689              
690             If given an argument, the C method replaces the value for this
691             particular entry. The argument will be coerced to a string.
692              
693             =cut
694              
695             sub value {
696 154     154   220 my $self = shift;
697              
698 154 100       272 if (scalar @_ == 0) { # get this value
699 34         160 return $self->[VALUE];
700             } else { # update this value
701 120 100       144 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  120         382  
702 119         174 my $newval = shift;
703 119         191 ${$self->[TABLE]}->[VALUES]->[$self->[ROW]] = $self->[VALUE] = "$newval";
  119         218  
704 119         290 return (); # and return nothing
705             }
706             }
707              
708             =item $offset = $entry-Eoffset
709              
710             =item $offset = $array[$n]-Eoffset
711              
712             =item $offset = $f-E[$n]-Eoffset
713              
714             The C method on a entry object returns the position of this entry
715             amongst multiple entries with the same field name. These positions are
716             numbered from zero and are identical to the positions in the array
717             reference returned for this entry's field name from the C method or
718             the tied hash interface.
719              
720             =cut
721              
722             sub offset {
723 21     21   36 my $self = shift;
724 21 100       27 $self->[TABLE]->_rebuild_MVOFF unless defined ${$self->[TABLE]}->[MVOFF];
  21         76  
725 21         39 return ${$self->[TABLE]}->[MVOFF]->[$self->[ROW]];
  21         115  
726             }
727              
728             =back
729              
730             =cut
731              
732             }
733              
734             {
735             package WARC::Fields::TiedArray::LooseEntry;
736              
737 26     26   8967 use Carp;
  26         56  
  26         2164  
738              
739             BEGIN { $WARC::Fields::TiedArray::LooseEntry::{$_} = $WARC::Fields::{$_}
740 26     26   927 for WARC::Fields::OBJECT_INDEX; }
741              
742 26     26   179 use constant { NAME => 0, VALUE => 1 };
  26         58  
  26         1985  
743              
744 26     26   4615 BEGIN { our @ISA = qw(WARC::Fields::TiedArray::Entry) }
745              
746             # This is a special type of "entry object" that is not associated with a
747             # table row, returned from POP, SHIFT, and SPLICE when needed.
748              
749             sub _new {
750 24     24   397 my $class = shift;
751 24         34 my $name = shift;
752 24         29 my $value = shift;
753              
754 24         74 bless [$name, $value], $class;
755             }
756              
757 18     18   892 sub name { return (shift)->[NAME] }
758              
759             sub value {
760 17     17   25 my $self = shift;
761              
762 17 100       36 if (scalar @_ == 0) # get
763 16         39 { return $self->[VALUE] }
764             else # set
765 1         157 { croak "Loose array entries are read-only." }
766             }
767              
768 1     1   7 sub offset { return undef }
769             }
770              
771             {
772             package WARC::Fields::TiedArray;
773              
774 26     26   196 use Carp;
  26         66  
  26         1975  
775              
776             BEGIN { $WARC::Fields::TiedArray::{$_} = $WARC::Fields::{$_}
777 26     26   40949 for WARC::Fields::OBJECT_INDEX; }
778              
779             # The underlying object is a reference to a WARC::Fields object.
780              
781             sub FETCH {
782 223     223   8869 my $self = shift;
783 223         289 my $row = shift;
784 223         642 return (ref($self).'::Entry')->_new($$self, $row);
785             }
786              
787             sub STORE {
788 13     13   464 my $self = shift;
789 13         17 my $row = shift;
790 13         18 my $name = shift;
791              
792 13 100       156 croak "attempt to modify read-only object" if $$$self->[IS_RO];
793              
794 12 100       19 $self->STORESIZE($row + 1) if $#{$$$self->[NAMES]} < $row;
  12         32  
795              
796 12 100       59 if (UNIVERSAL::isa($name, ref($self).'::Entry')) {
797             # copy entry
798 8 100       17 croak "attempt to set invalid name"
799             if $name->name !~ m/^$PARSE_RE__token$/o;
800 7         36 $$$self->[NAMES]->[$row] = $name->name;
801 7         14 $$$self->[VALUES]->[$row] = $name->value;
802             } else {
803             # set name
804 4 100       342 croak "attempt to set invalid name"
805             if "$name" !~ m/^$PARSE_RE__token$/o;
806 2         8 $$$self->[NAMES]->[$row] = "$name";
807             }
808 9         48 $$$self->[MVOFF] = undef;
809 9         35 $$$self->[INDEX] = undef;
810             }
811              
812             sub FETCHSIZE {
813 38     38   5182 my $self = shift;
814 38         51 return scalar @{$$$self->[NAMES]};
  38         295  
815             }
816              
817             sub STORESIZE {
818 5     5   12 my $self = shift;
819 5         7 my $count = shift;
820              
821 5 100 100     25 croak "attempt to modify read-only object"
822             if $$$self->[IS_RO] && $count != $self->FETCHSIZE();
823              
824 4 100       12 if ($count > $self->FETCHSIZE()) {
    100          
825 2         5 my $needed = $count - $self->FETCHSIZE();
826 2         4 push @{$$$self->[NAMES]}, ('X-Undefined-Field-Name') x $needed;
  2         7  
827 2         3 push @{$$$self->[VALUES]}, (undef) x $needed;
  2         9  
828             } elsif ($count < $self->FETCHSIZE()) {
829 1         2 splice @{$$$self->[NAMES]}, $count;
  1         4  
830 1         2 splice @{$$$self->[VALUES]}, $count;
  1         4  
831 1         5 $$$self->[INDEX] = undef;
832 1         5 } else { return } # no actual change
833             }
834              
835       0     sub EXTEND {
836             # do nothing
837             }
838              
839             sub EXISTS {
840 14     14   2053 my $self = shift;
841 14         22 my $row = shift;
842 14         82 return defined $$$self->[VALUES]->[$row];
843             }
844              
845             sub DELETE {
846 2     2   5 my $self = shift;
847 2         4 my $row = shift;
848              
849 2 100       115 croak "attempt to modify read-only object" if $$$self->[IS_RO];
850              
851 1         4 my $old_value = $$$self->[VALUES]->[$row];
852 1         3 $$$self->[VALUES]->[$row] = undef;
853 1         2 $$$self->[MVOFF] = undef;
854 1         5 $$$self->[INDEX] = undef;
855 1         4 return $old_value;
856             }
857              
858             sub CLEAR {
859 3     3   6 my $self = shift;
860              
861 3 100       118 croak "attempt to modify read-only object" if $$$self->[IS_RO];
862              
863 2         9 $$$self->[NAMES] = [];
864 2         5 $$$self->[VALUES] = [];
865 2         6 $$$self->[MVOFF] = undef;
866 2         6 $$$self->[INDEX] = undef;
867 2         6 return undef;
868             }
869              
870             sub PUSH {
871 7     7   769 my $self = shift;
872              
873 7 100 100     137 croak "attempt to modify read-only object"
874             if $$$self->[IS_RO] && scalar @_;
875              
876 6         16 foreach my $item (@_) {
877 103         141 my $name; my $value;
878 103 100       332 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
879 2         6 { $name = $item->name; $value = $item->value }
  2         7  
880             else
881 101         149 { $name = "$item"; $value = undef }
  101         125  
882 103 100       553 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
883 101         143 push @{$$$self->[NAMES]}, $name;
  101         550  
884 101         150 push @{$$$self->[VALUES]}, $value;
  101         332  
885             }
886 4         10 $$$self->[MVOFF] = undef;
887 4         119 $$$self->[INDEX] = undef;
888 4         8 return scalar @{$$$self->[NAMES]};
  4         16  
889             }
890              
891             sub POP {
892 5     5   10 my $self = shift;
893              
894 5 100       128 croak "attempt to modify read-only object" if $$$self->[IS_RO];
895              
896             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
897 4         8 (pop @{$$$self->[NAMES]}, pop @{$$$self->[VALUES]});
  4         9  
  4         15  
898 4 100       14 pop @{$$$self->[MVOFF]} if defined $$$self->[MVOFF];
  2         18  
899 4         10 $$$self->[INDEX] = undef;
900              
901 4         14 return $ret;
902             }
903              
904             sub SHIFT {
905 2     2   5 my $self = shift;
906              
907 2 100       118 croak "attempt to modify read-only object" if $$$self->[IS_RO];
908              
909             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
910 1         2 (shift @{$$$self->[NAMES]}, shift @{$$$self->[VALUES]});
  1         4  
  1         5  
911 1         4 $$$self->[MVOFF] = undef;
912 1         4 $$$self->[INDEX] = undef;
913              
914 1         4 return $ret;
915             }
916              
917             sub UNSHIFT {
918 6     6   730 my $self = shift;
919              
920 6 100 100     170 croak "attempt to modify read-only object"
921             if $$$self->[IS_RO] && scalar @_;
922              
923 5         12 foreach my $item (@_) {
924 4         7 my $name; my $value;
925 4 100       23 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
926 2         7 { $name = $item->name; $value = $item->value }
  2         5  
927             else
928 2         4 { $name = "$item"; $value = undef }
  2         4  
929 4 100       286 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
930 2         3 unshift @{$$$self->[NAMES]}, $name;
  2         8  
931 2         5 unshift @{$$$self->[VALUES]}, $value;
  2         7  
932             }
933 3         7 $$$self->[MVOFF] = undef;
934 3         6 $$$self->[INDEX] = undef;
935 3         8 return scalar @{$$$self->[NAMES]};
  3         12  
936             }
937              
938             sub SPLICE {
939 10     10   777 my $self = shift;
940 10         15 my $offset = shift;
941 10         17 my $length = shift;
942              
943 10 100       29 $offset = 0 unless defined $offset;
944 10 100       24 $length = $self->FETCHSIZE() - $offset unless defined $length;
945              
946 10 100 100     42 return () unless ($length != 0 || scalar @_);
947              
948 9 100       234 croak "attempt to modify read-only object" if $$$self->[IS_RO];
949              
950 7         13 my @new_names = (); my @new_values = (); my @old_names; my @old_values;
  7         12  
  7         11  
951              
952 7         17 foreach my $item (@_) {
953 17 100       62 if (UNIVERSAL::isa($item, ref($self).'::Entry')) {
954 11         22 push @new_names, $item->name;
955 11         19 push @new_values, $item->value;
956             } else {
957 6         15 push @new_names, "$item";
958 6         11 push @new_values, undef;
959             }
960             }
961              
962             croak "attempt to set invalid name"
963 7 100       14 if grep { $_ !~ m/^$PARSE_RE__token$/o } @new_names;
  17         322  
964              
965 5         10 @old_names = splice @{$$$self->[NAMES]}, $offset, $length, @new_names;
  5         20  
966 5         12 @old_values = splice @{$$$self->[VALUES]}, $offset, $length, @new_values;
  5         15  
967              
968 5         9 my @ret = ();
969              
970 5         15 for (my $i = 0; $i < scalar @old_names; $i++)
971 18         40 { push @ret, WARC::Fields::TiedArray::LooseEntry->_new
972             ($old_names[$i], $old_values[$i]) }
973              
974 5         12 $$$self->[MVOFF] = undef;
975 5         10 $$$self->[INDEX] = undef;
976 5         28 return @ret;
977             }
978              
979 4     4   9 sub UNTIE { our $_total_untied; $_total_untied++ }
  4         17  
980              
981 4     4   10 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  4         14  
982             }
983              
984             =head2 Tied Hash Access
985              
986             The contents of a C object can be easily examined by tying a
987             hash to the object. Reading or setting a hash key is equivalent to the
988             C method, but the tied hash will iterate keys and values in the
989             order in which each key B appears in the internal table.
990              
991             Like the tied array interface, the tied hash interface returns magical
992             objects that internally refer back to the parent C object.
993             These objects remain valid if the underlying C object is
994             changed, but further use may produce surprising and unspecified results.
995              
996             The use of magical objects enables the values in a tied hash to B
997             be arrays, even for keys that do not exist (the array will have zero
998             elements) or that have only one value (the array will have a string
999             conversion that produces that one value). This allows a tied hash to
1000             support autovivification of an array value just as Perl's own hashes do.
1001              
1002             =cut
1003              
1004             sub TIEHASH {
1005 68     68   117 my $class = shift;
1006 68         98 my $ob = shift;
1007              
1008             # This method must ignore the given class to allow the "empty subclass"
1009             # test to pass. If a subclass really wants, an override for TIEHASH
1010             # itself can call SUPER::TIEHASH and re-bless the returned reference
1011             # into the desired class.
1012 68         87 $WARC::Fields::TiedHash::_total_tied++;
1013 68         401 bless \ $ob, 'WARC::Fields::TiedHash';
1014             }
1015              
1016             {
1017             package WARC::Fields::TiedHash::ValueArray;
1018              
1019 26     26   216 use Carp;
  26         58  
  26         2146  
1020              
1021             BEGIN { $WARC::Fields::TiedHash::ValueArray::{$_} = $WARC::Fields::{$_}
1022 26     26   973 for WARC::Fields::OBJECT_INDEX; }
1023              
1024 26     26   186 use constant { TABLE => 0, KEY => 1, KEYc => 2 }; # KEYc -- canonical KEY
  26         114  
  26         51820  
1025              
1026             sub TIEARRAY {
1027 458     458   669 my $class = shift;
1028 458         672 my $table = shift;
1029 458 100       1021 $table->_rebuild_INDEX unless defined $$table->[INDEX];
1030 458         855 my $key = $table->_find_key(shift); # needs INDEX
1031              
1032 458         737 { our $_total_tied; $_total_tied++ }
  458         556  
  458         588  
1033 458         1571 bless [$table, $key, lc $key], $class;
1034             }
1035              
1036             sub FETCH {
1037 764     764   1630 my $self = shift;
1038 764         959 my $offset = shift;
1039              
1040 764 100       907 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  764         1613  
1041              
1042 764         974 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  764         1387  
1043 764 100       1243 return defined $row ? ${$self->[TABLE]}->[VALUES][$row] : undef;
  748         3277  
1044             }
1045              
1046             sub STORE {
1047 10     10   544 my $self = shift;
1048 10         13 my $offset = shift;
1049 10         16 my $value = shift;
1050              
1051 10         14 my $T = $self->[TABLE];
1052              
1053 10 100       142 croak "attempt to modify read-only object" if $$T->[IS_RO];
1054              
1055 9 100       21 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1056              
1057             $self->STORESIZE($offset + 1)
1058             if not defined $$T->[INDEX]{$self->[KEYc]}
1059 9 100 100     31 or $#{$$T->[INDEX]{$self->[KEYc]}} < $offset;
  8         31  
1060              
1061 9         50 $$T->[VALUES][$$T->[INDEX]{$self->[KEYc]}[$offset]] = "$value";
1062             }
1063              
1064             sub FETCHSIZE {
1065 788     788   1659 my $self = shift;
1066              
1067 788 100       1004 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  788         1801  
1068              
1069 778         924 return scalar @{${$self->[TABLE]}->[INDEX]{$self->[KEYc]}}
  778         2844  
1070 788 100       1015 if defined ${$self->[TABLE]}->[INDEX]{$self->[KEYc]};
  788         1826  
1071 10         82 return 0; # otherwise: key does not exist
1072             }
1073              
1074             sub STORESIZE {
1075 8     8   16 my $self = shift;
1076 8         10 my $count = shift;
1077              
1078 8         13 my $T = $self->[TABLE];
1079              
1080 8 100 100     28 croak "attempt to modify read-only object"
1081             if $$T->[IS_RO] && $count != $self->FETCHSIZE();
1082              
1083 7 100       21 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1084              
1085 7         14 my @new = ();
1086 6         19 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  6         14  
1087 7 100       25 if defined $$T->[INDEX]{$self->[KEYc]};
1088 7 100       16 if ($count > $self->FETCHSIZE())
    100          
1089 4         10 { push @new, (undef) x ($count - $self->FETCHSIZE()) }
1090             elsif ($count < $self->FETCHSIZE())
1091 2         9 { @new = @new[0..($count-1)] }
1092 1         4 else { return } # no actual change
1093 6         16 $T->field($self->[KEY] => \@new);
1094             }
1095              
1096       0     sub EXTEND {
1097             # do nothing
1098             }
1099              
1100             sub EXISTS {
1101 5     5   553 my $self = shift;
1102 5         8 my $offset = shift;
1103              
1104 5 100       8 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  5         21  
1105              
1106 5         11 return exists ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  5         34  
1107             }
1108              
1109             sub DELETE {
1110 4     4   539 my $self = shift;
1111 4         8 my $offset = shift;
1112              
1113 4 100       5 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  4         124  
1114              
1115 3 100       7 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  3         14  
1116              
1117 3         6 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  3         7  
1118 3         6 my $old_value = ${$self->[TABLE]}->[VALUES][$row];
  3         7  
1119 3         5 ${$self->[TABLE]}->[VALUES][$row] = undef;
  3         6  
1120 3         12 return $old_value;
1121             }
1122              
1123             sub CLEAR {
1124 2     2   7 my $self = shift;
1125              
1126 2 100       3 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  2         115  
1127              
1128 1         5 $self->[TABLE]->field($self->[KEY] => []);
1129             }
1130              
1131             sub PUSH {
1132 11     11   21 my $self = shift;
1133              
1134 11         22 my $T = $self->[TABLE];
1135              
1136 11 100 100     146 croak "attempt to modify read-only object"
1137             if $$T->[IS_RO] && scalar @_;
1138              
1139 10 100       25 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1140              
1141 10 100       39 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1142             # key exists ==> extend table efficiently
1143 5         13 my $last_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1144 5         10 my $new_rows = scalar @_;
1145 5         8 splice @{$$T->[NAMES]}, 1+$last_row, 0,
  5         20  
1146             (($$T->[NAMES][$last_row]) x $new_rows);
1147 5         9 splice @{$$T->[VALUES]}, 1+$last_row, 0, map {"$_"} @_;
  5         15  
  4         15  
1148 5         17 $T->_update_INDEX($last_row, $new_rows);
1149 5         9 push @{$$T->[INDEX]{$self->[KEYc]}}, 1+$last_row .. $last_row+$new_rows;
  5         15  
1150             } else {
1151             # make key ==> use existing setter
1152 5         33 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  9909         19171  
1153             }
1154 10         1349 $$T->[MVOFF] = undef;
1155             }
1156              
1157             sub POP {
1158 7     7   547 my $self = shift;
1159              
1160 7         12 my $T = $self->[TABLE];
1161              
1162 7 100       128 croak "attempt to modify read-only object" if $$T->[IS_RO];
1163              
1164 6 100       18 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1165              
1166 6 100       19 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1167              
1168 5         11 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1169 5         10 my $value = $$T->[VALUES][$rem_row];
1170              
1171 5         6 splice @{$$T->[NAMES]}, $rem_row, 1;
  5         12  
1172 5         9 splice @{$$T->[VALUES]}, $rem_row, 1;
  5         12  
1173 5 100       15 splice @{$$T->[MVOFF]}, $rem_row, 1 if defined $$T->[MVOFF];
  1         4  
1174 5         14 $T->_update_INDEX($rem_row, -1);
1175 5         10 pop @{$$T->[INDEX]{$self->[KEYc]}};
  5         12  
1176             # special case: popped last value
1177             delete $$T->[INDEX]{$self->[KEYc]}
1178 5 100       8 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  5         17  
1179              
1180 5         27 return $value;
1181             }
1182              
1183             sub SHIFT {
1184 6     6   535 my $self = shift;
1185              
1186 6         12 my $T = $self->[TABLE];
1187              
1188 6 100       125 croak "attempt to modify read-only object" if $$T->[IS_RO];
1189              
1190 5 100       15 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1191              
1192 5 100       21 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1193              
1194 4         10 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[0];
1195 4         9 my $value = $$T->[VALUES][$rem_row];
1196              
1197 4         7 splice @{$$T->[NAMES]}, $rem_row, 1;
  4         10  
1198 4         8 splice @{$$T->[VALUES]}, $rem_row, 1;
  4         7  
1199 4         10 $$T->[MVOFF] = undef;
1200 4         11 $T->_update_INDEX($rem_row, -1);
1201 4         6 shift @{$$T->[INDEX]{$self->[KEYc]}};
  4         10  
1202             # special case: shifted last value
1203             delete $$T->[INDEX]{$self->[KEYc]}
1204 4 100       7 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  4         19  
1205              
1206 4         28 return $value;
1207             }
1208              
1209             sub UNSHIFT {
1210 5     5   10 my $self = shift;
1211              
1212 5         11 my $T = $self->[TABLE];
1213              
1214 5 100 100     127 croak "attempt to modify read-only object"
1215             if $$T->[IS_RO] && scalar @_;
1216              
1217 4 100       15 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1218              
1219 4 100       18 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1220             # key exists ==> extend table efficiently
1221 3         11 my $first_row = $$T->[INDEX]{$self->[KEYc]}[0];
1222 3         4 my $new_rows = scalar @_;
1223 3         6 splice @{$$T->[NAMES]}, $first_row, 0,
  3         10  
1224             (($$T->[NAMES][$first_row]) x $new_rows);
1225 3         7 splice @{$$T->[VALUES]}, $first_row, 0, map {"$_"} @_;
  3         10  
  2         9  
1226 3         11 $T->_update_INDEX($first_row - 1, $new_rows);
1227 3         4 unshift @{$$T->[INDEX]{$self->[KEYc]}},
  3         12  
1228             $first_row .. $first_row-1+$new_rows;
1229             } else {
1230             # make key ==> use existing setter
1231 1         4 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  4         10  
1232             }
1233 4         17 $$T->[MVOFF] = undef;
1234             }
1235              
1236             sub SPLICE {
1237 9     9   15 my $self = shift;
1238 9         12 my $offset = shift;
1239 9         15 my $length = shift;
1240              
1241 9 100       23 $offset = 0 unless defined $offset;
1242 9 100       20 $length = $self->FETCHSIZE() - $offset unless defined $length;
1243              
1244 9 100 100     32 return () unless ($length != 0 || scalar @_);
1245              
1246 8         15 my $T = $self->[TABLE];
1247              
1248 8 100       233 croak "attempt to modify read-only object" if $$T->[IS_RO];
1249              
1250 6 100       17 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1251              
1252 6         11 my @new = ();
1253 5         17 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  5         12  
1254 6 100       16 if defined $$T->[INDEX]{$self->[KEYc]};
1255 6         17 my @old = splice @new, $offset, $length, map {"$_"} @_;
  3         11  
1256 6         22 $self->[TABLE]->field($self->[KEY] => \@new);
1257              
1258 6         42 return @old;
1259             }
1260              
1261 458     458   587 sub UNTIE { our $_total_untied; $_total_untied++ }
  458         811  
1262              
1263 458     458   533 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  458         1491  
1264             }
1265              
1266             {
1267             package WARC::Fields::TiedHash::Value;
1268              
1269             # This package is a magical array that appears to be a string if it has
1270             # only one value, otherwise string conversion gives the usual
1271             # nearly-useless debugging value.
1272              
1273             # The actual underlying array is a tied array that forwards mutating
1274             # operations to the original WARC::Fields object.
1275              
1276 26     26   209 use overload '""' => '_as_string', fallback => 1;
  26         51  
  26         125  
1277              
1278 26     26   2242 use Scalar::Util qw/refaddr reftype/;
  26         144  
  26         4988  
1279              
1280             sub _new {
1281 458     458   670 my $class = shift;
1282 458         578 my $parent = shift;
1283 458         540 my $key = shift;
1284              
1285 458         603 my @values;
1286 458         1441 tie @values, (ref($parent).'::ValueArray'), $$parent, $key;
1287              
1288 458         2497 bless \@values, $class;
1289             }
1290              
1291             sub _as_string {
1292 676     676   11590 my $self = shift;
1293              
1294 676 100       1968 return scalar @$self == 1
1295             ? $self->[0] : sprintf ('%s(0x%x)', reftype $self, refaddr $self);
1296             }
1297              
1298 458     458   2921 sub DESTROY { untie @{(shift)} }
  458         973  
1299             }
1300              
1301             {
1302             package WARC::Fields::TiedHash;
1303              
1304 26     26   215 use Carp;
  26         160  
  26         2149  
1305              
1306             BEGIN { $WARC::Fields::TiedHash::{$_} = $WARC::Fields::{$_}
1307 26     26   20806 for WARC::Fields::OBJECT_INDEX; }
1308              
1309             # The underlying object is a reference to a WARC::Fields object.
1310              
1311             sub FETCH {
1312 461     461   1944 my $self = shift;
1313 461         614 my $key = shift;
1314 461 100       2289 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
1315 458         1467 return (ref($self).'::Value')->_new($self, $key);
1316             }
1317              
1318             sub STORE {
1319 11     11   402 my $self = shift;
1320 11         20 my $key = shift;
1321 11         18 my $value = shift;
1322              
1323 11 100       246 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1324              
1325 9         25 $$self->field($key => $value);
1326             }
1327              
1328             sub DELETE {
1329 3     3   11 my $self = shift;
1330 3         6 my $key = shift;
1331              
1332 3 100       124 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1333              
1334 2         9 $$self->field($key => []);
1335             }
1336              
1337             sub CLEAR {
1338 2     2   9 my $self = shift;
1339              
1340 2 100       120 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1341              
1342 1         4 $$$self->[NAMES] = [];
1343 1         3 $$$self->[VALUES] = [];
1344 1         3 $$$self->[MVOFF] = undef;
1345 1         4 $$$self->[INDEX] = undef;
1346 1         3 return undef;
1347             }
1348              
1349             sub EXISTS {
1350 15     15   38 my $self = shift;
1351 15         23 my $key = shift;
1352              
1353 15 100       55 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1354 15         39 return exists $$$self->[INDEX]->{$$self->_find_key($key)};
1355             }
1356              
1357             sub FIRSTKEY {
1358 61     61   101 my $self = shift;
1359              
1360 61         219 return $$$self->[NAMES][0];
1361             }
1362              
1363             sub NEXTKEY {
1364 307     307   462 my $self = shift;
1365 307         383 my $from_key = shift;
1366              
1367 307 100       595 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1368              
1369 307         369 my $i;
1370 307   100     541 for ($i = $$$self->[INDEX]{$$self->_find_key($from_key)}[0] + 1;
1371             defined $$$self->[NAMES][$i] and
1372             $i != $$$self->[INDEX]{$$self->_find_key($$$self->[NAMES][$i])}[0];
1373             $i++) {}
1374 307         1096 return $$$self->[NAMES][$i];
1375             }
1376              
1377             sub SCALAR {
1378 3     3   11 my $self = shift;
1379 3         5 return scalar @{$$$self->[NAMES]};
  3         18  
1380             }
1381              
1382 68     68   90 sub UNTIE { our $_total_untied; $_total_untied++ }
  68         170  
1383              
1384 68     68   82 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  68         128  
1385             }
1386              
1387             =head2 Overloaded Dereference Operators
1388              
1389             The C class provides overloaded dereference operators for
1390             array and hash dereferencing. The overloaded operators provide an
1391             anonymous tied array or hash as needed, allowing the object itself to be
1392             used as a reference to its tied array and hash interfaces. There is a
1393             caveat, however, so read on.
1394              
1395             =cut
1396              
1397             sub _as_tied_array {
1398             # To avoid confusing bugs due to typos producing overloaded dereferences
1399             # instead of intended accesses to the internal object, this feature
1400             # cannot be used from within this module.
1401 168 100   168   7644 if (scalar caller =~ m/^WARC::Fields/) {
1402 1         4 local $Carp::CarpLevel = 1;
1403 1         148 confess "overloaded array dereference in internal code"
1404             }
1405              
1406 167         245 my $self = shift;
1407              
1408 167 100       665 return $$self->[C_TA] if defined $$self->[C_TA];
1409              
1410 3         5 my @array; $$self->[C_TA] = \@array;
  3         7  
1411 3         5 Scalar::Util::weaken ${tie @array, ref $self, $self};
  3         12  
1412 3         12 return $$self->[C_TA];
1413             }
1414              
1415             sub _as_tied_hash {
1416             # To avoid confusing bugs due to typos producing overloaded dereferences
1417             # instead of intended accesses to the internal object, this feature
1418             # cannot be used from within this module.
1419 266 100   266   8333 if (scalar caller =~ m/^WARC::Fields/) {
1420 1         3 local $Carp::CarpLevel = 1;
1421 1         70 confess "overloaded hash dereference in internal code"
1422             }
1423              
1424 265         429 my $self = shift;
1425              
1426 265 100       1127 return $$self->[C_TH] if defined $$self->[C_TH];
1427              
1428 61         115 my %hash; $$self->[C_TH] = \%hash;
  61         105  
1429 61         120 Scalar::Util::weaken ${tie %hash, ref $self, $self};
  61         219  
1430 61         289 return $$self->[C_TH];
1431             }
1432              
1433             =head3 Reference Count Trickery with Overloaded Dereference Operators
1434              
1435             To avoid problems, the underlying tied object is a reference to the parent
1436             object. For ordinary use of C, this is a strong reference, however,
1437             the anonymous tied array and hash are cached in the object to avoid having
1438             to C a new object every time the dereference operators are used.
1439              
1440             To prevent memory leaks due to circular references, the overloaded
1441             dereference operators tie a I reference to the parent object. The
1442             tied aggregate always holds a strong reference to its object, but when the
1443             dereference operators are used, that inner object is a I reference to
1444             the actual C object.
1445              
1446             The caveat is thus: do not attempt to save a reference to the array or hash
1447             produced by dereferencing a C object. The parent
1448             C object must remain in scope for as long as any anonymous
1449             tied aggregates exist.
1450              
1451             =cut
1452              
1453             1;
1454             __END__