File Coverage

blib/lib/WARC/Record/Logical/Block.pm
Criterion Covered Total %
statement 159 159 100.0
branch 64 66 96.9
condition 28 29 96.5
subroutine 21 21 100.0
pod n/a
total 272 275 98.9


line stmt bran cond sub pod time code
1             package WARC::Record::Logical::Block; # -*- CPerl -*-
2              
3 2     2   54349 use strict;
  2         10  
  2         55  
4 2     2   9 use warnings;
  2         5  
  2         80  
5              
6             our @ISA = qw();
7              
8 2     2   320 use WARC; *WARC::Record::Logical::Block::VERSION = \$WARC::VERSION;
  2         4  
  2         84  
9              
10 2     2   10 use Carp;
  2         4  
  2         115  
11 2     2   13 use Fcntl qw/SEEK_SET SEEK_CUR SEEK_END/;
  2         3  
  2         428  
12              
13             # This implementation uses an array as the underlying object.
14              
15 2     2   11 use constant { PARENT => 0, SEGMENT => 1, HANDLE => 2 };
  2         3  
  2         142  
16 2     2   11 use constant OBJECT_INIT => undef, 0, undef;
  2         4  
  2         92  
17              
18             # Invariant: HANDLE is always valid if SEGMENT is within range.
19              
20 2     2   407 BEGIN { require WARC::Record::Logical; }
21             BEGIN { $WARC::Record::Logical::Block::{$_} = $WARC::Record::Logical::{$_}
22 2     2   674 for WARC::Record::Logical::SEGMENT_INDEX; }
23              
24             sub _dbg_dump {
25 60     60   2850 my $self = shift;
26              
27 60         76 my $out = 'logical record block';
28             $out .= ' @['.($self->[SEGMENT]).' / '
29 60         117 .($#{$self->[PARENT]{segments}}).']';
  60         184  
30 60 100       100 $out .= ' [EOF]' if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  60         143  
31 60         83 $out .= "\n";
32 60 50 66     206 $out .= ' '.((tied $self->[HANDLE])->_dbg_dump)
33             if $self->[HANDLE] && tied $self->[HANDLE];
34              
35 60         147 return $out;
36             }
37              
38             sub TIEHANDLE {
39 21     21   5274 my $class = shift;
40 21         28 my $parent = shift;
41              
42 21         41 my $ob = [OBJECT_INIT];
43 21         30 $ob->[PARENT] = $parent;
44 21         62 $ob->[HANDLE] = $parent->{segments}[0][SEG_REC]->open_block;
45              
46 21         3535 bless $ob, $class;
47             }
48              
49             # advance to next segment; return false if already at last segment
50             sub _next_segment {
51 77     77   162 my $self = shift;
52              
53 77 100       153 unless ($self->[PARENT]{segments}[1+$self->[SEGMENT]])
54 15         21 { $self->[SEGMENT]++; return 0 }
  15         45  
55              
56 62         115 close $self->[HANDLE];
57             $self->[HANDLE] =
58 62         255 $self->[PARENT]{segments}[++$self->[SEGMENT]][SEG_REC]->open_block;
59 62         8250 return $self->[HANDLE];
60             }
61              
62             sub READLINE {
63 57     57   3508 my $self = shift;
64              
65 57 100       84 return undef if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  57         179  
66              
67 52 100       180 if (not defined $/) { # file slurp; we might run out of memory...
    100          
    100          
68 4         12 my $data = readline $self->[HANDLE];
69 4 100       10 $data = '' unless defined $data;
70 4         8 $data .= readline $self->[HANDLE] while $self->_next_segment;
71 4 100       23 return (length($data) ? $data : undef);
72             } elsif (ref $/) { # record read; fill a block...
73 2     2   15 use bytes;
  2         4  
  2         21  
74 10         15 my $rec_len = 0+${$/};
  10         14  
75 10         28 my $rec = readline $self->[HANDLE];
76 10 100       18 $rec = '' unless defined $rec;
77 10         24 while (length $rec < $rec_len) {
78 9         11 local $/ = \(do {$rec_len - length $rec});
  9         28  
79 9 100       16 last unless $self->_next_segment;
80 8         26 $rec .= readline $self->[HANDLE];
81             }
82 10 100       64 return (length ($rec) ? $rec : undef);
83             } elsif ($/ eq '') { # paragraph read; delimiter is empty line...
84 11         16 my $para = '';
85 11         32 my $input = readline $self->[HANDLE];
86              
87 11   100     1181 while (((defined $input and $para .= $input) or $self->_next_segment)
      100        
88             and ("\n\n" ne substr $para, -2))
89             # segment boundary was in the middle of a line, continuing read is safe
90 13         320 { $input = readline $self->[HANDLE] }
91             # paragraph delimiter may or may not span segments...
92 11 100       426 if (eof $self->[HANDLE]) {
93             # next segment may begin with newlines, if so, they must be read
94 6         97 my $ch; my $end_pos = $self->TELL;
  6         11  
95 6   100     48 $end_pos = $self->TELL, $para .= "\n"
96             while (defined ($ch = $self->GETC) and $ch eq "\n");
97 6         16 $self->SEEK($end_pos, SEEK_SET);
98 6 100       170 $input = defined($ch) ? '' : undef;
99             }
100 11 100       85 return (length ($para) ? $para : undef);
101             } else { # ordinary line read...
102 2     2   480 use bytes; # a line delimiter may be split on any octet boundary
  2         4  
  2         9  
103 27         37 my $line = '';
104 27         69 my $input = readline $self->[HANDLE];
105              
106             # read more data until we have a complete line or reach EOF
107 27   100     256 while (((defined $input and $line .= $input) or $self->_next_segment)
      100        
108             and ($/ ne substr $line, -length $/)) {
109 49 100       88 if (length $/ > 1) {
110             # each number N in this array indicates that a length-N prefix of
111             # $/ currently matches at the end of the line buffer
112             my @prefixes =
113 22         38 grep {substr($/, 0, $_) eq substr($line, -$_)} 1 .. length $/;
  128         234  
114 22         44 while (@prefixes) {
115 39         68 my $count = $self->READ($input, 1);
116 39 100       63 return $line unless $count;
117 38         47 $line .= $input;
118 38 100       86 return $line if $/ eq substr $line, -length $/;
119 34         47 unshift @prefixes, 0; @prefixes = # sieve prefixes
120 34         41 grep {++$_; substr($/, 0, $_) eq substr($line, -$_)} @prefixes;
  92         96  
  92         191  
121             } # loop iterates until no prefix of $/ matches the tail of $line
122             }
123              
124 44         189 $input = readline $self->[HANDLE];
125             }
126 22 100       47 return $input unless length $line;
127 21         109 return $line;
128             }
129             }
130              
131             # This sub must rely on the aliasing effect of @_.
132             sub READ {
133 70     70   317 my $self = shift;
134             # args now: 0: buffer 1: length 2: offset into buffer or undef
135 70         85 my $length = $_[1];
136 70   100     153 my $offset = $_[2] || 0;
137              
138 70 100       90 return 0 if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  70         133  
139              
140 68         89 my $buf = ''; my $count = 1; my $bpos = 0;
  68         82  
  68         69  
141 68   100     138 while ($length && ($count || $self->_next_segment)) {
      100        
142 99         177 $count = read $self->[HANDLE], $buf, $length, $bpos;
143 99 50       686 return undef unless defined $count;
144 99         105 $length -= $count; $bpos += $count;
  99         184  
145             }
146              
147 68 100       102 $_[0] = '' unless defined $_[0];
148 68 100       105 $_[0] .= "\0" x ($offset - length($_[0])) if $offset > length $_[0];
149 68         137 substr $_[0], $offset, (length($_[0]) - $offset), $buf;
150 68         124 return $bpos;
151             }
152              
153             sub GETC {
154 22     22   307 my $self = shift;
155              
156 22         26 my $ch;
157 22 100       37 return undef unless $self->READ($ch, 1);
158 18         55 return $ch;
159             }
160              
161             sub EOF {
162 15     15   286 my $self = shift;
163              
164 15 100       21 return 1 if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  15         73  
165 2 100       2 return 0 if $self->[SEGMENT] < $#{$self->[PARENT]{segments}};
  2         9  
166 1         4 return eof $self->[HANDLE];
167             }
168              
169             sub SEEK {
170 12     12   19 my $self = shift;
171 12         14 my $offset = shift;
172 12         15 my $whence = shift;
173              
174 12         18 my $segments = $self->[PARENT]{segments};
175              
176 12         12 my $npos;
177              
178 12 100       28 if ($whence == SEEK_SET) { $npos = $offset }
  8 100       9  
    100          
179 1         3 elsif ($whence == SEEK_CUR) { $npos = $self->TELL + $offset }
180             elsif ($whence == SEEK_END)
181 2         4 { $npos = ($segments->[-1][SEG_BASE]
182             + $segments->[-1][SEG_LENGTH] + $offset) }
183 1         195 else { croak "unknown WHENCE $whence in call to seek" }
184              
185             # This function must be able to seek backwards, even if the underlying
186             # handles cannot, to support paragraph reads. Seeking generally requires
187             # finding the new segment, switching to that segment, and seeking
188             # forwards to the desired offset within the new segment.
189              
190             # Special handling for seek to or past end-of-file
191 11 100       25 if ($npos >= ($segments->[-1][SEG_BASE] + $segments->[-1][SEG_LENGTH])) {
192 4         7 $self->[SEGMENT] = @$segments;
193 4         11 close $self->[HANDLE]; $self->[HANDLE] = undef;
  4         39  
194 4         72 return 1;
195             }
196              
197 7         11 my $new_segment_index = @$segments;
198 7         15 for (my $i = 0; $i < $new_segment_index; $i++) {
199 23 100 100     70 $new_segment_index = $i # which also exits this loop
200             if $npos >= $segments->[$i][SEG_BASE]
201             && $npos < ($segments->[$i][SEG_BASE] + $segments->[$i][SEG_LENGTH]);
202             }
203              
204 7         8 my $new_segment = $segments->[$new_segment_index];
205 7 100       16 return 0 unless $new_segment;
206              
207 6         8 $self->[SEGMENT] = $new_segment_index;
208 6         14 close $self->[HANDLE];
209 6         45 $self->[HANDLE] = $new_segment->[SEG_REC]->open_block;
210 6         2495 seek $self->[HANDLE], $npos - $new_segment->[SEG_BASE], SEEK_SET;
211             }
212              
213             sub TELL {
214 15     15   266 my $self = shift;
215              
216 15         21 my $segments = $self->[PARENT]{segments};
217              
218             return ($segments->[$self->[SEGMENT]][SEG_BASE] + (tell $self->[HANDLE]))
219 15 100       19 if $self->[SEGMENT] <= $#{$segments};
  15         66  
220 3         6 return ($segments->[-1][SEG_BASE] + $segments->[-1][SEG_LENGTH]);
221             }
222              
223             sub CLOSE {
224 1     1   2 my $self = shift;
225 1         2 $self->[SEGMENT] = 1+$#{$self->[PARENT]{segments}};
  1         4  
226 1         3 close $self->[HANDLE]; $self->[HANDLE] = undef;
  1         4  
227             }
228              
229             1;
230             __END__