File Coverage

blib/lib/File/Stream.pm
Criterion Covered Total %
statement 86 132 65.1
branch 14 40 35.0
condition 6 18 33.3
subroutine 17 27 62.9
pod 4 4 100.0
total 127 221 57.4


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             File::Stream - Regular expression delimited records from streams
5              
6             =head1 SYNOPSIS
7              
8             use File::Stream;
9             my $stream = File::Stream->new($filehandle);
10            
11             $/ = qr/\s*,\s*/;
12             print "$_\n" while <$stream>;
13              
14             # or:
15             ($handler, $stream) = File::Stream->new(
16             $filehandle,
17             read_length => 1024,
18             separator => qr{to_be_used_instead_of_$/},
19             );
20            
21             while(<$stream>) {...}
22             my $line = $stream->readline(); # similar
23            
24             # extended usage:
25             use URI;
26             my $uri = URI->new('http://steffen-mueller.net');
27             my ($pre_match, $match) =
28             $handler->find('literal_string', qr/regex/, $uri);
29            
30             # $match contains whichever argument to find() was found first.
31             # $pre_match contains all that was before the first token that was found.
32             # both the contents of $match and $pre_match have been removed from the
33             # data stream (buffer).
34            
35             # Since version 2.10 of the module, you can use seek() and tell() on
36             # File::Stream objects:
37             my $position = tell($stream);
38             # ...
39             seek($stream, 0, $position); # rewind
40              
41             =head1 DESCRIPTION
42              
43             Perl filehandles are streams, but sometimes they just aren't powerful enough.
44             This module offers to have streams from filehandles searched with regexes
45             and allows the global input record separator variable to contain regexes.
46              
47             Thus, readline() and the <> operator can now return records delimited
48             by regular expression matches.
49              
50             There are some very important gripes with applying regular expressions to
51             (possibly infinite) streams. Please read the CAVEATS section of this
52             documentation carfully.
53              
54             =head2 EXPORT
55              
56             None.
57              
58             =cut
59              
60             package File::Stream;
61              
62 2     2   34946 use 5.006;
  2         9  
  2         69  
63 2     2   10 use strict;
  2         6  
  2         103  
64 2     2   9 use warnings;
  2         4  
  2         75  
65              
66 2     2   1967 use FileHandle;
  2         26670  
  2         15  
67 2     2   889 use Carp;
  2         6  
  2         133  
68 2     2   2531 use YAPE::Regex;
  2         65987  
  2         19  
69              
70 2     2   617 use vars qw/$End_Of_String/;
  2         3  
  2         557  
71             our $VERSION = '2.30';
72              
73             =head2 new
74              
75             The new() constructor takes a filehandle (or a glob reference) as first
76             argument. The following arguments are interpreted as key/value pairs with
77             the following parameters being defined:
78              
79             =over 4
80              
81             =item separator
82              
83             This may be set to either a string, a compiled regular expression (qr//), or
84             an object which is overloaded in stringification context to return a string.
85              
86             If separator is set, its value will be used in calls to readline() (including
87             the <> diamond operator which calls readline internally) instead of $/,
88             the global input record separator.
89              
90             String will be interpreted as literal strings, not regular expressions.
91              
92             =item read_length
93              
94             Defaults to 1024 bytes. Sets the number of bytes to read into the internal
95             buffer at once. Large values may speed up searching considerably, but
96             increase memory usage.
97              
98             =item die_on_anchors
99              
100             Defaults to true. If set, any call to C will die on
101             regular expressions which contain anchors (C<^ and $>). Usually, those
102             do not make much sense in regular expressions applied to streams.
103             If, however, you're using the C option, these anchors match
104             the start and end of each line respectively. In that case, you can suppress
105             the fatal error by setting C 0>.
106              
107             =back
108              
109             The new() method returns a fresh File::Stream object that has been tied to
110             be a filehandle and a filehandle. All the usual file operations should work
111             on the filehandle and the File::Stream methods should work on the object.
112              
113             =cut
114              
115             sub new {
116 1     1 1 837 my $proto = shift;
117 1   33     7 my $class = ref($proto) || $proto;
118              
119 1         11 my $filehandle = new FileHandle;
120 1         52 my $handler = tie *$filehandle => $class, @_;
121 1         4 return ( $handler, $filehandle );
122             }
123              
124             =head2 readline
125              
126             The readline method on a File::Stream object works just like the
127             builtin except that it uses the objects record separator instead
128             of $/ if it has been set via new() and honours regular expressions.
129              
130             This is also internally used when readline() is called on the tied
131             filehandle.
132              
133             =cut
134              
135             sub readline {
136 4     4 1 392 my $self = shift;
137 4         8 my $separator = $self->{separator};
138 4 50       12 $separator = $/ if not defined $separator;
139 4         12 my ( $pre_match, $match ) = $self->find($separator);
140 4 50       9 if ( not defined $pre_match ) {
141 0         0 my $buf = $self->{buffer};
142 0         0 $self->{buffer} = '';
143 0 0       0 return undef if $buf eq '';
144 0         0 return $buf;
145             }
146 4         12 return $pre_match . $match;
147             }
148              
149             =head2 find
150              
151             Finds the first occurrance one of its arguments in the stream. For example,
152              
153             $stream_handler->find('a', 'b');
154              
155             finds the first character 'a' or 'b' in the stream whichever comes first.
156             Returns two strings: The data read from the stream I the match
157             and the match itself. The arguments to find() may be regular expressions,
158             but please see the CAVEATS section of this documentation about that.
159             If any of the arguments is an object, it will be evaluated in stringification
160             context and the result of that will be matched I, ie. not as a
161             regular expression.
162              
163             As with readline(), this is a method on the stream handler object.
164              
165             =cut
166              
167             sub find {
168 7     7 1 1511 my $self = shift;
169 7         13 my @terms = @_;
170 2     2   12 use re 'eval';
  2         10  
  2         2597  
171 7         9 $End_Of_String = 0;
172 10         42 my @regex_tokens =
173             map {
174             my $yp = YAPE::Regex->new($_);
175 10         221 my $str = '';
176 10         10 my $token;
177 10         29 while ($token = $yp->next()) {
178 36         3569 my $tstr = $token->string();
179 36 50 33     361 if ($self->{die_on_anchors} and $tstr eq '^' or $tstr eq '$') {
      33        
180 0         0 croak "Invalid use of anchors (here: '$tstr') in a ",
181             "regular expression that will be\n",
182             "applied to a stream";
183             }
184 36         110 $str .= $tstr .
185             '(?:\z(?{$End_Of_String++})(?!)|)';
186             }
187 10         1122 qr/$str/;
188             }
189             map {
190 7 100       12 if ( not ref($_) ) {
  10 50       23  
191 6         64 qr/\Q$_\E/
192             }
193             elsif ( ref($_) eq 'Regexp' ) {
194 4         8 $_
195             }
196             else {
197 0         0 my $string = "$_";
198 0         0 qr/\Q$string\E/
199             }
200             } @terms;
201            
202 7         28 my $re = '(' . join( ')|(', @regex_tokens ) . ')';
203 7         759 my $compiled = qr/$re/s;
204              
205 7         14 while (1) {
206 9         139 my @matches = $self->{buffer} =~ $compiled;
207 9 100 66     41 if ($End_Of_String or not @matches) {
208 2         4 $End_Of_String = 0;
209 2 50       6 return undef unless $self->fill_buffer();
210 2         4 next;
211             }
212             else {
213 7         8 my $index = undef;
214 7         17 for ( 0 .. $#matches ) {
215 8 100       23 $index = $_, last if defined $matches[$_];
216             }
217 7 50       12 die if not defined $index; # sanity check
218 7         8 my $match = $matches[$index];
219 7 50       162 $self->{buffer} =~ s/^(.*?)\Q$match\E//s or die;
220 7         81 return ( $1, $match );
221             }
222             }
223             }
224              
225             =head2 fill_buffer
226              
227             It is unlikely that you will need to call this method directly.
228             Reads more data from the internal filehandle into the buffer.
229             First argument may be the number of bytes to read, otherwise the
230             'read_length' attribute is used.
231              
232             Again, call this on the handler object, not the file handle.
233              
234             =cut
235              
236             sub fill_buffer {
237 2     2 1 3 my $self = shift;
238 2   33     11 my $length = shift || $self->{read_length};
239 2         3 my $data;
240 2         39 my $bytes = read( $self->{fh}, $data, $length );
241 2 50       5 return 0 if not $bytes;
242 2         7 $self->{buffer} .= $data;
243 2         6 return $bytes;
244             }
245              
246             sub TIEHANDLE {
247 1     1   2 my $class = shift;
248 1         2 my $fh = shift;
249 1         7 my $self = {
250             fh => $fh,
251             read_length => 1024,
252             separator => undef,
253             buffer => '',
254             die_on_anchors => 1,
255             @_
256             };
257 1         4 bless $self => $class;
258             }
259              
260 3     3   381 sub READLINE { goto &readline; }
261              
262             sub PRINT {
263 0     0   0 my $self = shift;
264 0 0       0 my $buf = join( defined $, ? $, : "", @_ );
265 0 0       0 $buf .= $\ if defined $\;
266 0         0 $self->WRITE( $buf, length($buf), 0 );
267             }
268              
269             sub PRINTF {
270 0     0   0 my $self = shift;
271              
272 0         0 my $buf = sprintf( shift, @_ );
273 0         0 $self->WRITE( $buf, length($buf), 0 );
274             }
275              
276             sub GETC {
277 0     0   0 my $self = shift;
278              
279 0         0 my $buf;
280 0         0 $self->READ( $buf, 1 );
281 0         0 return $buf;
282             }
283              
284             sub READ {
285 0 0   0   0 croak if @_ < 3;
286 0         0 my $self = shift;
287 0         0 my $bufref = \$_[0];
288 0 0       0 $$bufref = '' if not defined $$bufref;
289 0         0 my ( undef, $len, $offset ) = @_;
290 0 0       0 $offset = 0 if not defined $offset;
291 0 0       0 if ( length $self->{buffer} < $len ) {
292 0         0 my $bytes = 0;
293 0   0     0 while ( $bytes = $self->fill_buffer()
294             and length( $self->{buffer} ) < $len )
295             { }
296              
297 0 0       0 if ( not $bytes ) {
298 0         0 my $length_avail = length( $self->{buffer} );
299 0         0 substr( $$bufref, $offset, $length_avail,
300             substr( $self->{buffer}, 0, $length_avail, '' ) );
301 0         0 return $length_avail;
302             }
303              
304             # only reached if buffer long enough.
305             }
306 0         0 substr( $$bufref, $offset, $len, substr( $self->{buffer}, 0, $len, '' ) );
307 0         0 return $len;
308             }
309              
310             sub WRITE {
311 0     0   0 my $self = $_[0];
312 0         0 my $fh = $self->{fh};
313 0         0 print $fh substr( $_[1], 0, $_[2] );
314             }
315              
316             sub TELL {
317 2     2   901 my $self = $_[0];
318 2         9 my $foffset = tell $self->{fh};
319 2         10 return $foffset - length($self->{buffer});
320             }
321              
322             sub SEEK {
323 1     1   6700 my $self = $_[0];
324 1         7 seek $self->{fh}, $_[1], $_[2];
325 1         3 $self->{buffer} = '';
326 1         6 return 1;
327             }
328              
329 0 0   0   0 sub EOF { not length( $_[0]->{buffer} ) and eof( *{ $_[0]->{fh} } ) }
  0         0  
330 0     0   0 sub FILENO { fileno( *{ $_[0]->{fh} } ) }
  0         0  
331 0     0   0 sub BINMODE { binmode( *{ $_[0]->{fh} }, @_ ) }
  0         0  
332              
333 0     0   0 sub CLOSE { close( *{ $_[0]->{fh} } ) }
  0         0  
334 0     0   0 sub UNTIE { close( *{ $_[0]->{fh} } ) }
  0         0  
335 1     1   565 sub DESTROY { close( *{ $_[0]->{fh} } ) }
  1         132  
336              
337             1;
338             __END__