File Coverage

blib/lib/File/Bidirectional.pm
Criterion Covered Total %
statement 128 145 88.2
branch 65 104 62.5
condition 23 39 58.9
subroutine 18 23 78.2
pod 7 7 100.0
total 241 318 75.7


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2              
3 2     2   114025 use strict;
  2         6  
  2         104  
4 2     2   14 use warnings;
  2         4  
  2         199  
5              
6             package File::Bidirectional;
7 2     2   13 use Carp;
  2         8  
  2         202  
8 2     2   13 use Fcntl qw/:seek O_RDONLY/;
  2         5  
  2         940  
9              
10             our $VERSION = '0.01';
11              
12             =pod
13              
14             =head1 NAME
15              
16             File::Bidirectional - Read a file line-by-line either forwards or backwards
17              
18             =head1 SYNOPSIS
19              
20             use File::Bidirectional;
21             my $file = "/var/log/large_file";
22              
23             # Object interface
24              
25             # start from the last line
26             my $fh = File::Bidirectional->new($file, {origin => -1})
27             or die $!;
28              
29             # read backwards until point of interest
30             while (my $line = $fh->readline()) {
31             last if $line =~ /RECORD_START/;
32             }
33              
34             # switch directions
35             $fh->switch();
36              
37             # read forwards until point of interest
38             while (my $line = $fh->readline()) {
39             last if $line =~ /RECORD_END/;
40             }
41              
42             # Tied Handle Interface
43              
44             local *F;
45             tie *F, "File::Bidirectional", $file, {origin => 1}
46             or die $!;
47              
48             while (my $line = ) { ... }
49              
50             (tied *F)->switch();
51              
52             =head1 DESCRIPTION
53              
54             File::Bidirectional reads a file line-by-line in either the forwards or
55             backwards direction. It supports an object interface as well as a tied
56             filehandle interface, and should be straight-forward to use. It is also memory
57             efficient, since it is intended to be used on files too large to be efficiently
58             slurped into an array and traversed backwards.
59              
60             The direction in which to traverse the file can be changed at anytime, but it is
61             important to note that the last-read line will be repeated when this happens.
62             See C to see why this is so.
63              
64             On non-Unix platforms, this module attempts to immitate native Perl in
65             converting the line endings. Currently, this is limited and untested, so please
66             see L for more information.
67              
68             =head1 MOTIVATION
69              
70             I had a C file describing the changes in a large (> 200MB) file. Based on
71             the line numbers in the C, I have to repeatedly read backwards and
72             forwards in the large file to obtain the context lines before and after the
73             C changes. The number of context lines vary, thus it was a little more
74             involved than regenerating the C with an appropriate C<--context> option.
75              
76             I decided to publish this module as I thought others might have similar needs.
77             Reading large log files backwards is probably the most common of these, but if
78             you have any other interesting uses, do let me know.
79              
80             =cut
81              
82             # globals
83             my ($BLOCK_SIZE);
84             BEGIN {
85             # defaults - can be changed through constructor
86 2     2   4 $BLOCK_SIZE = 1024 * 8;
87              
88             # _read_line() and _eof() are used as sensible defaults. we will fix up the
89             # aliases again later to optimize away one indirection function call
90              
91             # tied interface
92 2         8 *TIEHANDLE = \&new;
93 2         6 *READLINE = \&_read_line;
94 2         6 *EOF = \&_eof;
95 2         5 *CLOSE = \&close;
96 2         6 *TELL = \&tell;
97              
98             # IO::Handle compatability
99 2         5 *getline = \&_read_line;
100              
101             # File::ReadBackwards compatability
102 2         6 *get_handle = \&fh;
103              
104             # aliases
105 2         4 *readline = \&_read_line;
106 2         5057 *eof = \&_eof;
107             }
108              
109             =pod
110              
111             =head1 CONSTRUCTOR (CLASS METHODS)
112              
113             =over
114              
115             =item new $file, \%option
116              
117             $fh = File::Bidirectional->new($file);
118             $fh = File::Bidirectional->new($file, {mode => 'forward'});
119             $fh = File::Bidirectional->new($file, {mode => 'backward'});
120             $fh = File::Bidirectional->new($file, {origin => -1});
121             $fh = File::Bidirectional->new($file, \%option);
122              
123             Has the file name as the first parameter, and a hashref of options as an
124             optional second parameter. Upon success, it will return the object. For invalid
125             parameters, it will C. For L errors, it returns
126             undef and sets the error code in L.
127              
128             The list of valid options are:
129              
130             =over
131              
132             =item mode
133              
134             Can be either C (bi-directional), C or C. The C
135             and C modes are restrictive: the file is read from the first and last
136             line respectively, and switching directions is prohibited. The C mode
137             allows direction switching, and will start from the first line by default (use
138             the C option to change that.) The default is C.
139              
140             =item origin
141              
142             Can be either C<1> or C<-1>. These denote whether the first or last line of the
143             file is considered as line 1 by C. (C will always start
144             from line 1.) C can only be set if the C option is C. The
145             default is C<1>.
146              
147             =item binmode
148              
149             Can be any true or false expression. It is analogous to the L
150             built-in function. On systems that distinguish between binary and text files,
151             notably DOS and Windows-based systems, this is important. A true value will
152             preserve C<\r\n> as is; a false value will convert C<\r\n> to C<\n>. The
153             default is false.
154              
155             =item separator
156              
157             Can be any scalar string. It is analogous to the L variable.
158             C determines C's notion of what a line is. The
159             default is L, which in turn defaults to C<"\n">.
160              
161             Caveat: The Perl-ish magic that occurs when L is C<""> does not
162             happen yet.
163              
164             =item regex
165              
166             Can be any true or false expression. It determines whether the C
167             option is a regex or a string. The default is false.
168              
169             =item block_size
170              
171             Can be any positive integer. This is the size of a single block read by the
172             underlying L. The default is 8192.
173              
174             =back
175              
176             =back
177              
178             =head1 INSTANCE METHODS
179              
180             =cut
181              
182              
183             sub new {
184 206     206 1 2265608 my ($class, $file, $option) = @_;
185 206 50       813 croak "expected class method"
186             unless defined $class;
187 206 50       609 croak "expected filename"
188             unless defined $file;
189 206 50 66     7064 croak "expected hashref for parameters"
190             unless !defined $option || ref($option) eq 'HASH';
191              
192             # block size and buffer size
193 206   33     1088 my $block_size = $option->{block_size} || $BLOCK_SIZE;
194 206 50 33     2195 croak "expected block_size to be positive integer"
195             unless $block_size =~ /^\d+$/ && $block_size > 0;
196              
197             # default separator is $/
198 206         562 my $sep = $option->{separator};
199 206 50       851 $sep = $/
200             unless defined $sep;
201              
202             # default is not to treat separator as regex
203 206         421 my $sep_re = $option->{regex};
204 206 50       519 $sep_re = 0
205             unless defined $sep_re;
206              
207             # pre-compile regular expression
208 206 50       2735 my $re = ($sep_re) ? qr/(.*?$sep|.+)/ : qr/(.*?\Q$sep\E|.+)/;
209              
210             # translation takes place on DOS (without binmode), Mac etc.
211 206         505 my $binmode = $option->{binmode};
212 206 50 33     768 my $translate =
    50          
213             (_is_dos() && !$binmode) ? qr/\015\012/ :
214             (_is_mac()) ? qr/\015/ :
215             undef;
216              
217             # default mode is bidirectional
218 206         868 my $mode = $option->{mode};
219 206 50 66     13140 croak "expected mode to be [bi|forward|backward]"
220             unless !defined $mode || $mode =~ /^(bi|forward|backward)$/;
221 206 100       514 $mode = 'bi' unless defined $mode;
222              
223             # origin can only be explicitly set for bidirectional
224 206         311 my $origin = $option->{origin};
225 206 50 66     894 croak "expected origin only for mode \"bi\""
226             unless !defined $origin || $mode eq 'bi';
227 206 50 66     1449 croak "expected origin to be [1|-1]"
228             unless !defined $origin || $origin =~ /^(1|-1)$/;
229 206 100       420 if (!defined $origin) {
230 70 50       238 $origin =
    100          
    100          
231             ($mode eq 'bi') ? 1 :
232             ($mode eq 'forward') ? 1 :
233             ($mode eq 'backward') ? -1 : undef;
234             }
235              
236             # file size
237 206         4037 my $file_size = -s $file;
238              
239             # set starting point of cursor to coincide with the origin
240 206 100       869 my $start = ($origin == 1) ? 0 : $file_size;
241              
242 206 50       21280 sysopen my $fh, $file, O_RDONLY
243             or return undef;
244 206         658 binmode $fh;
245              
246 206         4891 my $x = {
247             mode => $mode, # mode
248             fh => $fh, # filehandle
249             cur => $start, # physical cursor on filehandle
250             buffer => [], # buffer
251             origin => $origin, # 1: first line as line 1 / -1: last line as line 1
252             move => $origin, # 1: moving forwards / -1: moving backwards
253             line => 0, # forward: line read / backward: line to be read
254             re => $re, # regular expression for separator
255             translate => $translate,
256             file_size => $file_size,
257             block_size => $block_size,
258             };
259              
260 206         550 bless ($x, $class);
261              
262             # fixup the aliases to save a method call for readline
263 206         731 $x->_fixup_alias();
264              
265 206         1267 return $x;
266             }
267              
268             =pod
269              
270             =over
271              
272             =item readline
273              
274             while (my $line = $fh->readline()) { ... }
275              
276             Returns the subsequent line. This refers either to the next line when the
277             direction is forwards, or to the previous line when the direction is backwards.
278             The direction can be changed with C. C is returned when there
279             are no more lines to be read.
280              
281             =item getline
282              
283             An alias for C. It exists for compatability with the IO::* classes.
284              
285             =item eof
286              
287             Returns true when C will return an C, false otherwise.
288              
289             =item switch
290              
291             $fh->switch();
292              
293             Switches the current direction in which we are reading the file. It will
294             L if the C option in the constructor is set to C or
295             C.
296              
297             Note that switching directions will cause the last-read line to be repeated by
298             C.
299              
300             =cut
301              
302             # reverse movement direction
303             sub switch {
304 1932     1932 1 16178 my ($x) = @_;
305              
306 1932 50       21920 croak "needs to be bidirectional mode to switch directions"
307             unless $x->{mode} eq 'bi';
308              
309             # get current tell() before changing direction and invalidating the buffer
310 1932         6594 $x->{cur} = $x->tell();
311              
312             # invalidate the buffer
313 1932         2962 undef @{$x->{buffer}};
  1932         270428  
314              
315             # change direction
316 1932         5176 $x->{move} *= -1;
317              
318             # fixup aliases for readline() and eof() after changing direction
319 1932         7079 $x->_fixup_alias();
320             }
321              
322             =pod
323              
324             =item close
325              
326             $fh->close();
327              
328             Closes the underlying filehandle and releases the memory allocated for its
329             buffer. On success it returns true, otherwise it returns false with the error
330             code found in L. All subsequent C calls will return
331             undef, and C, its last value.
332              
333             =cut
334              
335             # close file and destroy state
336             sub close {
337 205     205 1 3631222 my ($x) = @_;
338 205         1515 undef @{$x->{buffer}};
  205         32701  
339 205 100       805 $x->{cur} = ($x->{move} == 1) ? $x->{file_size} : 0;
340 205 50       43420 CORE::close($x->{fh})
341             or return undef;
342 205         823 return 1;
343             }
344              
345             =pod
346              
347             =item direction $direction
348              
349             Takes an optional parameter: 1 for reading forwards, -1 for reading backwards,
350             L otherwise. If an argument for the parameter is provided, the
351             direction will be switched if necessary. Either way, it returns the (new)
352             direction.
353              
354             =cut
355              
356             sub direction {
357 0     0 1 0 my ($x, $direction) = @_;
358 0 0 0     0 croak "expected direction to be [1|-1]"
359             unless !defined $direction || $direction =~ /^(1|-1)$/;
360              
361 0 0 0     0 if (defined $direction && $direction != $x->{move}) {
362 0         0 $x->switch();
363             }
364              
365 0         0 return $x->{move};
366             }
367              
368             =pod
369              
370             =item line_num
371              
372             my $fh=File::Bidirectional->new($file); n=$fh->line_num(); # n = 0
373             $fh->readline(); n=$fh->line_num(); # n = 1
374             $fh->readline(); n=$fh->line_num(); # n = 2
375             $fh->switch(); n=$fh->line_num(); # n = 2
376             $fh->readline(); n=$fh->line_num(); # n = 1
377             $fh->readline(); n=$fh->line_num(); # n = 0
378              
379             Returns the current line number. It is analogous to L.
380              
381             For a file with I logical lines, the line number ranges from 0 to I. When
382             reading away from the origin (forwards if the first line is the origin), its
383             behavior is always identical to that of L - it refers to the number
384             of lines that has been read. When reading towards the origin, it refers to the
385             number of lines that can still be read.
386              
387             When C is called, the direction is changed, but the line number
388             remains the same. Therefore, the last-read line before changing directions will
389             be repeated by C.
390              
391             =cut
392              
393             # current line number, 1-based
394             # forward: the line that has just been read
395             # backward: the line that is going to be read
396             sub line_num {
397 0     0 1 0 my ($x) = @_;
398 0         0 return $x->{line};
399             }
400              
401             =pod
402              
403             =item tell
404              
405             Returns the current position of the filehandle.
406              
407             =cut
408              
409             # logical cursor on filehandle
410             sub tell {
411 2082     2082 1 4300 my ($x) = @_;
412 2082         5013 my $pos = 0;
413 2082         8277 for my $s (@{$x->{buffer}}) {
  2082         6847  
414 2175866         3239603 $pos += length $s;
415             }
416 2082 100       22981 return ($x->{move} == 1) ? $x->{cur} - $pos : $x->{cur} + $pos;
417             }
418              
419             =pod
420              
421             =item fh
422              
423             Returns the underlying filehandle. This is mainly useful for file-locking.
424              
425             Notice that this actually breaks the encapsulation of File::Bidirectional,
426             therefore it becomes the user's responsibility to ensure that nothing bad
427             happens to the underlying filehandle. For example, it should definitely not be
428             closed.
429              
430             The underlying filehandle will be returned with its seek position set to what is
431             returned by C. It should generally be okay for this seek position to be
432             modified (the object remembers its own seek position and will always restore
433             it). Any other operations on the filehandle, however, is very likely to void
434             your warranty. =)
435              
436             =cut
437              
438             sub fh {
439 48     48 1 8748 my ($x) = @_;
440 48         173 sysseek($x->{fh}, $x->tell(), SEEK_SET);
441 48         174 return $x->{fh};
442             }
443              
444              
445              
446             # used only as fail-safe default
447             sub _read_line {
448 0     0   0 my ($x) = @_;
449             return
450 0 0       0 ($x->{move} == 1 ) ? $x->_next_line() :
    0          
451             ($x->{move} == -1) ? $x->_prev_line() :
452             undef;
453             }
454              
455              
456             sub _next_line {
457 144241     144241   890855 my ($x) = @_;
458             # 1. more than 1 line is in the buffer, so the top of the buffer is a
459             # complete line
460             # 2. only line -1 (last line) remains in the buffer
461             # 3. nothing else to read, i.e. return undef
462 144241         176550 while (1) {
463 145390 100 100     176217 if (@{$x->{buffer}} > 1 || $x->{cur} == $x->{file_size}) {
  145390         533561  
464 144241         154957 my $line = shift @{$x->{buffer}};
  144241         295491  
465 144241 100       479352 $x->{line} += $x->{origin} if defined $line;
466 144241         461520 return $line;
467             }
468              
469             # no complete line, so read something
470              
471             # reading forward is easy - just sysseek() to where the bottom of the
472             # buffer is, and let sysread() do the rest
473 1149 50       17590 sysseek($x->{fh}, $x->{cur}, SEEK_SET)
474             or croak $!;
475              
476             # sysread returns undef for errors;
477             # due to the pre-condition, 0 should not occur either
478 1149         2262 my $tmp;
479 1149 50       93341 my $size = sysread($x->{fh}, $tmp, $x->{block_size})
480             or croak $!;
481 1149         2378 $x->{cur} += $size;
482              
483             # prepend to the temp the leftover partial line in the buffer
484 88         946 $tmp = pop (@{$x->{buffer}}) . $tmp
  1149         4104  
485 1149 100       1717 if (@{$x->{buffer}});
486              
487             # platform-dependent translation
488 1149 50       6681 $tmp =~ s/$x->{translate}/\n/
489             if defined $x->{translate};
490              
491             # split the temp and store it in the buffer
492 1149         1242343 @{$x->{buffer}} = $tmp =~ /$x->{re}/gs;
  1149         242084  
493             }
494             }
495              
496             sub _prev_line {
497 144217     144217   820618 my ($x) = @_;
498 144217         204444 while (1) {
499             # 1. more than 1 line is in the buffer, so the bottom of the buffer is
500             # a complete line
501             # 2. only line 1 remains in the buffer
502             # 3. nothing else to read, i.e. return undef
503 145366 100 100     150140 if (@{$x->{buffer}} > 1 || $x->{cur} == 0) {
  145366         511982  
504 144217         197138 my $line = pop @{$x->{buffer}};
  144217         342741  
505 144217 100       461576 $x->{line} -= $x->{origin} if defined $line;
506 144217         491626 return $line;
507             }
508              
509             # no complete line, so read something
510              
511             # reading backward requires us to first calculate where the top of the
512             # buffer will reach. be careful to handle trailing bytes properly.
513 1149         3291 my $read_size = $x->{block_size};
514 1149         3517 $x->{cur} -= $x->{block_size};
515 1149 100       5726 if ($x->{cur} < 0) {
516 440         1012 $read_size += $x->{cur};
517 440         883 $x->{cur} = 0;
518             }
519 1149 50       40799 sysseek($x->{fh}, $x->{cur}, SEEK_SET)
520             or croak $!;
521              
522             # sysread returns undef for errors;
523             # due to the pre-condition, 0 should not occur either
524 1149         4822 my $tmp = '';
525 1149 50       58916 sysread($x->{fh}, $tmp, $read_size) == $read_size
526             or croak $!;
527              
528             # append to the temp the leftover partial line in the buffer
529 88         505 $tmp .= pop @{$x->{buffer}}
  1149         3749  
530 1149 100       1943 if (@{$x->{buffer}});
531              
532             # platform-dependent translation
533 1149 50       3268 $tmp =~ s/$x->{translate}/\n/
534             if defined $x->{translate};
535              
536             # split the temp and store it in the buffer
537 1149         1133904 @{$x->{buffer}} = $tmp =~ /$x->{re}/gs;
  1149         259224  
538             }
539             }
540              
541             # used only as fail-safe default
542             sub _eof {
543 0     0   0 my ($x) = @_;
544             return
545 0 0       0 ($x->{move} == 1 ) ? $x->next_eof() :
    0          
546             ($x->{move} == -1) ? $x->prev_eof() :
547             undef;
548             }
549              
550             sub _next_eof {
551 51     51   282 my ($x) = @_;
552 51   100     212 return $x->{cur} == $x->{file_size} && @{$x->{buffer}} == 0;
553             }
554              
555             sub _prev_eof {
556 51     51   370 my ($x) = @_;
557 51   100     213 return $x->{cur} == 0 && @{$x->{buffer}} == 0;
558             }
559              
560             # fixes up our aliases so that we eliminate the indirection functions
561             # _read_line() and _eof()
562             sub _fixup_alias {
563 2138     2138   4154 my ($x) = @_;
564              
565             # TODO: walk through the symbol table to do this automatically?
566              
567             # redefining aliases
568 2     2   15 no warnings qw/redefine/;
  2         4  
  2         1177  
569 2138 50       20674 *READLINE = ($x->{move} == 1) ? \&_next_line : ($x->{move} == -1) ? \&_prev_line : undef;
    100          
570 2138 50       44003 *getline = ($x->{move} == 1) ? \&_next_line : ($x->{move} == -1) ? \&_prev_line : undef;
    100          
571 2138 50       14157 *readline = ($x->{move} == 1) ? \&_next_line : ($x->{move} == -1) ? \&_prev_line : undef;
    100          
572 2138 50       15909 *eof = ($x->{move} == 1) ? \&_next_eof : ($x->{move} == -1) ? \&_prev_eof : undef;
    100          
573 2138 50       23008 *EOF = ($x->{move} == 1) ? \&_next_eof : ($x->{move} == -1) ? \&_prev_eof : undef;
    100          
574             }
575              
576              
577             # function
578             sub _is_dos {
579 206     206   3278 return $^O =~ /^(dos|os2|mswin32|cygwin)$/i;
580             }
581              
582             # function
583             sub _is_mac {
584 206     206   1438 return $^O =~ /^(macos)$/i;
585             }
586              
587             sub _dump {
588 0     0     my ($x) = @_;
589 0           require YAML;
590              
591             # YAML crashes for regexes
592 0           my %h = map {$_ => $x->{$_}} grep {!/^re$/} keys %$x;
  0            
  0            
593 0           return YAML::Dump(\%h);
594             }
595              
596             =pod
597              
598             =back
599              
600             =head1 TIED HANDLE INTERFACE
601              
602             local *F;
603             tie *F, "File::Bidirectional", $file, {origin => 1}
604             or die $!;
605              
606             while (my $line = ) { ... }
607              
608             (tied *F)->switch();
609              
610             The C, C, C, C and C are aliased to the
611             constructor and the lower-case method names, respectively. All other tied
612             operations, such as seeking and writing, are unsupported and will generate an
613             unknown method area.
614              
615             To use the other methods, it is necessary to get at the reference to the object
616             underlying the tied variable via L.
617              
618             =head1 LINE ENDINGS
619              
620             Currently, File::Bidirectional attempts to imitate Perl by converting the
621             platform-specific line separator into C<\n>. Currently, this only means
622             converting C<\r> on MacOS, and C<\r\n> on DOS and Windows-type systems (when the
623             C option is not set).
624              
625             So far, this module has only been tested on Unix where line endings do not need
626             to be converted, thus it will be greatly appreciated if users can feedback
627             whether the line endings conversion work on their respective platforms.
628              
629             =head1 BENCHMARKS
630              
631             As would be expected, File::Bidirectional is hardly as fast as native Perl I/O. To
632             break the news gently, it can be up to an order of magnitude slower...
633              
634             Reading through a 250MB file with various methods yield the following numbers:
635              
636             Method | Time (s)
637             --------------------------------------
638             Native Perl | 5
639             IO::File | 16
640             File::Bidirectional (OO) | 42
641             File::Bidirectional (tied) | 51
642              
643             To be optimistic about it, in the best case File::Bidirectional takes 2.6 times
644             the time taken for IO::File. For smaller files, the absolute time difference
645             may be less noticeable, so you will have to decide if the tradeoff is worth it
646             for your application. It is about as fast as I can make it without dropping
647             down into C, but if anybody has a compelling need for speed or ideas on how to
648             optimize things, please do drop me a line.
649              
650             The benchmarks were performed circa 2005, on a Pentium-4 machine with clockspeed
651             2.8GHz, a 7200rpm IDE harddisk, running Debian sarge and ext3. The programs
652             tested were the respective variants of
653              
654             while (my $line = <$fh>) { chomp $line; }
655              
656             The record separator was simply C<"\n"> and no newline translation took place.
657              
658             =head1 AUTHOR
659              
660             Kian Win Ong, cpan@bulk.squeakyblue.com
661              
662             =head1 COPYRIGHT
663              
664             Copyright (C) 2005 by Kian Win Ong. All rights reserved. This program is free
665             software; you can redistribute it and/or modify it under the same terms as Perl
666             itself. This can be either the GNU General Public License or the Artistic
667             License, as specified in the Perl README file.
668              
669             =head1 ACKNOWLEDGEMENTS
670              
671             Thanks goes out to Uri Guttman, the author of File::ReadBackwards, from which I
672             stole a bunch of code and tests. =)
673              
674             =cut
675              
676             1;