File Coverage

blib/lib/File/Sip.pm
Criterion Covered Total %
statement 47 48 97.9
branch 9 10 90.0
condition 5 8 62.5
subroutine 8 8 100.0
pod 1 1 100.0
total 70 75 93.3


line stmt bran cond sub pod time code
1             package File::Sip;
2              
3             #ABSTRACT: file parser intended for big files that don't fit into main memory.
4              
5              
6 1     1   24437 use Moo;
  1         16714  
  1         6  
7 1     1   1647 use Carp 'croak';
  1         1  
  1         64  
8 1     1   780 use IO::File;
  1         9923  
  1         117  
9 1     1   6377 use Encode qw(decode);
  1         16599  
  1         133  
10 1     1   15 use feature ':5.10';
  1         3  
  1         1649  
11              
12              
13             has path => (
14             is => 'ro',
15             required => 1,
16             );
17              
18              
19             has line_separator => (
20             is => 'ro',
21             default => sub {qw/(\015\012|\015|\012)/},
22             );
23              
24              
25             has is_utf8 => (
26             is => 'ro',
27             default => sub {1},
28             );
29              
30              
31             # internal cursor for iterations
32             has _read_line_position => (
33             is => 'rw',
34             default => sub {0},
35             );
36              
37             sub read_line {
38 15     15 1 10109 my ( $self, $line_number ) = @_;
39              
40 15   100     75 $line_number //= $self->_read_line_position;
41 15         425 my $fh = $self->_fh;
42 15         547 my $line_index = $self->index->[$line_number];
43 15 100       152 return if !defined $line_index;
44              
45 14 100       291 my $previous_line_index =
46             ( $line_number == 0 ) ? 0 : $self->index->[ $line_number - 1 ];
47              
48 14         75 my $line;
49 14         127 seek( $fh, $previous_line_index, 0 );
50 14         109 read( $fh, $line, $line_index - $previous_line_index );
51              
52 14 100       44 $self->_read_line_position( $line_number + 1 ) if @_ == 1;
53              
54 14 50 33     122 return decode( "utf8", $line ) if defined $line && $self->is_utf8;
55 0         0 return $line;
56             }
57              
58             # file handle returned by IO::File
59             has _fh => (
60             is => 'ro',
61             lazy => 1,
62             default => sub {
63             my ($self) = @_;
64             my $open_file_param = "<:crlf";
65             IO::File->new( $self->path, $open_file_param )
66             or croak "Failed to open file '" . $self->path . "' : '$!'";
67             }
68             );
69              
70             # File stat array
71             has _stat => (
72             is => 'lazy',
73             );
74              
75             sub _build__stat {
76 1     1   403 my ($self) = @_;
77 1         25 my @stat = stat( $self->_fh );
78 1         30 return \@stat;
79             }
80              
81              
82             has index => (
83             is => 'rw',
84             lazy => 1,
85             builder => 1,
86             );
87              
88             sub _build_index {
89 1     1   432 my ($self) = @_;
90 1         2 my $index = [];
91              
92 1         3 my ($blocksize) = @{ $self->_stat }[11];
  1         4  
93 1   50     4 $blocksize ||= 8192;
94              
95 1         2 my $buffer = '';
96 1         3 my $offset = 0;
97 1         1 my $line_number = 0;
98              
99             # make sure we jump to the beginning of the file
100 1         25 seek( $self->_fh, 0, SEEK_SET );
101              
102             # build the index, char by char, splitting on the line separator
103 1         18 my $line_sep = $self->line_separator;
104 1         25 while ( my $count = read( $self->_fh, $buffer, $blocksize ) ) {
105 1         51 for my $i ( 0 .. $count ) {
106 80         96 my $char = substr $buffer, $i, 1;
107 80 100       284 if ( $char =~ /$line_sep/ ) {
108 7         23 $index->[ $line_number++ ] = $offset + $i + 1;
109             }
110             }
111 1         29 $offset += $count;
112             }
113              
114             # reset the cursor at the beginning of the file and return the index
115 1         35 seek( $self->_fh, 0, SEEK_SET );
116 1         15 return $index;
117             }
118              
119             1;
120              
121              
122             =pod
123              
124             =head1 NAME
125              
126             File::Sip - file parser intended for big files that don't fit into main memory.
127              
128             =head1 VERSION
129              
130             version 0.003
131              
132             =head1 DESCRIPTION
133              
134             In most cases, you don't want to use this, but L<File::Slurp::Tiny> instead.
135              
136             This class is able to read a line from a file without loading the whole file in
137             memory. When you want to deal with files of millions of lines, on a limited
138             environment, brute force isn't an option.
139              
140             An index of all the lines in the file is built in order to be able to access
141             their starting position depending on their line number.
142              
143             The memory used is then limited to the size of the index plus the size of the
144             line that is read (until the line separator character is reached).
145              
146             It also provides a way to nicely iterate over all the lines of the file, using
147             only the amount of memory needed to store one line at a time, not the whole file.
148              
149             =head1 ATTRIBUTES
150              
151             =head2 path
152              
153             Required, file path as a string.
154              
155             =head2 line_separator
156              
157             Optional, regular expression of the newline separator, default is
158             C<(\015\012|\015|\012)>.
159              
160             =head2 is_utf8
161              
162             Optional, flag to tell if the file is utf8-encoded, default is true.
163              
164             If true, the line returned by C<read_line> will be decoded.
165              
166             =head2 index
167              
168             Index that contains positions of all lines of the file, usage:
169              
170             $sip->index->[ $line_number ] = $seek_position;
171              
172             =head1 METHODS
173              
174             =head2 read_line
175              
176             Return the line content at the given position (terminated by C<line_separator>).
177              
178             my $line = $sip->read_line( $line_number );
179              
180             It's also possible to read the entire file, line by line without providing a
181             line number to the method, until C<undef> is returned:
182              
183             while (my $line = $sip->read_line()) {
184             # do something with $line
185             }
186              
187             =head1 ACKNOWLEDGMENT
188              
189             This module was written at Weborama when dealing with huge raw files, where huge
190             means "oh no, it really won't fit anymore in this compute slot!" (which are
191             limited in main-memory).
192              
193             =head1 BENCHMARK
194              
195             C<File::Sip> is not faster than in-memory parsers like L<File::Slurp::Tiny> but
196             it has a lower memory footprint. With small files, it's not obvious (when the file
197             is small, the cost of the index is almost equal to the cost of all the
198             characters of the file).
199             But when the file gets bigger, the gain in main memory grows.
200              
201             With files bigger than a few megabytes, C<File::Sip> will consume up to 20 times less
202             memory than L<File::Slurp::Tiny>. This factor of 20 appears to be an asymptotic limit
203             as size of studied files grows.
204              
205             If you want to estimate the memory size of a running process that uses C<File::Sip>, you
206             can then assume that the size of the index will be around 1/20th of the size of
207             the processed file.
208              
209             =head1 AUTHORS
210              
211             This module has been written at Weborama by Alexis Sukrieh and Bin Shu.
212              
213             =head1 AUTHOR
214              
215             Alexis Sukrieh
216              
217             =head1 COPYRIGHT AND LICENSE
218              
219             This software is copyright (c) 2014 by Weborama.
220              
221             This is free software; you can redistribute it and/or modify it under
222             the same terms as the Perl 5 programming language system itself.
223              
224             =cut
225              
226              
227             __END__