File Coverage

blib/lib/File/BufferedInput.pm
Criterion Covered Total %
statement 9 72 12.5
branch 0 18 0.0
condition 0 5 0.0
subroutine 3 13 23.0
pod 0 9 0.0
total 12 117 10.2


line stmt bran cond sub pod time code
1             #=Copyright Infomation
2             #==========================================================
3             #Module Name : File::BufferedInput
4             #Program Author : Dr. Ahmed Amin Elsheshtawy, Ph.D. Physics, E.E.
5             #Home Page : http://www.mewsoft.com
6             #Contact Email : support@mewsoft.com
7             #Copyrights © 2014 Mewsoft. All rights reserved.
8             #==========================================================
9             package File::BufferedInput;
10              
11 1     1   144055 use Carp;
  1         3  
  1         135  
12 1     1   6 use strict;
  1         2  
  1         44  
13 1     1   6 use warnings;
  1         7  
  1         2378  
14              
15             our $VERSION = '1.03';
16             #==========================================================
17             sub new {
18 0     0 0   my ($class, %args) = @_;
19            
20 0           my $self = bless {}, $class;
21            
22 0           $self->{buffer} = $args{buffer};
23 0   0       $self->{buffer} ||= 204800; # 20MB
24 0           $self->{utf8} = $args{utf8};
25 0           $self->{file} = $args{file};
26              
27 0 0         if ($self->{file}) {
28 0           $self->file($self->{file});
29             }
30              
31 0           return $self;
32             }
33             #=========================================================#
34             sub file {
35 0     0 0   my ($self, $file) = @_;
36              
37 0           $self->close();
38              
39 0 0         if ($self->opened($file)) {
40 0           $self->{fh} = $file;
41             }
42             else {
43 0 0         open ($self->{fh}, ($self->{utf8})? "<:encoding(UTF-8)" : "<", $file) or croak "Error opening file $file: $!";
    0          
44             }
45              
46 0           $self->reset();
47             }
48             #=========================================================#
49             sub reset {
50 0     0 0   my ($self) = $_[0];
51 0           $self->{current_line} = 0;
52 0           $self->{line_count} = 0;
53 0           $self->{total_count} = 0;
54 0           $self->{eof} = 0;
55 0           $self->{block_remaining} = "";
56             }
57             #=========================================================#
58             sub rewind {
59 0     0 0   my ($self) = $_[0];
60 0           seek $self->{fh}, 0, 0;
61 0           $self->reset();
62             }
63             #=========================================================#
64             sub eof {
65 0     0 0   my ($self) = $_[0];
66 0           $self->{eof};
67             }
68             #=========================================================#
69             sub DESTROY {
70 0     0     my ($self) = $_[0];
71 0           $self->close();
72             }
73             #=========================================================#
74             sub close {
75 0     0 0   my ($self) = $_[0];
76 0 0         $self->{fh} || return;
77 0           close $self->{fh};
78             }
79             #=========================================================#
80             sub opened {
81 0     0 0   my ($self) = $_[0];
82 0           defined fileno($_[1]);
83             }
84             #=========================================================#
85             sub line {
86 0     0 0   my ($self) = $_[0];
87            
88 0 0         return undef if $self->{eof};
89              
90 0 0         if ($self->{current_line} == $self->{line_count}) {
91 0           $self->read_block();
92             }
93              
94 0           return ($self->{lines}->[$self->{current_line}++]);
95             }
96             #=========================================================#
97             sub read_block {
98 0     0 0   my ($self) = $_[0];
99 0           my ($buffer, $chunk, $match);
100              
101 0 0         if (read($self->{fh}, $buffer, $self->{buffer})) {
102            
103             # win, dos eol: \r\n =\015\012 , unix eol: \n = \012, Mac eol: \r =\015; \cM=\n, \cJ=\r, /\x0D\x0A/, /\x0A/
104             #split /(?:\x{0d}?\x{0a}|\x{0d})/
105            
106 0           $self->{end_pos} = rindex($buffer, "\n");
107            
108             # if current block does not have newline, join another blocks until one found
109 0   0       while ($self->{end_pos} == -1 && read($self->{fh}, $chunk, $self->{buffer})) {
110 0           $buffer .= $chunk;
111 0           $self->{end_pos} = rindex($buffer, "\n");
112             }
113              
114             #if (length(decode_utf8($str))
115              
116 0 0         if (length($buffer) >= $self->{buffer}) {
117 0           $match = substr($buffer, 0, $self->{end_pos});
118 0           $match = $self->{block_remaining} . $match ;
119 0           $self->{block_remaining} = substr($buffer, $self->{end_pos}+1, length($buffer) - $self->{end_pos});
120             } else {
121             # last block in file
122 0           $match = $buffer;
123 0           $match = $self->{block_remaining} . $buffer;
124 0           $self->{block_remaining} = "";
125             }
126            
127 0           @{$self->{lines}} = split(/\n/, $match, -1);
  0            
128            
129 0           $self->{current_line} = 0;
130 0           $self->{line_count} = @{$self->{lines}};
  0            
131 0           $self->{total_count} += $self->{line_count};
132             }
133             else {
134             # finished
135 0           $self->{eof} = 1;
136 0           $self->{current_line} = 0;
137 0           $self->{line_count} = 0;
138 0           @{$self->{lines}} = ();
  0            
139             }
140             }
141             #==========================================================
142             1;
143              
144             =head1 NAME
145              
146             File::BufferedInput - Large and giant text file performance buffered reader.
147              
148             =head1 SYNOPSIS
149              
150             use File::BufferedInput;
151              
152             # create new object with default options, buffer size 20MB and ANSI text file
153             # file argument can be file name or file handle
154             my $fileobj = File::BufferedInput->new(file=>$filename);
155            
156             # or create new object with custom options, set buffer size 50MB and UTF-8 text file
157             my $fileobj = File::BufferedInput->new(file=>$filename, buffer=>50_000_000, utf8=>1);
158              
159             # or create new object then set the file name/handle
160             my $fileobj = File::BufferedInput->new();
161             $fileobj->file($filename_or_handle);
162              
163             # now loop through all the text file lines straight forward
164            
165             my $counter = 0;
166             # loop through the file lines sequentially
167             while (!$fileobj->eof()) {
168             $line=$fileobj->line();
169             $counter++;
170             # print "$counter)- $line\n";
171             }
172              
173             # start again from the begaining of the file
174             #$fileobj->rewind()
175              
176             $fileobj->close(); # close the file and frees the memory used for the block
177             print "$counter lines found\n";
178            
179            
180             =head1 DESCRIPTION
181              
182             This module solves the problem with reading large and huge text files in Perl. It is designed to read only block by block as needed.
183             It does not load the whole file into memory, it only reads one block at a time and once the last sequential line reached, it reads the
184             next block from the file and frees the previous block from memory, so at all times only one block of the file is kept in menory.
185              
186             For example if you are reading a 2GB file, once you start reading lines from the file, the module reads the first block from the
187             file on disk, while you loop through the lines, when you reach the line at the end of the read block, the module delete this block
188             from memory and read the next block from the file on disk and parses it to lines and so on.
189              
190             =head1 SEE ALSO
191              
192             =head1 AUTHOR
193              
194             Ahmed Amin Elsheshtawy,
195             Website: http://www.mewsoft.com
196              
197             =head1 COPYRIGHT AND LICENSE
198              
199             Copyright (C) 2014 by Ahmed Amin Elsheshtawy support@mewsoft.com
200             L
201              
202             This library is free software; you can redistribute it and/or modify
203             it under the same terms as Perl itself.
204              
205             =cut