File Coverage

lib/File/CountLines.pm
Criterion Covered Total %
statement 53 53 100.0
branch 17 22 77.2
condition 3 6 50.0
subroutine 8 8 100.0
pod 0 1 0.0
total 81 90 90.0


line stmt bran cond sub pod time code
1             package File::CountLines;
2 1     1   70550 use strict;
  1         2  
  1         29  
3 1     1   6 use warnings;
  1         2  
  1         75  
4              
5             our $VERSION = '0.0.3';
6             our @EXPORT_OK = qw(count_lines);
7              
8 1     1   5 use Exporter 5.057;
  1         36  
  1         55  
9             Exporter->import('import');
10              
11 1     1   5 use Carp qw(croak);
  1         2  
  1         82  
12 1     1   1663 use charnames qw(:full);
  1         39552  
  1         7  
13              
14             our %StyleMap = (
15             'cr' => "\N{CARRIAGE RETURN}",
16             'lf' => "\N{LINE FEED}",
17             'crlf' => "\N{CARRIAGE RETURN}\N{LINE FEED}",
18             'native' => "\n",
19             );
20              
21             our $BlockSize = 4096;
22              
23             sub count_lines {
24 21     21 0 9691 my $filename = shift;
25 21 100       308 croak 'expected filename in call to count_lines()'
26             unless defined $filename;
27 19         54 my %options = @_;
28 19         29 my $sep = $options{separator};
29 19 100       43 unless (defined $sep) {
30 9 100       44 my $style = exists $options{style} ? $options{style} : 'native';
31 9         21 $sep = $StyleMap{$style};
32 9 50       25 die "Don't know how to map style '$style'" unless defined $sep;
33             }
34 19 100       53 if (length($sep) > 1) {
35 6   66     57 return _cl_sysread_multiple_chars(
36             $filename,
37             $sep,
38             $options{blocksize} || $BlockSize,
39             );
40             } else {
41 13   33     68 return _cl_sysread_one_char(
42             $filename,
43             $sep,
44             $options{blocksize} || $BlockSize,
45             );
46             }
47             }
48              
49             sub _cl_sysread_one_char {
50 13     13   25 my ($filename, $sep, $blocksize) = @_;
51 13         21 local $Carp::CarpLevel = 1;
52 13 100       1062 open my $handle, '<:raw', $filename
53             or croak "Can't open file `$filename' for reading: $!";
54 12         26 binmode $handle;
55 12         18 my $lines = 0;
56 12         43 $sep =~ s/([\\{}])/\\$1/g;
57             # need eval here because tr/// doesn't interpolate
58 12         13 my $sysread_status;
59 12         2061 eval qq[
60             while (\$sysread_status = sysread \$handle, my \$buffer, $blocksize) {
61             \$lines += (\$buffer =~ tr{$sep}{});
62             }
63             ];
64 12 50       98 die "Can't sysread() from file `$filename': $!"
65             unless defined ($sysread_status);
66 12 50       131 close $handle or croak "Can't close file `$filename': $!";
67 12         105 return $lines;
68             }
69              
70             sub _cl_sysread_multiple_chars {
71 6     6   38 my ($filename, $sep, $blocksize) = @_;
72 6         11 local $Carp::CarpLevel = 1;
73 6 100       582 open my $handle, '<:raw', $filename
74             or croak "Can't open file `$filename' for reading: $!";
75 5         11 binmode $handle;
76 5         8 my $len = length($sep);
77 5         8 my $lines = 0;
78 5         8 my $buffer = '';
79 5         6 my $sysread_status;
80 5         45 while ($sysread_status = sysread $handle, $buffer, $blocksize, length($buffer)) {
81 10         17 my $offset = -$len;
82 10         30 while (-1 != ($offset = index $buffer, $sep, $offset + $len)) {
83 13         30 $lines++;
84             }
85             # we assume $len >= 2; otherwise use _cl_sysread_one_char()
86 10         66 $buffer = substr $buffer, 1 - $len;
87             }
88 5 50       12 die "Can't sysread() from file `$filename': $!"
89             unless defined ($sysread_status);
90 5 50       54 close $handle or croak "Can't close file `$filename': $!";
91 5         46 return $lines;
92             }
93              
94             1;
95              
96             __END__
97              
98             =head1 NAME
99              
100             File::CountLines - efficiently count the number of line breaks in a file.
101              
102             =head1 SYNOPSIS
103              
104             use File::CountLines qw(count_lines);
105             my $no_of_lines = count_lines('/etc/passwd');
106              
107             # other uses
108             my $carriage_returns = count_lines(
109             'path/to/file.txt',
110             style => 'cr',
111             );
112             # possible styles are 'native' (the default), 'cr', 'lf'
113              
114             =head1 DESCRIPTION
115              
116             L<perlfaq5> answers the question on how to count the number of lines
117             in a file. This module is a convenient wrapper around that method, with
118             additional options.
119              
120             More specifically, it counts the number of I<line breaks> rather than lines.
121             On Unix systems nearlly all text files end with a newline (by convention), so
122             usually the number of lines and number of line breaks is equal.
123              
124             Since different operating systems have different ideas of what a newline is,
125             you can specifiy a C<style> option, which can be one of the following values:
126              
127             =over
128              
129             =item C<native>
130              
131             This takes Perl's C<\n> as the line separator, which should be the right thing in most cases. See L<perlport> for details. This is the default.
132              
133             =item C<cr>
134              
135             Take a carriage return as line separator (MacOS style)
136              
137             =item C<lf>
138              
139             Take a line feed as line separator (Unix style)
140              
141             =item C<crlf>
142              
143             Take a carriage return followed by a line feed as separator (Microsoft
144             Windows style)
145              
146             =back
147              
148             Alternatively you can specify an arbitrary separator like this:
149              
150             my $lists = count_lines($file, separator => '\end{itemize}');
151              
152             It is taken verbatim and searched for in the file.
153              
154             The file is read in equally sized blocks. The size of the blocks
155             can be supplied with the C<blocksize> option. The default is 4096,
156             and can be changed by setting C<$File::CountLines::BlockSize>.
157              
158             Do not use a block size smaller than the length of the separator, that
159             might produce wrong results. (In general there's no reason to chose a
160             smaller block size at all. Depending on your size a larger block size
161             might speed up things a bit.)
162              
163             =head1 Character Encodings
164              
165             If you supply a separator yourself, it should not be a decoded string.
166              
167             The file is read in binary mode, which implies that this module
168             works fine for text files in ASCII-compatible encodings, including
169             ASCII itself, UTF-8 and all the ISO-8859-* encodings (aka Latin-1,
170             Latin-2, ...).
171              
172             Note that the multi byte encodings like UTF-32, UTF-16le, UTF-16be
173             and UCS-2 encode a line feed character in a way that the C<0x0A> byte
174             is a substring of the encoded character, but if you search blindly for
175             that byte you will get false positives. For example the I<LATIN CAPITAL
176             LETTER C WITH DOT ABOVE>, U+010A has the byte sequence C<0x0A 0x01> when
177             encoded as UTF-16le, so it would be counted as a newline. Even search for
178             C<0x0A 0x00> might give false positives.
179              
180             So the summary is that for now you can't use this module in a meaningful
181             way to count lines of text files in encodings that are not ASCII-compatible.
182             If there's demand for, I can implement that though.
183              
184             =head1 Extending
185              
186             You can add your own EOL styles by adding them to the
187             C<%File::CountLines::StyleMap> hash, with the name of the style as hash key
188             and the separator as the value.
189              
190             =head1 AUTHOR
191              
192             Moritz Lenz L<http://perlgeek.de>, L<mailto:moritz@faui2k3.org>
193              
194             =head1 COPYRIGHT AND LICENSE
195              
196             Copyright (C) 2008 by Moritz A. Lenz. This module is free software.
197             You may use, redistribute and modify it under the same terms as perl itself.
198              
199             Example code included in this package may be used as if it were in the Public
200             Domain.
201              
202             =head1 DEVELOPMENT
203              
204             You can obtain the latest development version from L<http://github.com/moritz/File-CountLines>:
205              
206             git clone git://github.com/moritz/File-CountLines.git
207              
208             =cut
209