File Coverage

blib/lib/File/LoadLines.pm

Criterion	Covered	Total	%
statement	62	94	65.9
branch	35	64	54.6
condition	7	10	70.0
subroutine	7	7	100.0
pod	1	1	100.0
total	112	176	63.6

line	stmt	bran	cond	sub	pod	time	code
1							#! perl
2
3							package File::LoadLines;
4
5	8			8		700427	use warnings;
	8					72
	8					247
6	8			8		38	use strict;
	8					13
	8					160
7	8			8		32	use base 'Exporter';
	8					12
	8					942
8							our @EXPORT = qw( loadlines );
9	8			8		3481	use Encode;
	8					63127
	8					548
10	8			8		56	use Carp;
	8					13
	8					348
11	8			8		974	use utf8;
	8					26
	8					44
12
13							=head1 NAME
14
15							File::LoadLines - Load lines from file
16
17							=cut
18
19							our $VERSION = '1.02';
20
21							=head1 SYNOPSIS
22
23							use File::LoadLines;
24
25							my @lines = loadlines("mydata.txt");
26							...
27
28							=head1 DESCRIPTION
29
30							File::LoadLines provides an easy way to load the contents of a text
31							file into an array of lines. It is intended for relatively small files
32							like config files that are often produced by weird tools (and users).
33
34							It automatically handles ASCII, Latin-1 and UTF-8 text.
35							When the file has a BOM, it handles UTF-8, UTF-16 LE and BE, and
36							UTF-32 LE and BE.
37
38							Recognized line terminators are NL (Unix, Linux), CRLF (DOS, Windows)
39							and CR (Mac)
40
41							=head1 EXPORT
42
43							=head2 loadlines
44
45							=head1 FUNCTIONS
46
47							=head2 loadlines
48
49							my @lines = loadlines("mydata.txt");
50							my @lines = loadlines("mydata.txt", $options);
51
52							Basically, the file is opened, read, decoded and split into lines
53							that are returned in the result array. Line terminators are removed.
54
55							In scalar context, returns an array reference.
56
57							The first argument may be the name of a file, an opened file handle,
58							or a reference to a string that contains the data.
59
60							The second argument can be used to influence the behaviour.
61							It is a hash reference of option settings.
62
63							Note that loadlines() is a I, it reads the whole file into
64							memory and requires temporarily memory for twice the size of the
65							file.
66
67							=over
68
69							=item split
70
71							Enabled by default.
72
73							If set to zero, the data is not split into lines but returned as a
74							single string.
75
76							=item chomp
77
78							Enabled by default.
79
80							If set to zero, the line terminators are not removed from the
81							resultant lines.
82
83							=item encoding
84
85							If specified, loadlines() will use this encoding to decode the file
86							data if it cannot automatically detect the encoding.
87
88							If you pass an options hash, File::LoadLines will set C to
89							the encoding it detected and used for this file data.
90
91							=back
92
93							=cut
94
95							sub loadlines {
96	25			25	1	54066	my ( $filename, $options ) = @_;
97	25	100				100	croak("Missing filename.\n") unless defined $filename;
98	24	100	100			104	croak("Invalid options.\n") if (defined $options && (ref($options) ne "HASH"));
99
100	23		50			132	$options->{split} //= 1;
101	23		50			104	$options->{chomp} //= 1;
102
103	23					39	my $data; # slurped file data
104							my $encoded; # already encoded
105
106							# Gather data from the input.
107	23	50				91	if ( ref($filename) ) {
		50
108	0	0				0	if ( ref($filename) eq 'GLOB' ) {
109	0					0	binmode( $filename, ':raw' );
110	0					0	$data = do { local $/; <$filename> };
	0					0
	0					0
111	0					0	$filename = "__GLOB__";
112							}
113							else {
114	0					0	$data = $$filename;
115	0					0	$filename = "__STRING__";
116	0					0	$encoded++;
117							}
118							}
119							elsif ( $filename eq '-' ) {
120	0					0	$filename = "__STDIN__";
121	0					0	$data = do { local $/; };
	0					0
	0					0
122							}
123							else {
124	23					41	my $name = $filename;
125	23					86	$filename = decode_utf8($name);
126							# On MS Windows, non-latin (wide) filenames need special treatment.
127	23	50	66			719	if ( $filename ne $name && $^O =~ /mswin/i ) {
128	0					0	require Win32API::File;
129	0					0	my $fn = encode('UTF-16LE', "$filename").chr(0).chr(0);
130	0					0	my $fh = Win32API::File::CreateFileW
131							( $fn, Win32API::File::FILE_READ_DATA(), 0, [],
132							Win32API::File::OPEN_EXISTING(), 0, []);
133	0	0				0	croak("$filename: $^E (Win32)\n") if $^E;
134	0	0				0	Win32API::File::OsFHandleOpen( 'FILE', $fh, "r")
135							or croak("$filename: $!\n");
136	0					0	$data = do { local $/; readline(\*FILE) };
	0					0
	0					0
137							# warn("$filename³: len=", length($data), "\n");
138							}
139							else {
140	23	50				928	open( my $f, '<', $filename )
141							or croak("$filename: $!\n");
142	23					61	$data = do { local $/; <$f> };
	23					100
	23					788
143							}
144							}
145	23	50				111	$options->{_filesource} = $filename if $options;
146
147	23					107	my $name = encode_utf8($filename);
148	23	50				431	if ( $encoded ) {
		100
		50
		50
		100
		50
		100
149							# Nothing to do, already dealt with.
150	0					0	$options->{encoding} = 'Perl';
151							}
152
153							# Detect Byte Order Mark.
154							elsif ( $data =~ /^\xEF\xBB\xBF/ ) {
155	6	50				21	warn("$name is UTF-8 (BOM)\n") if $options->{debug};
156	6					13	$options->{encoding} = 'UTF-8';
157	6					30	$data = decode( "UTF-8", substr($data, 3) );
158							}
159							elsif ( $data =~ /^\xFE\xFF/ ) {
160	0	0				0	warn("$name is UTF-16BE (BOM)\n") if $options->{debug};
161	0					0	$options->{encoding} = 'UTF-16BE';
162	0					0	$data = decode( "UTF-16BE", substr($data, 2) );
163							}
164							elsif ( $data =~ /^\xFF\xFE\x00\x00/ ) {
165	0	0				0	warn("$name is UTF-32LE (BOM)\n") if $options->{debug};
166	0					0	$options->{encoding} = 'UTF-32LE';
167	0					0	$data = decode( "UTF-32LE", substr($data, 4) );
168							}
169							elsif ( $data =~ /^\xFF\xFE/ ) {
170	6	50				24	warn("$name is UTF-16LE (BOM)\n") if $options->{debug};
171	6					15	$options->{encoding} = 'UTF-16LE';
172	6					26	$data = decode( "UTF-16LE", substr($data, 2) );
173							}
174							elsif ( $data =~ /^\x00\x00\xFE\xFF/ ) {
175	0	0				0	warn("$name is UTF-32BE (BOM)\n") if $options->{debug};
176	0					0	$options->{encoding} = 'UTF-32BE';
177	0					0	$data = decode( "UTF-32BE", substr($data, 4) );
178							}
179
180							# No BOM, did user specify an encoding?
181							elsif ( $options->{encoding} ) {
182							warn("$name is ", $options->{encoding}, " (fallback)\n")
183	1	50				4	if $options->{debug};
184	1					5	$data = decode( $options->{encoding}, $data, 1 );
185							}
186
187							# Try UTF8, fallback to ISO-8895.1.
188							else {
189	10					20	my $d = eval { decode( "UTF-8", $data, 1 ) };
	10					34
190	10	100				1175	if ( $@ ) {
		100
191	1	50				4	warn("$name is ISO-8859.1 (assumed)\n") if $options->{debug};
192	1					3	$options->{encoding} = 'ISO-8859-1';
193	1					3	$data = decode( "iso-8859-1", $data );
194							}
195							elsif ( $d !~ /[^[:ascii:]]/ ) {
196	2	50				7	warn("$name is ASCII (detected)\n") if $options->{debug};
197	2					6	$options->{encoding} = 'ASCII';
198	2					3	$data = $d;
199							}
200							else {
201	7	50				26	warn("$name is UTF-8 (detected)\n") if $options->{debug};
202	7					13	$options->{encoding} = 'UTF-8';
203	7					15	$data = $d;
204							}
205							}
206
207	23	50				14101	return $data unless $options->{split};
208
209							# Split in lines;
210	23					41	my @lines;
211	23	50				54	if ( $options->{chomp} ) {
212							# Unless empty, make sure there is a final newline.
213	23	100				156	$data .= "\n" if $data =~ /.(?!\r\n\|\n\|\r)\z/;
214							# We need to maintain trailing newlines.
215	23					346	push( @lines, $1 ) while $data =~ /(.*?)(?:\r\n\|\n\|\r)/g;
216							}
217							else {
218							# We need to maintain trailing newlines.
219	0					0	push( @lines, $1 ) while $data =~ /(.*?(?:\r\n\|\n\|\r))/g;
220							}
221	23					53	undef $data;
222	23	100				116	return wantarray ? @lines : \@lines;
223							}
224
225							=head1 SEE ALSO
226
227							There are currently no other modules that handle BOM detection and
228							line splitting.
229
230							I have a faint hope that future versions of Perl and Raku will deal
231							with this transparently, but I fear the worst.
232
233							=head1 HINTS
234
235							When you have raw file data (e.g. from a zip), you can use loadlines()
236							to decode and unpack:
237
238							open( my $data, '<', \$contents );
239							$lines = loadlines( $data, $options );
240
241							=head1 AUTHOR
242
243							Johan Vromans, C<< >>
244
245							=head1 SUPPORT AND DOCUMENTATION
246
247							Development of this module takes place on GitHub:
248							https://github.com/sciurius/perl-File-LoadLines.
249
250							You can find documentation for this module with the perldoc command.
251
252							perldoc File::LoadLines
253
254							Please report any bugs or feature requests using the issue tracker on
255							GitHub.
256
257							=head1 COPYRIGHT & LICENSE
258
259							Copyright 2018,2020 Johan Vromans, all rights reserved.
260
261							This program is free software; you can redistribute it and/or modify it
262							under the same terms as Perl itself.
263
264							=cut
265
266							1; # End of File::LoadLines