File Coverage

blib/lib/Spreadsheet/Compare/Reader/CSV.pm

Criterion	Covered	Total	%
statement	154	162	95.0
branch	44	64	68.7
condition	21	33	63.6
subroutine	15	20	75.0
pod	2	3	66.6
total	236	282	83.6

line	stmt	bran	cond	sub	pod	time	code
1							package Spreadsheet::Compare::Reader::CSV;
2
3	2			2		1633	use Mojo::Base 'Spreadsheet::Compare::Reader', -signatures;
	2					5
	2					20
4	2			2		351	use Spreadsheet::Compare::Common;
	2					13
	2					13
5	2			2		1713	use Text::CSV;
	2					29913
	2					235
6
7							#<<<
8							use Spreadsheet::Compare::Config {
9	18					129	csv_options => sub { { allow_whitespace => 1 } },
10	0					0	files => sub {[]},
11	2					46	fix_empty_header => 1,
12							make_header_unique => 0,
13							rootdir => '.',
14							sep_auto => undef,
15							skip_before_head => 0,
16							skip_after_head => 0,
17	2			2		71	}, make_attributes => 1;
	2					5
18
19							has filename => undef, ro => 1;
20							has filehandle => undef, ro => 1;
21							has _chunk_data => sub { {} }, ro => 1;
22							has csv => sub {
23							my $csv = Text::CSV->new( $_[0]->csv_options );
24							LOGDIE join( ',', Text::CSV->error_diag ) unless $csv;
25							return $csv;
26							}, ro => 1;
27							#>>>
28
29							my( $trace, $debug );
30
31	56			56	0	95	sub init ( $self, @args ) {
	56					84
	56					88
	56					79
32	56					113	$self->{__ro__can_chunk} = 1;
33	56					175	return $self->SUPER::init(@args);
34							}
35
36	28			28	1	52	sub setup ($self) {
	28					56
	28					54
37
38	28					112	( $trace, $debug ) = get_log_settings();
39
40	28		50			420	my $proot = path( $self->rootdir // '.' );
41	28					1490	my $fn = path($self->files->[ $self->index ]);
42	28	50				888	my $pfull = $self->{__ro__filename} = $fn->is_absolute ? $fn : $proot->child($fn);
43
44	28					2456	INFO "opening input file >>$pfull<<";
45	28					424	my $fh = $self->{__ro__filehandle} = $pfull->openr_raw;
46
47	28					4531	<$fh> for 1 .. $self->skip_before_head;
48
49	28					338	$self->_set_header;
50
51	28					137	<$fh> for 1 .. $self->skip_after_head;
52
53	28	100				305	$self->_chunk_records() if $self->chunker;
54
55	28					73	$self->{_sln} = 0;
56
57	28					209	return $self;
58							}
59
60
61	6			6		60	sub _chunk_records ($self) {
	6					15
	6					13
62	6	50				24	$debug and DEBUG "chunking side $self->{index}";
63	6					26	my $skipper = $self->skipper;
64	6					39	while ( my $rec = $self->_read_record ) {
65	3026	50	33			11239	next if $skipper and $skipper->($rec);
66	3026					13630	my $cname = $self->chunker->($rec);
67	3026		100			8890	my $cdata = $self->_chunk_data->{$cname} //= [];
68	3026					46507	push @$cdata, delete( $rec->{__INFO__} );
69							}
70
71	6	50		0		41	$debug and DEBUG "found chunks:", sub { Dump( [ sort keys $self->_chunk_data->%* ] ) };
	0					0
72
73	6					34	my $fh = $self->filehandle;
74	6					79	seek( $fh, 0, 0 );
75	6					24	return $self;
76							}
77
78
79	138			138	1	302	sub fetch ( $self, $size ) {
	138					242
	138					263
	138					212
80
81	138					690	my $result = $self->result;
82	138					722	my $count = 0;
83
84	138	100				457	if ( $self->chunker ) {
85	98					795	my $cdata = $self->_chunk_data;
86	98					1642	my $cname = ( sort keys %$cdata )[0];
87	98					354	my $chunk = delete $cdata->{$cname};
88	98	100				372	$self->{__ro__exhausted} = 1 unless keys %$cdata;
89	98	50				386	$debug and DEBUG "Fetching data for chunk $cname";
90	98					275	for my $rec_info (@$chunk) {
91	3026	50				6883	if ( my $rec = $self->_read_record($rec_info) ) {
92	3026					6671	push @$result, $rec;
93	3026					7545	$count++;
94							}
95							}
96	98	50				2817	$debug and DEBUG "fetched $count records from chunk $cname";
97							}
98							else {
99	40	50				279	$debug and DEBUG "fetching max $size records";
100
101	40					75	my $i = 0;
102	40					152	my $fh = $self->filehandle;
103	40					251	my $skipper = $self->skipper;
104	40					284	while ( ++$i <= $size ) {
105	6504					14542	my $rec = $self->_read_record();
106	6504	100				13017	unless ($rec) {
107	20	50				95	$debug and DEBUG "EOF for $self->{__ro__filename}";
108	20					59	$self->{__ro__exhausted} = 1;
109	20					52	last;
110							}
111	6484	50	33			20193	next if $skipper and $skipper->($rec);
112	6484					26113	push @$result, $rec;
113	6484					15850	$count++;
114							}
115
116	40	100				194	if ( $size == ~0 ) {
117	18					525	@$result = sort { $a->id cmp $b->id } @$result;
	20322					109682
118							}
119							}
120
121	138	50				1514	$debug and DEBUG "fetched $count records";
122
123	138					550	return $count;
124							}
125
126
127	28			28		67	sub _set_header ($self) {
	28					67
	28					57
128	28					116	my $fh = $self->filehandle;
129	28					166	my $start_pos = tell($fh);
130
131	28					101	my $tcx = $self->csv_options;
132	28					132	my $csv = $self->csv;
133	28		66			147	my $sep = $tcx->{sep} // $tcx->{sep_char};
134	28					110	my $hd = $self->has_header;
135
136	28	100				192	my $sep_set = $sep ? [$sep] : $self->sep_auto;
137	28					95	my @rec;
138
139	28	50	100			151	if ( $sep and defined $hd and not $hd ) {
			66
140	0					0	@rec = $csv->getline($fh)->@*;
141							}
142							else { # no separator defined and/or autodetect
143							try {
144							@rec = $csv->header(
145							$fh, {
146							$sep_set ? ( sep_set => $sep_set ) : (),
147	1002					1316	munge_column_names => sub ($hcol) {
148	1002					1331	state $count = 0;
149	1002					1371	state $seen = {};
150	1002	100	66			2558	$hcol = 'unnamed_' . ++$count
151							if $hcol !~ /\S/ and $self->fix_empty_header;
152	1002	50				2395	if ( $self->make_header_unique ) {
153	0	0				0	$hcol .= "_$seen->{$hcol}" if $seen->{$hcol}++;
154							}
155	1002					4853	return $hcol;
156							},
157							}
158	28	100		28		1802	);
159							# very simple header detection: if it contains a naked numerical value
160							# => assume it is not a header
161	22		66			3467	$hd //= none { /^\d+[\.\,]?\d+$/ } @rec;
	82					149
162	22	100				113	INFO "Detected Separator: >>", $csv->sep, '<<' unless $sep;
163							}
164							catch {
165							# exeption will be thrown if non unique fields are found (1013)
166							# => this is only fatal if we should have a header
167							# or we found more than one seperator (1011)
168							# => this is always fatal
169	6	50	33	6		5042	LOGDIE "Error reading first line from csv, $_" if $hd or 0 + $csv->error_diag == 1011;
170
171							# else read again with getline(), separator is detected
172	6					204	INFO "Detected Separator: >>", $csv->sep, '<<';
173	6					214	seek( $fh, $start_pos, 0 );
174	6					245	@rec = $csv->getline($fh)->@*;
175	6					588	$hd = 0; # defined but 0 ==> we don't have a header
176	28					298	};
177							}
178
179
180	28	50		0		735	$debug and DEBUG "setting header from record:", sub { Dump( \@rec ) };
	0					0
181	28	100				89	if ($hd) {
182	22					91	$self->has_header(1);
183	22					188	INFO "Setting header info from header line";
184	22					206	$self->{__ro__header} = \@rec;
185							}
186							else {
187	6					35	$self->has_header(0);
188	6					66	INFO "Setting header info from column numbers";
189	6					71	my @cols = 0 .. $#rec;
190	6					39	$self->{__ro__header} = \@cols;
191	6					41	$csv->column_names(@cols);
192	6					1587	seek( $fh, $start_pos, 0 );
193							}
194
195	28					132	return $self;
196							}
197
198
199	12562			12562		19230	sub _read_record ( $self, $rec_info = undef ) {
	12562					18947
	12562					20119
	12562					16439
200
201	12562					32409	my $fh = $self->filehandle;
202
203	12562					66788	my $rec;
204							my $pos;
205	12562					0	my $sln;
206	12562					0	my $rcount;
207	12562	100				27480	if ( defined $rec_info ) {
208	3026					39980	seek( $fh, $rec_info->{pos}, 0 );
209							try {
210	3026			3026		136398	$rec = $self->csv->getline($fh);
211							}
212							catch {
213	0			0		0	LOGDIE "Error reading csv data, $_";
214	3026					29554	};
215	3026					364866	$sln = $rec_info->{sln};
216	3026					9485	$rcount = tell($fh) - $rec_info->{pos};
217							}
218							else {
219	9536					17858	$pos = tell($fh);
220							try {
221	9536			9536		400744	$rec = $self->csv->getline($fh);
222							}
223							catch {
224	0			0		0	LOGDIE "Error reading csv data, $_";
225	9536					50287	};
226	9536					964785	$sln = ++$self->{_sln};
227	9536					24414	$rcount = tell($fh) - $pos;
228							}
229
230	12562	100				27682	return unless $rec;
231
232	12536		50			28759	$self->{read_bytes} += $rcount // 0;
233
234	12536	50		0		28806	$trace and TRACE "record array: ", sub { Dump($rec) };
	0					0
235
236	12536					46651	my $robj = Spreadsheet::Compare::Record->new(
237							rec => $rec,
238							reader => $self,
239							sln => $sln,
240							);
241
242	12536	50				25009	$trace and TRACE "record id: ", $robj->id;
243
244							#<<<
245							$robj->{__INFO__} = {
246	12536	100	100			34511	pos => $pos,
247							sln => $sln,
248							} if $self->chunker and not $rec_info;
249							#>>>
250
251	12536					83498	return $robj;
252							}
253
254
255	28			28		12418	sub DESTROY ( $self, @ ) {
	28					157
	28					46
256	28	50				201	close( $self->{fh} ) if $self->{fh};
257	28					2727	return;
258							}
259
260
261							1;
262
263							=head1 NAME
264
265							Spreadsheet::Compare::Reader::CSV - CSV File Adapter for Spreadsheet::Compare
266
267							=head1 DESCRIPTION
268
269							This module provides a fetch interface for reading records from CSV files. It uses
270							L<Text::CSV\|https://metacpan.org/pod/Text::CSV> to do the heavy lifting. This allows
271							the interface to have maximal flexibility.
272
273							=head1 ATTRIBUTES
274
275							If not stated otherwise, read write attributes can be set as options from the config file
276							passed to L<Spreadsheet::Compare> or L<spreadcomp>.
277
278							=head2 csv
279
280							(B<readonly>) The L<Text::CSV> instance.
281
282							=head2 csv_options
283
284							possible values: <hash>
285							default: { allow_whitespace : 1 }
286
287							Example:
288
289							csv_options:
290							allow_loose_quotes: 1
291							allow_whitespace: 1
292							sep: ';'
293
294							A reference to a hash with options for calling the L<Text::CSV> constructor.
295
296							=head2 filehandle
297
298							(B<readonly>) The filehandle for L</filename>.
299
300							=head2 filename
301
302							(B<readonly>) The filename of the used CSV file for this reader. Use L</files> for
303							filename specification.
304
305							=head2 files
306
307							possible values: <list of exactly 2 filenames>
308							default: []
309
310							Example:
311
312							files:
313							- ./left_dir/data.csv
314							- ./right_dir/data.csv
315
316							Relative filenames will be interpreted releative to L</rootdir>
317
318							=head2 fix_empty_header
319
320							possible values: <bool>
321							default: 1
322
323							If a header entry does not contain at least one non space character replace it with
324							'unnamed_<n>' with a simple counter <n>;
325
326							=head2 make_header_unique
327
328							possible values: <bool>
329							default: 0
330
331							If there should be duplicate header names, append an counter '_<n>' to make the header name unique.
332
333							=head2 rootdir
334
335							Set by L<Spreadsheet::Compare> during reader initialisation.
336							Same as L<Spreadsheet::Compare/rootdir>.
337
338							=head2 sep_auto
339
340							possible values: <list of possible separators>
341							default: undef
342
343							Example:
344
345							sep_auto: [ ";", ",", "\|", "\t" ]
346
347							Set the list of possible separators in header detection. If left undefined the
348							value set by B<sep> or B<sep_char> in L</csv_options> will be used.
349							(see L<Text::CSV/sep_set>).
350
351							=head2 skip_after_head
352
353							possible values: <integer>
354							default: 0
355
356							Number of lines to skip after reading the header line.
357
358							=head2 skip_before_head
359
360							possible values: <integer>
361							default: 0
362
363							Number of lines to skip at the beginning of the files before reading the
364							header line.
365
366							=head1 METHODS
367
368							L<Spreadsheet::Compare::Reader::CSV> inherits or overwrites all methods from L<Spreadsheet::Compare::Reader>.
369
370							=cut