File Coverage

blib/lib/Data/TableReader.pm

Criterion	Covered	Total	%
statement	284	398	71.3
branch	132	244	54.1
condition	33	84	39.2
subroutine	36	44	81.8
pod	5	6	83.3
total	490	776	63.1

line	stmt	bran	cond	sub	pod	time	code
1							package Data::TableReader;
2							$Data::TableReader::VERSION = '0.010';
3	4			4		4782	use Moo 2;
	4					32499
	4					24
4	4			4		4711	use Try::Tiny;
	4					10
	4					213
5	4			4		25	use Carp;
	4					7
	4					201
6	4			4		25	use List::Util 'max';
	4					6
	4					409
7	4			4		28	use Module::Runtime 'require_module';
	4					15
	4					25
8	4			4		2261	use Data::TableReader::Field;
	4					13
	4					161
9	4			4		1782	use Data::TableReader::Iterator;
	4					10
	4					22016
10
11							# ABSTRACT: Extract records from "dirty" tabular data sources
12
13
14							has input => ( is => 'rw', required => 1 );
15							has _file_handle => ( is => 'lazy' );
16							has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
17							has decoder => ( is => 'lazy', init_arg => undef );
18							has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
19	25			25	0	67	sub field_list { @{ shift->fields } }
	25					463
20							has field_by_name => ( is => 'lazy' );
21							has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
22							has static_field_order => ( is => 'rw' ); # force order of columns
23							has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
24							has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
25							has on_unknown_columns => ( is => 'rw', default => sub { 'use' } );
26							has on_blank_row => ( is => 'rw', default => sub { 'next' } );
27							has on_validation_fail => ( is => 'rw', default => sub { 'die' } );
28							has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
29
30							# Open 'input' if it isn't already a file handle
31							sub _build__file_handle {
32	13			13		139	my $self= shift;
33	13					31	my $i= $self->input;
34	13	100	66			100	return undef if ref($i) && (ref($i) eq "Spreadsheet::ParseExcel::Worksheet");
35	12	100	66			180	return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
			33
36	8	50		2		212	open(my $fh, '<', $i) or croak "open($i): $!";
	2					16
	2					5
	2					17
37	8					1824	binmode $fh;
38	8					238	return $fh;
39							}
40
41							# Create ::Decoder instance either from user-supplied args, or by detecting input format
42							sub _build_decoder {
43	13			13		148	my $self= shift;
44	13					42	my $decoder_arg= $self->_decoder_arg;
45	13					31	my $decoder_ref= ref $decoder_arg;
46	13					28	my ($class, @args);
47	13	100	33			70	if (!$decoder_arg) {
		100
		50
		0
48	1					4	($class, @args)= $self->detect_input_format;
49	1					21	$self->_log->('trace', "Detected input format as %s", $class);
50							}
51							elsif (!$decoder_ref) {
52	4					7	$class= $decoder_arg;
53							}
54							elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
55	8	50				22	($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
56	8					36	my %tmp= %$decoder_arg;
57	8					47	(delete($tmp{CLASS}), %tmp);
58							};
59	8	50				31	if(!$class) {
60	0					0	my ($input_class, @input_args)= $self->detect_input_format;
61	0	0				0	croak "decoder class not in arguments and unable to identify decoder class from input"
62							if !$input_class;
63	0					0	($class, @args)= ($input_class, @input_args, @args);
64							}
65							}
66							elsif ($decoder_ref->can('iterator')) {
67	0					0	return $decoder_arg;
68							}
69							else {
70	0					0	croak "Can't create decoder from $decoder_ref";
71							}
72	13	50				64	$class= "Data::TableReader::Decoder::$class"
73							unless $class =~ /::/;
74	13	50				66	require_module($class) or croak "$class does not exist or is not installed";
75	13					639	$self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
76	13	100	100			385	return $class->new(
77							file_name => ($self->input eq ($self->_file_handle\|\|"") ? '' : $self->input),
78							file_handle => $self->_file_handle,
79							_log => $self->_log,
80							@args
81							);
82							}
83
84							# User supplies any old perl data, but this field should always be an arrayref of ::Field
85							sub _coerce_field_list {
86	13			13		41420	my ($list)= @_;
87	13	50	33			94	defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
88	13					42	my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
89	13					36	for (@list) {
90	34	100				191	if (!ref $_) {
		50
91	5					87	$_= Data::TableReader::Field->new({ name => $_ });
92							} elsif (ref $_ eq 'HASH') {
93	29					108	my %args= %$_;
94							# "isa" alias for the 'type' attribute
95	29	50	33			92	$args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
96	29					507	$_= Data::TableReader::Field->new(\%args)
97							} else {
98	0					0	croak "Can't coerce '$_' to a Field object"
99							}
100							}
101	13					273	return \@list;
102							}
103
104							sub _build_field_by_name {
105	0			0		0	my $self= shift;
106							# reverse list so first field of a name takes precedence
107	0					0	{ map { $_->name => $_ } reverse @{ $self->fields } }
	0					0
	0					0
	0					0
108							}
109
110							sub _build_header_row_combine {
111	12			12		570	my $self= shift;
112							# If headers contain "\n", we need to collect multiple cells per column
113	12					32	max map { 1+(()= ($_->header_regex =~ /\\n\|\n/g)) } $self->field_list;
	34					758
114							}
115
116							# 'log' can be a variety of things, but '_log' will always be a coderef
117							has _log => ( is => 'lazy', clearer => 1 );
118							sub _build__log {
119	13			13		333	_log_fn(shift->log);
120							}
121							sub _log_fn {
122	13			13		102	my $dest= shift;
123							!$dest? sub {
124	2			2		11	my ($level, $msg, @args)= @_;
125	2	50	33			15	return unless $level eq 'warn' or $level eq 'error';
126	0	0				0	$msg= sprintf($msg, @args) if @args;
127	0					0	warn $msg."\n";
128							}
129							: ref $dest eq 'ARRAY'? sub {
130	11			11		54	my ($level, $msg, @args)= @_;
131	11	100	66			48	return unless $level eq 'warn' or $level eq 'error';
132	1	50				7	$msg= sprintf($msg, @args) if @args;
133	1					4	push @$dest, [ $level, $msg ];
134							}
135							: ref($dest)->can('info')? sub {
136	118			118		848	my ($level, $msg, @args)= @_;
137	118	100				550	$dest->$level( @args? sprintf($msg, @args) : $msg )
		100
138							if $dest->can('is_'.$level)->($dest);
139							}
140	13	50				332	: croak "Don't know how to log to $dest";
		100
		100
141							}
142
143
144							sub detect_input_format {
145	1			1	1	3	my ($self, $filename, $magic)= @_;
146
147	1					4	my $input= $self->input;
148							# As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
149	1	50	33			19	return ('XLSX', sheet => $input)
150							if ref($input) && ref($input)->can('get_cell');
151	0	0	0			0	return ('XLSX', workbook => $input)
152							if ref($input) && ref($input)->can('worksheets');
153
154							# Load first block of file, unless supplied
155	0					0	my $fpos;
156	0	0				0	if (!defined $magic) {
157	0					0	my $fh= $self->_file_handle;
158							# Need to be able to seek.
159	0	0				0	if (seek($fh, 0, 1)) {
		0
160	0					0	$fpos= tell $fh;
161	0					0	read($fh, $magic, 4096);
162	0	0				0	seek($fh, $fpos, 0) or croak "seek: $!";
163							}
164							elsif ($fh->can('ungets')) {
165	0					0	$fpos= 0; # to indicate that we did try reading the file
166	0					0	read($fh, $magic, 4096);
167	0					0	$fh->ungets($magic);
168							}
169							else {
170	0					0	$self->_log->('notice',"Can't fully detect input format because handle is not seekable."
171							." Consider fully buffering the file, or using FileHandle::Unget");
172	0					0	$magic= '';
173							}
174							}
175
176							# Excel is obvious so check it first. This handles cases where an excel file is
177							# erroneously named ".csv" and sillyness like that.
178	0	0				0	return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04\|\x05\x06\|\x07\x08)/;
179	0	0				0	return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
180
181							# Else trust the file extension, because TSV with commas can be very similar to CSV with
182							# tabs in the data, and some crazy person might store an HTML document as the first element
183							# of a CSV file.
184							# Detect filename if not supplied
185	0	0				0	if (!defined $filename) {
186	0					0	$filename= '';
187	0	0	0			0	$filename= "$input" if defined $input and (!ref $input \|\| ref($input) =~ /path\|file/i);
			0
188							}
189	0	0				0	if ($filename =~ /\.([^.]+)$/) {
190	0					0	my $suffix= uc($1);
191	0	0				0	return 'HTML' if $suffix eq 'HTM';
192	0					0	return $suffix;
193							}
194
195							# Else probe some more...
196	0					0	$self->_log->('debug',"Probing file format because no filename suffix");
197	0	0				0	length $magic or croak "Can't probe format. No filename suffix, and "
		0
198							.(!defined $fpos? "unseekable file handle" : "no content");
199
200							# HTML is pretty obvious
201	0	0				0	return 'HTML' if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?<(!DOCTYPE )HTML/i;
202							# Else guess between CSV and TSV
203	0					0	my ($probably_csv, $probably_tsv)= (0,0);
204	0	0				0	++$probably_csv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?,/;
205	0	0				0	++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF\|\xFF\xFE\|\xFE\xFF)?["']?[\w ]+["']?\t/;
206	0					0	my $comma_count= () = ($magic =~ /,/g);
207	0					0	my $tab_count= () = ($magic =~ /\t/g);
208	0					0	my $eol_count= () = ($magic =~ /\n/g);
209	0	0	0			0	++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
210	0	0	0			0	++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
211	0					0	$self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
212							$comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
213	0	0	0			0	return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
214	0	0	0			0	return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
215	0					0	croak "Can't determine file format";
216							}
217
218
219							has _table_found => ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
220							sub _build__table_found {
221	4			4		29	my $self= shift;
222	4					11	my %loc= ( croak_on_fail => 1 );
223	4					65	$self->_find_table($self->decoder->iterator, \%loc);
224	4					15	\%loc;
225							}
226
227							sub find_table {
228	8			8	1	3833	my $self= shift;
229	8	50				41	return 1 if $self->_has_table_found;
230	8					13	my %loc;
231	8	50				205	if ($self->_find_table($self->decoder->iterator, \%loc)) {
232	8					177	$self->_table_found(\%loc);
233	8					88	return 1;
234							}
235	0					0	return 0;
236							}
237
238	1			1	1	25	sub col_map { shift->_table_found->{col_map}; }
239	6			6	1	1680	sub field_map { shift->_table_found->{field_map}; }
240
241							sub _find_table {
242	12			12		34	my ($self, $data_iter, $stash)= @_;
243	12		50			31	$stash \|\|= {};
244	12		66			39	while (!$self->_find_table_in_dataset($data_iter, $stash)
			66
245							&& !defined $stash->{fatal}
246							&& $data_iter->next_dataset)
247							{}
248	12	50				34	if ($stash->{col_map}) {
249							# Calculate field map from col map
250	12					25	my $col_map= $stash->{col_map};
251	12					17	my %fmap;
252	12					39	for my $i (0 .. $#$col_map) {
253	57	100				110	next unless $col_map->[$i];
254	39	100				90	if ($col_map->[$i]->array) {
255	10					13	push @{ $fmap{$col_map->[$i]->name} }, $i;
	10					24
256							} else {
257	29					80	$fmap{$col_map->[$i]->name}= $i;
258							}
259							}
260	12					87	$stash->{field_map}= \%fmap;
261							# And record the stream position of the start of the table
262	12					50	$stash->{first_record_pos}= $data_iter->tell;
263	12					28	$stash->{data_iter}= $data_iter;
264	12					36	return $stash;
265							}
266							else {
267	0		0			0	my $err= $stash->{fatal} \|\| "Can't locate valid header";
268	0					0	$self->_log->('error', $err);
269	0	0				0	croak $err if $stash->{croak_on_fail};
270	0					0	return undef;
271							}
272							}
273
274							sub _find_table_in_dataset {
275	13			13		30	my ($self, $data_iter, $stash)= @_;
276							# If header_row_at is undef, then there is no header.
277							# Ensure static_field_order, then set up columns.
278	13					42	my @fields= $self->field_list;
279	13					154	my $header_at= $self->header_row_at;
280	13	50				64	if (!defined $header_at) {
281	0	0				0	unless ($self->static_field_order) {
282	0					0	$stash->{fatal}= "You must enable 'static_field_order' if there is no header row";
283	0					0	return;
284							}
285	0					0	$stash->{col_map}= \@fields;
286	0					0	return 1;
287							}
288
289							# If headers contain "\n", we need to collect multiple cells per column
290	13					232	my $row_accum= $self->header_row_combine;
291
292	13	50				77	my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
293	13					24	my @rows;
294
295							# If header_row_at doesn't start at 1, seek forward
296	13	50				42	if ($start > 1) {
297	0					0	$self->_log->('trace', 'Skipping to row %s', $start);
298	0					0	push @rows, $data_iter->() for 1..$start-1;
299							}
300
301							# Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
302							# multi-line regexes can match.
303	13					48	for ($start .. $end) {
304	22					146	my $vals= $data_iter->();
305	22	100				54	if (!$vals) { # if undef, we reached end of dataset
306	1					18	$self->_log->('trace', 'EOF');
307	1					9	last;
308							}
309	21	100				50	if ($row_accum > 1) {
310	5					10	push @rows, $vals;
311	5					14	shift @rows while @rows > $row_accum;
312	5					10	$vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
	28					42
	28					79
	5					14
313	5					25	$stash->{context}= $row_accum.' rows ending at '.$data_iter->position;
314							} else {
315	16					53	$stash->{context}= $data_iter->position;
316							}
317	21					419	$self->_log->('trace', 'Checking for headers on %s', $stash->{context});
318	21					206	$stash->{context}.= ': ';
319	21	50				99	$stash->{col_map}= $self->static_field_order?
320							# If static field order, look for headers in sequence
321							$self->_match_headers_static($vals, $stash)
322							# else search for each header
323							: $self->_match_headers_dynamic($vals, $stash);
324	21	100				112	return 1 if $stash->{col_map};
325	9	50				23	return if $stash->{fatal};
326	9					217	$self->_log->('debug', '%sNo match', $stash->{context});
327							}
328	1					17	$self->_log->('warn','No row in dataset matched full header requirements');
329	1					548	return;
330							}
331
332							sub _match_headers_static {
333	0			0		0	my ($self, $header, $stash)= @_;
334	0					0	my $fields= $self->fields;
335	0					0	for my $i (0 .. $#$fields) {
336	0	0				0	next if $header->[$i] =~ $fields->[$i]->header_regex;
337
338							# Field header doesn't match. Start over on next row.
339	0		0			0	$self->_log->('debug','%sMissing field %s', $stash->{context}\|\|'', $fields->[$i]->name);
340	0					0	return;
341							}
342							# found a match for every field!
343	0		0			0	$self->_log->('debug','%sFound!', $stash->{context}\|\|'');
344	0					0	return $fields;
345							}
346
347							sub _match_headers_dynamic {
348	21			21		46	my ($self, $header, $stash)= @_;
349	21		50			61	my $context= $stash->{context} \|\| '';
350	21					31	my %col_map;
351	21					363	my $fields= $self->fields;
352							my $free_fields= $stash->{free_fields} \|\|= [
353							# Sort required fields to front, to fail faster on non-matching rows
354	31	100				102	sort { $a->required? -1 : $b->required? 1 : 0 }
		100
355	21		100			168	grep { !$_->follows_list } @$fields
	34					85
356							];
357							my $follows_fields= $stash->{follows_fields} \|\|= [
358	21		100			70	grep { $_->follows_list } @$fields
	34					68
359							];
360	21					52	for my $f (@$free_fields) {
361	42					715	my $hr= $f->header_regex;
362	42					871	$self->_log->('debug', 'looking for %s', $hr);
363	42					382	my @found= grep { $header->[$_] =~ $hr } 0 .. $#$header;
	252					889
364	42	100				169	if (@found == 1) {
		100
		100
365	30	50				80	if ($col_map{$found[0]}) {
366	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
367							$self->_log->('info','%sField %s and %s both match column %s',
368	0					0	$context, $f->name, $col_map{$found[0]}->name, $found[0]);
369	0					0	return;
370							}
371	30					107	$col_map{$found[0]}= $f;
372							}
373							elsif (@found > 1) {
374	1	50				5	if ($f->array) {
375							# Array columns may be found more than once
376	1					9	$col_map{$_}= $f for @found;
377							} else {
378	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
379	0					0	$self->_log->('info','%sField %s matches more than one column: %s',
380							$context, $f->name, join(', ', @found));
381	0					0	return;
382							}
383							}
384							elsif ($f->required) {
385	9					30	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
386	9					215	$self->_log->('info','%sNo match for required field %s', $context, $f->name);
387	9					5009	return;
388							}
389							# else Not required, and not found
390							}
391							# Need to have found at least one column (even if none required)
392	12	50				42	unless (keys %col_map) {
393	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
394	0					0	$self->_log->('debug','%sNo field headers found', $context);
395	0					0	return;
396							}
397							# Now, check for any of the 'follows' fields, some of which might also be 'required'.
398	12	100				31	if (@$follows_fields) {
399	2					4	my %following;
400							my %found;
401	2					7	for my $i (0 .. $#$header) {
402	17	100				34	if ($col_map{$i}) {
403	5					20	%following= ( $col_map{$i}->name => $col_map{$i} );
404							} else {
405	12					19	my $val= $header->[$i];
406	12					15	my @match;
407	12					21	for my $f (@$follows_fields) {
408	18	100				59	next unless grep $following{$_}, $f->follows_list;
409	9	100				151	push @match, $f if $val =~ $f->header_regex;
410							}
411	12	100				88	if (@match == 1) {
		50
412	7	50	66			34	if ($found{$match[0]} && !$match[0]->array) {
413	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
414	0					0	$self->_log->('info','%sField %s matches multiple columns',
415							$context, $match[0]->name);
416	0					0	return;
417							}
418	7					14	$col_map{$i}= $match[0];
419	7					17	$found{$match[0]}= $i;
420	7					23	$following{$match[0]->name}= $match[0];
421							}
422							elsif (@match > 1) {
423	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
424	0					0	$self->_log->('info','%sField %s and %s both match column %d',
425							$context, $match[0]->name, $match[1]->name, $i+1);
426	0					0	return;
427							}
428							else {
429	5					14	%following= ();
430							}
431							}
432							}
433							# Check if any of the 'follows' fields were required
434	2	50				5	if (my @unfound= grep { !$found{$_} && $_->required } @$follows_fields) {
	3	50				17
435	0					0	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
436							$self->_log->('info','%sNo match for required %s %s', $context,
437	0	0				0	(@unfound > 1? ('fields', join(', ', map { $_->name } sort @unfound))
	0					0
438							: ('field', $unfound[0]->name)
439							));
440	0					0	return;
441							}
442							}
443							# Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
444	12					33	my @unclaimed= grep { !$col_map{$_} } 0 .. $#$header;
	57					125
445	12	100				39	if (@unclaimed) {
446	5					18	my $act= $self->on_unknown_columns;
447	5					18	my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unclaimed);
448	5	50				18	$act= $act->($self, $header, \@unclaimed) if ref $act eq 'CODE';
449	5	50				16	if ($act eq 'use') {
		0
		0
450	5					96	$self->_log->('warn','%sIgnoring unknown columns: %s', $context, $unknown_list);
451							} elsif ($act eq 'next') {
452	0					0	$self->_log->('warn','%sWould match except for unknown columns: %s',
453							$context, $unknown_list);
454							} elsif ($act eq 'die') {
455	0					0	$stash->{fatal}= "${context}Header row includes unknown columns: $unknown_list";
456							} else {
457	0					0	$stash->{fatal}= "Invalid action '$act' for 'on_unknown_columns'";
458							}
459	5					2457	$self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
460	5	50				59	return if $stash->{fatal};
461							}
462	12					96	return [ map $col_map{$_}, 0 .. $#$header ];
463							}
464							# Make header string readable for log messages
465							sub _fmt_header_text {
466	87	100		87		177	shift if ref $_[0];
467	87					117	my $x= shift;
468	87					155	$x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
	11					44
469	87					473	qq{"$x"};
470							}
471							# format the colmap into a string
472							sub _colmap_progress_str {
473	14			14		34	my ($colmap, $headers)= @_;
474							join(' ', map {
475	14	100				37	$colmap->{$_}? $_.'='.$colmap->{$_}->name
	95					357
476							: $_.':'._fmt_header_text($headers->[$_])
477							} 0 .. $#$headers)
478							}
479
480
481							sub iterator {
482	14			14	1	6775	my $self= shift;
483	14					383	my $fields= $self->fields;
484							# Creating the record iterator consumes the data source's iterator.
485							# The first time after detecting the table, we continue with the same iterator.
486							# Every time after that we need to create a new data iterator and seek to the
487							# first record under the header.
488	14					296	my $data_iter= delete $self->_table_found->{data_iter};
489	14	100				92	unless ($data_iter) {
490	2					35	$data_iter= $self->decoder->iterator;
491	2					36	$data_iter->seek($self->_table_found->{first_record_pos});
492							}
493	14					243	my $col_map= $self->_table_found->{col_map};
494	14					280	my $field_map= $self->_table_found->{field_map};
495	14					140	my @row_slice; # one column index per field, and possibly more for array_val_map
496							my @arrayvals; # list of source index and destination index for building array values
497	14					0	my @field_names; # ordered list of field names where row slice should be assigned
498	14					0	my @trim_idx; # list of array indicies which should be whitespace-trimmed.
499	14					0	my @blank_val; # blank value per each fetched column
500	14					0	my @type_check;# list of
501	14					0	my $class; # optional object class for the resulting rows
502
503							# If result is array, the slice of the row must match the position of the fields in the
504							# $self->fields array. If a field was not found it will get an undef for that slot.
505							# It also results in an undef for secondary fields of the same name as the first.
506	14	50				58	if ($self->record_class eq 'ARRAY') {
507	0					0	my %remaining= %$field_map;
508							@row_slice= map {
509	0					0	my $src= delete $remaining{$_->name};
	0					0
510	0	0				0	defined $src? $src : 0x7FFFFFFF
511							} @$fields;
512							}
513							# If result is anything else, then only slice out the columns that are used for the fields
514							# that we located.
515							else {
516	14	50				60	$class= $self->record_class
517							unless 'HASH' eq $self->record_class;
518	14					55	@field_names= keys %$field_map;
519	14					38	@row_slice= values %$field_map;
520							}
521							# For any field whose value is an array of more that one source column,
522							# encode those details in @arrayvals, and update @row_slice and @trim_idx accordingly
523	14					79	for (0 .. $#row_slice) {
524	37	100				82	if (!ref $row_slice[$_]) {
525	35					58	my $field= $col_map->[$row_slice[$_]];
526	35	50				96	push @trim_idx, $_ if $field->trim;
527	35					71	push @blank_val, $field->blank;
528	35	100				103	push @type_check, $self->_make_validation_callback($field, $_)
529							if $field->type;
530							}
531							else {
532							# This field is an array-value, so add the src columns to @row_slice
533							# and list it in @arrayvals, and update @trim_idx if needed
534	2					4	my $src= $row_slice[$_];
535	2					4	$row_slice[$_]= 0x7FFFFFFF;
536	2					4	my $from= @row_slice;
537	2					7	push @row_slice, @$src;
538	2					5	push @arrayvals, [ $_, $from, scalar @$src ];
539	2					6	for ($from .. $#row_slice) {
540	10					15	my $field= $col_map->[$row_slice[$_]];
541	10	50				23	push @trim_idx, $_ if $field->trim;
542	10					19	push @blank_val, $field->blank;
543	10	50				23	push @type_check, $self->_make_validation_callback($field, $_)
544							if $field->type;
545							}
546							}
547							}
548	14					30	@arrayvals= reverse @arrayvals;
549	14					27	my ($n_blank, $first_blank, $eof);
550							my $sub= sub {
551	48	100	50	48		2556	again:
			66
552							# Pull the specific slice of the next row that we need
553							my $row= !$eof && $data_iter->(\@row_slice)
554							or ++$eof && return undef;
555							# Apply 'trim' to any column whose field requested it
556	35					97	for (grep { defined } @{$row}[@trim_idx]) {
	89					193
	35					71
557	85					162	$_ =~ s/\s+$//;
558	85					161	$_ =~ s/^\s+//;
559							}
560							# Apply 'blank value' to every column which is zero length
561	35					59	$n_blank= 0;
562							$row->[$_]= $blank_val[$_]
563	35	100	100			76	for grep { (!defined $row->[$_] \|\| !length $row->[$_]) && ++$n_blank } 0..$#$row;
	93					374
564							# If all are blank, then handle according to $on_blank_row setting
565	35	100				133	if ($n_blank == @$row) {
		100
566	2		33			12	$first_blank \|\|= $data_iter->position;
567	2					13	goto again;
568							} elsif ($first_blank) {
569	2	50				7	unless ($self->_handle_blank_row($first_blank, $data_iter->position)) {
570	0					0	$eof= 1;
571	0					0	return undef;
572							}
573	2					7	$first_blank= undef;
574							}
575							# Check type constraints, if any
576	33	100				95	if (@type_check) {
577	18	100				40	if (my @failed= map $_->($row), @type_check) {
578	6	100				22	$self->_handle_validation_fail(\@failed, $row, $data_iter->position.': ')
579							or goto again;
580							}
581							}
582							# Collect all the array-valued fields from the tail of the row
583	29					71	$row->[$_->[0]]= [ splice @$row, $_->[1], $_->[2] ] for @arrayvals;
584							# stop here if the return class is 'ARRAY'
585	29	50				59	return $row unless @field_names;
586							# Convert the row to a hashref
587	29					43	my %rec;
588	29					104	@rec{@field_names}= @$row;
589							# Construct a class, if requested, else return hashref
590	29	50				223	return $class? $class->new(\%rec) : \%rec;
591	14					110	};
592	14					124	return Data::TableReader::_RecIter->new(
593							$sub, { data_iter => $data_iter, reader => $self },
594							);
595							}
596
597							sub _make_validation_callback {
598	4			4		9	my ($self, $field, $index)= @_;
599	4					8	my $t= $field->type;
600							ref $t eq 'CODE'? sub {
601	18			18		54	my $e= $t->($_[0][$index]);
602	18	100				135	defined $e? ([ $field, $index, $e ]) : ()
603							}
604							: $t->can('validate')? sub {
605	0			0		0	my $e= $t->validate($_[0][$index]);
606	0	0				0	defined $e? ([ $field, $index, $e ]) : ()
607							}
608	4	0				27	: croak "Invalid type constraint $t on field ".$field->name;
		50
609							}
610
611							sub _handle_blank_row {
612	2			2		22	my ($self, $first, $last)= @_;
613	2					11	my $act= $self->on_blank_row;
614	2	50				10	$act= $act->($self, $first, $last)
615							if ref $act eq 'CODE';
616	2	50				21	if ($act eq 'next') {
617	2					48	$self->_log->('warn', 'Skipping blank rows from %s until %s', $first, $last);
618	2					1183	return 1;
619							}
620	0	0				0	if ($act eq 'last') {
621	0					0	$self->_log->('warn', 'Ending at blank row %s', $first);
622	0					0	return 0;
623							}
624	0	0				0	if ($act eq 'die') {
625	0					0	my $msg= "Encountered blank rows at $first..$last";
626	0					0	$self->_log->('error', $msg);
627	0					0	croak $msg;
628							}
629	0					0	croak "Invalid value for 'on_blank_row': \"$act\"";
630							}
631
632							sub _handle_validation_fail {
633	6			6		17	my ($self, $failures, $values, $context)= @_;
634	6					16	my $act= $self->on_validation_fail;
635	6	100				27	$act= $act->($self, $failures, $values, $context)
636							if ref $act eq 'CODE';
637	6					47	my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
638	6	100				15	if ($act eq 'next') {
639	3	50				63	$self->_log->('warn', "%sSkipped for data errors: %s", $context, $errors) if $errors;
640	3					1312	return 0;
641							}
642	3	100				8	if ($act eq 'use') {
643	2	100				25	$self->_log->('warn', "%sPossible data errors: %s", $context, $errors) if $errors;
644	2					7	return 1;
645							}
646	1	50				4	if ($act eq 'die') {
647	1					9	my $msg= "${context}Invalid record: $errors";
648	1					27	$self->_log->('error', $msg);
649	1					749	croak $msg;
650							}
651							}
652
653	4			4		1084	BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
654							sub Data::TableReader::_RecIter::all {
655	8			8		16	my $self= shift;
656	8					17	my (@rec, $x);
657	8					18	push @rec, $x while ($x= $self->());
658	8					226	return \@rec;
659							}
660							sub Data::TableReader::_RecIter::position {
661	0			0		0	shift->_fields->{data_iter}->position(@_);
662							}
663							sub Data::TableReader::_RecIter::progress {
664	0			0		0	shift->_fields->{data_iter}->progress(@_);
665							}
666							sub Data::TableReader::_RecIter::tell {
667	0			0		0	shift->_fields->{data_iter}->tell(@_);
668							}
669							sub Data::TableReader::_RecIter::seek {
670	0			0		0	shift->_fields->{data_iter}->seek(@_);
671							}
672							sub Data::TableReader::_RecIter::next_dataset {
673							shift->_fields->{reader}->_log
674	0			0		0	->('warn',"Searching for supsequent table headers is not supported yet");
675	0					0	return 0;
676							}
677
678							1;
679
680							__END__