File Coverage

blib/lib/Data/TableReader.pm
Criterion Covered Total %
statement 284 398 71.3
branch 132 244 54.1
condition 33 84 39.2
subroutine 36 44 81.8
pod 5 6 83.3
total 490 776 63.1


line stmt bran cond sub pod time code
1             package Data::TableReader;
2             $Data::TableReader::VERSION = '0.010';
3 4     4   4782 use Moo 2;
  4         32499  
  4         24  
4 4     4   4711 use Try::Tiny;
  4         10  
  4         213  
5 4     4   25 use Carp;
  4         7  
  4         201  
6 4     4   25 use List::Util 'max';
  4         6  
  4         409  
7 4     4   28 use Module::Runtime 'require_module';
  4         15  
  4         25  
8 4     4   2261 use Data::TableReader::Field;
  4         13  
  4         161  
9 4     4   1782 use Data::TableReader::Iterator;
  4         10  
  4         22016  
10              
11             # ABSTRACT: Extract records from "dirty" tabular data sources
12              
13              
14             has input => ( is => 'rw', required => 1 );
15             has _file_handle => ( is => 'lazy' );
16             has _decoder_arg => ( is => 'rw', init_arg => 'decoder' );
17             has decoder => ( is => 'lazy', init_arg => undef );
18             has fields => ( is => 'rw', required => 1, coerce => \&_coerce_field_list );
19 25     25 0 67 sub field_list { @{ shift->fields } }
  25         463  
20             has field_by_name => ( is => 'lazy' );
21             has record_class => ( is => 'rw', required => 1, default => sub { 'HASH' } );
22             has static_field_order => ( is => 'rw' ); # force order of columns
23             has header_row_at => ( is => 'rw', default => sub { [1,10] } ); # row of header, or range to scan
24             has header_row_combine => ( is => 'rw', lazy => 1, builder => 1 );
25             has on_unknown_columns => ( is => 'rw', default => sub { 'use' } );
26             has on_blank_row => ( is => 'rw', default => sub { 'next' } );
27             has on_validation_fail => ( is => 'rw', default => sub { 'die' } );
28             has log => ( is => 'rw', trigger => sub { shift->_clear_log } );
29              
30             # Open 'input' if it isn't already a file handle
31             sub _build__file_handle {
32 13     13   139 my $self= shift;
33 13         31 my $i= $self->input;
34 13 100 66     100 return undef if ref($i) && (ref($i) eq "Spreadsheet::ParseExcel::Worksheet");
35 12 100 66     180 return $i if ref($i) && (ref($i) eq 'GLOB' or ref($i)->can('read'));
      33        
36 8 50   2   212 open(my $fh, '<', $i) or croak "open($i): $!";
  2         16  
  2         5  
  2         17  
37 8         1824 binmode $fh;
38 8         238 return $fh;
39             }
40              
41             # Create ::Decoder instance either from user-supplied args, or by detecting input format
42             sub _build_decoder {
43 13     13   148 my $self= shift;
44 13         42 my $decoder_arg= $self->_decoder_arg;
45 13         31 my $decoder_ref= ref $decoder_arg;
46 13         28 my ($class, @args);
47 13 100 33     70 if (!$decoder_arg) {
    100          
    50          
    0          
48 1         4 ($class, @args)= $self->detect_input_format;
49 1         21 $self->_log->('trace', "Detected input format as %s", $class);
50             }
51             elsif (!$decoder_ref) {
52 4         7 $class= $decoder_arg;
53             }
54             elsif ($decoder_ref eq "HASH" or $decoder_ref eq "ARRAY") {
55 8 50       22 ($class, @args)= $decoder_ref eq "ARRAY"? @$decoder_arg : do {
56 8         36 my %tmp= %$decoder_arg;
57 8         47 (delete($tmp{CLASS}), %tmp);
58             };
59 8 50       31 if(!$class) {
60 0         0 my ($input_class, @input_args)= $self->detect_input_format;
61 0 0       0 croak "decoder class not in arguments and unable to identify decoder class from input"
62             if !$input_class;
63 0         0 ($class, @args)= ($input_class, @input_args, @args);
64             }
65             }
66             elsif ($decoder_ref->can('iterator')) {
67 0         0 return $decoder_arg;
68             }
69             else {
70 0         0 croak "Can't create decoder from $decoder_ref";
71             }
72 13 50       64 $class= "Data::TableReader::Decoder::$class"
73             unless $class =~ /::/;
74 13 50       66 require_module($class) or croak "$class does not exist or is not installed";
75 13         639 $self->_log->('trace', 'Creating decoder %s on input %s', $class, $self->input);
76 13 100 100     385 return $class->new(
77             file_name => ($self->input eq ($self->_file_handle||"") ? '' : $self->input),
78             file_handle => $self->_file_handle,
79             _log => $self->_log,
80             @args
81             );
82             }
83              
84             # User supplies any old perl data, but this field should always be an arrayref of ::Field
85             sub _coerce_field_list {
86 13     13   41420 my ($list)= @_;
87 13 50 33     94 defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
88 13         42 my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
89 13         36 for (@list) {
90 34 100       191 if (!ref $_) {
    50          
91 5         87 $_= Data::TableReader::Field->new({ name => $_ });
92             } elsif (ref $_ eq 'HASH') {
93 29         108 my %args= %$_;
94             # "isa" alias for the 'type' attribute
95 29 50 33     92 $args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
96 29         507 $_= Data::TableReader::Field->new(\%args)
97             } else {
98 0         0 croak "Can't coerce '$_' to a Field object"
99             }
100             }
101 13         273 return \@list;
102             }
103              
104             sub _build_field_by_name {
105 0     0   0 my $self= shift;
106             # reverse list so first field of a name takes precedence
107 0         0 { map { $_->name => $_ } reverse @{ $self->fields } }
  0         0  
  0         0  
  0         0  
108             }
109              
110             sub _build_header_row_combine {
111 12     12   570 my $self= shift;
112             # If headers contain "\n", we need to collect multiple cells per column
113 12         32 max map { 1+(()= ($_->header_regex =~ /\\n|\n/g)) } $self->field_list;
  34         758  
114             }
115              
116             # 'log' can be a variety of things, but '_log' will always be a coderef
117             has _log => ( is => 'lazy', clearer => 1 );
118             sub _build__log {
119 13     13   333 _log_fn(shift->log);
120             }
121             sub _log_fn {
122 13     13   102 my $dest= shift;
123             !$dest? sub {
124 2     2   11 my ($level, $msg, @args)= @_;
125 2 50 33     15 return unless $level eq 'warn' or $level eq 'error';
126 0 0       0 $msg= sprintf($msg, @args) if @args;
127 0         0 warn $msg."\n";
128             }
129             : ref $dest eq 'ARRAY'? sub {
130 11     11   54 my ($level, $msg, @args)= @_;
131 11 100 66     48 return unless $level eq 'warn' or $level eq 'error';
132 1 50       7 $msg= sprintf($msg, @args) if @args;
133 1         4 push @$dest, [ $level, $msg ];
134             }
135             : ref($dest)->can('info')? sub {
136 118     118   848 my ($level, $msg, @args)= @_;
137 118 100       550 $dest->$level( @args? sprintf($msg, @args) : $msg )
    100          
138             if $dest->can('is_'.$level)->($dest);
139             }
140 13 50       332 : croak "Don't know how to log to $dest";
    100          
    100          
141             }
142              
143              
144             sub detect_input_format {
145 1     1 1 3 my ($self, $filename, $magic)= @_;
146              
147 1         4 my $input= $self->input;
148             # As convenience to spreadsheet users, let input be a parsed workbook/worksheet object.
149 1 50 33     19 return ('XLSX', sheet => $input)
150             if ref($input) && ref($input)->can('get_cell');
151 0 0 0     0 return ('XLSX', workbook => $input)
152             if ref($input) && ref($input)->can('worksheets');
153              
154             # Load first block of file, unless supplied
155 0         0 my $fpos;
156 0 0       0 if (!defined $magic) {
157 0         0 my $fh= $self->_file_handle;
158             # Need to be able to seek.
159 0 0       0 if (seek($fh, 0, 1)) {
    0          
160 0         0 $fpos= tell $fh;
161 0         0 read($fh, $magic, 4096);
162 0 0       0 seek($fh, $fpos, 0) or croak "seek: $!";
163             }
164             elsif ($fh->can('ungets')) {
165 0         0 $fpos= 0; # to indicate that we did try reading the file
166 0         0 read($fh, $magic, 4096);
167 0         0 $fh->ungets($magic);
168             }
169             else {
170 0         0 $self->_log->('notice',"Can't fully detect input format because handle is not seekable."
171             ." Consider fully buffering the file, or using FileHandle::Unget");
172 0         0 $magic= '';
173             }
174             }
175              
176             # Excel is obvious so check it first. This handles cases where an excel file is
177             # erroneously named ".csv" and sillyness like that.
178 0 0       0 return ( 'XLSX' ) if $magic =~ /^PK(\x03\x04|\x05\x06|\x07\x08)/;
179 0 0       0 return ( 'XLS' ) if $magic =~ /^\xD0\xCF\x11\xE0/;
180              
181             # Else trust the file extension, because TSV with commas can be very similar to CSV with
182             # tabs in the data, and some crazy person might store an HTML document as the first element
183             # of a CSV file.
184             # Detect filename if not supplied
185 0 0       0 if (!defined $filename) {
186 0         0 $filename= '';
187 0 0 0     0 $filename= "$input" if defined $input and (!ref $input || ref($input) =~ /path|file/i);
      0        
188             }
189 0 0       0 if ($filename =~ /\.([^.]+)$/) {
190 0         0 my $suffix= uc($1);
191 0 0       0 return 'HTML' if $suffix eq 'HTM';
192 0         0 return $suffix;
193             }
194              
195             # Else probe some more...
196 0         0 $self->_log->('debug',"Probing file format because no filename suffix");
197 0 0       0 length $magic or croak "Can't probe format. No filename suffix, and "
    0          
198             .(!defined $fpos? "unseekable file handle" : "no content");
199              
200             # HTML is pretty obvious
201 0 0       0 return 'HTML' if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?<(!DOCTYPE )HTML/i;
202             # Else guess between CSV and TSV
203 0         0 my ($probably_csv, $probably_tsv)= (0,0);
204 0 0       0 ++$probably_csv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?,/;
205 0 0       0 ++$probably_tsv if $magic =~ /^(\xEF\xBB\xBF|\xFF\xFE|\xFE\xFF)?["']?[\w ]+["']?\t/;
206 0         0 my $comma_count= () = ($magic =~ /,/g);
207 0         0 my $tab_count= () = ($magic =~ /\t/g);
208 0         0 my $eol_count= () = ($magic =~ /\n/g);
209 0 0 0     0 ++$probably_csv if $comma_count > $eol_count and $comma_count > $tab_count;
210 0 0 0     0 ++$probably_tsv if $tab_count > $eol_count and $tab_count > $comma_count;
211 0         0 $self->_log->('debug', 'probe results: comma_count=%d tab_count=%d eol_count=%d probably_csv=%d probably_tsv=%d',
212             $comma_count, $tab_count, $eol_count, $probably_csv, $probably_tsv);
213 0 0 0     0 return 'CSV' if $probably_csv and $probably_csv > $probably_tsv;
214 0 0 0     0 return 'TSV' if $probably_tsv and $probably_tsv > $probably_csv;
215 0         0 croak "Can't determine file format";
216             }
217              
218              
219             has _table_found => ( is => 'rw', lazy => 1, builder => 1, clearer => 1, predicate => 1 );
220             sub _build__table_found {
221 4     4   29 my $self= shift;
222 4         11 my %loc= ( croak_on_fail => 1 );
223 4         65 $self->_find_table($self->decoder->iterator, \%loc);
224 4         15 \%loc;
225             }
226              
227             sub find_table {
228 8     8 1 3833 my $self= shift;
229 8 50       41 return 1 if $self->_has_table_found;
230 8         13 my %loc;
231 8 50       205 if ($self->_find_table($self->decoder->iterator, \%loc)) {
232 8         177 $self->_table_found(\%loc);
233 8         88 return 1;
234             }
235 0         0 return 0;
236             }
237              
238 1     1 1 25 sub col_map { shift->_table_found->{col_map}; }
239 6     6 1 1680 sub field_map { shift->_table_found->{field_map}; }
240              
241             sub _find_table {
242 12     12   34 my ($self, $data_iter, $stash)= @_;
243 12   50     31 $stash ||= {};
244 12   66     39 while (!$self->_find_table_in_dataset($data_iter, $stash)
      66        
245             && !defined $stash->{fatal}
246             && $data_iter->next_dataset)
247             {}
248 12 50       34 if ($stash->{col_map}) {
249             # Calculate field map from col map
250 12         25 my $col_map= $stash->{col_map};
251 12         17 my %fmap;
252 12         39 for my $i (0 .. $#$col_map) {
253 57 100       110 next unless $col_map->[$i];
254 39 100       90 if ($col_map->[$i]->array) {
255 10         13 push @{ $fmap{$col_map->[$i]->name} }, $i;
  10         24  
256             } else {
257 29         80 $fmap{$col_map->[$i]->name}= $i;
258             }
259             }
260 12         87 $stash->{field_map}= \%fmap;
261             # And record the stream position of the start of the table
262 12         50 $stash->{first_record_pos}= $data_iter->tell;
263 12         28 $stash->{data_iter}= $data_iter;
264 12         36 return $stash;
265             }
266             else {
267 0   0     0 my $err= $stash->{fatal} || "Can't locate valid header";
268 0         0 $self->_log->('error', $err);
269 0 0       0 croak $err if $stash->{croak_on_fail};
270 0         0 return undef;
271             }
272             }
273              
274             sub _find_table_in_dataset {
275 13     13   30 my ($self, $data_iter, $stash)= @_;
276             # If header_row_at is undef, then there is no header.
277             # Ensure static_field_order, then set up columns.
278 13         42 my @fields= $self->field_list;
279 13         154 my $header_at= $self->header_row_at;
280 13 50       64 if (!defined $header_at) {
281 0 0       0 unless ($self->static_field_order) {
282 0         0 $stash->{fatal}= "You must enable 'static_field_order' if there is no header row";
283 0         0 return;
284             }
285 0         0 $stash->{col_map}= \@fields;
286 0         0 return 1;
287             }
288            
289             # If headers contain "\n", we need to collect multiple cells per column
290 13         232 my $row_accum= $self->header_row_combine;
291            
292 13 50       77 my ($start, $end)= ref $header_at? @$header_at : ( $header_at, $header_at );
293 13         24 my @rows;
294            
295             # If header_row_at doesn't start at 1, seek forward
296 13 50       42 if ($start > 1) {
297 0         0 $self->_log->('trace', 'Skipping to row %s', $start);
298 0         0 push @rows, $data_iter->() for 1..$start-1;
299             }
300            
301             # Scan through the rows of the dataset up to the end of header_row_at, accumulating rows so that
302             # multi-line regexes can match.
303 13         48 for ($start .. $end) {
304 22         146 my $vals= $data_iter->();
305 22 100       54 if (!$vals) { # if undef, we reached end of dataset
306 1         18 $self->_log->('trace', 'EOF');
307 1         9 last;
308             }
309 21 100       50 if ($row_accum > 1) {
310 5         10 push @rows, $vals;
311 5         14 shift @rows while @rows > $row_accum;
312 5         10 $vals= [ map { my $c= $_; join("\n", map $_->[$c], @rows) } 0 .. $#{$rows[-1]} ];
  28         42  
  28         79  
  5         14  
313 5         25 $stash->{context}= $row_accum.' rows ending at '.$data_iter->position;
314             } else {
315 16         53 $stash->{context}= $data_iter->position;
316             }
317 21         419 $self->_log->('trace', 'Checking for headers on %s', $stash->{context});
318 21         206 $stash->{context}.= ': ';
319 21 50       99 $stash->{col_map}= $self->static_field_order?
320             # If static field order, look for headers in sequence
321             $self->_match_headers_static($vals, $stash)
322             # else search for each header
323             : $self->_match_headers_dynamic($vals, $stash);
324 21 100       112 return 1 if $stash->{col_map};
325 9 50       23 return if $stash->{fatal};
326 9         217 $self->_log->('debug', '%sNo match', $stash->{context});
327             }
328 1         17 $self->_log->('warn','No row in dataset matched full header requirements');
329 1         548 return;
330             }
331              
332             sub _match_headers_static {
333 0     0   0 my ($self, $header, $stash)= @_;
334 0         0 my $fields= $self->fields;
335 0         0 for my $i (0 .. $#$fields) {
336 0 0       0 next if $header->[$i] =~ $fields->[$i]->header_regex;
337            
338             # Field header doesn't match. Start over on next row.
339 0   0     0 $self->_log->('debug','%sMissing field %s', $stash->{context}||'', $fields->[$i]->name);
340 0         0 return;
341             }
342             # found a match for every field!
343 0   0     0 $self->_log->('debug','%sFound!', $stash->{context}||'');
344 0         0 return $fields;
345             }
346              
347             sub _match_headers_dynamic {
348 21     21   46 my ($self, $header, $stash)= @_;
349 21   50     61 my $context= $stash->{context} || '';
350 21         31 my %col_map;
351 21         363 my $fields= $self->fields;
352             my $free_fields= $stash->{free_fields} ||= [
353             # Sort required fields to front, to fail faster on non-matching rows
354 31 100       102 sort { $a->required? -1 : $b->required? 1 : 0 }
    100          
355 21   100     168 grep { !$_->follows_list } @$fields
  34         85  
356             ];
357             my $follows_fields= $stash->{follows_fields} ||= [
358 21   100     70 grep { $_->follows_list } @$fields
  34         68  
359             ];
360 21         52 for my $f (@$free_fields) {
361 42         715 my $hr= $f->header_regex;
362 42         871 $self->_log->('debug', 'looking for %s', $hr);
363 42         382 my @found= grep { $header->[$_] =~ $hr } 0 .. $#$header;
  252         889  
364 42 100       169 if (@found == 1) {
    100          
    100          
365 30 50       80 if ($col_map{$found[0]}) {
366 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
367             $self->_log->('info','%sField %s and %s both match column %s',
368 0         0 $context, $f->name, $col_map{$found[0]}->name, $found[0]);
369 0         0 return;
370             }
371 30         107 $col_map{$found[0]}= $f;
372             }
373             elsif (@found > 1) {
374 1 50       5 if ($f->array) {
375             # Array columns may be found more than once
376 1         9 $col_map{$_}= $f for @found;
377             } else {
378 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
379 0         0 $self->_log->('info','%sField %s matches more than one column: %s',
380             $context, $f->name, join(', ', @found));
381 0         0 return;
382             }
383             }
384             elsif ($f->required) {
385 9         30 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
386 9         215 $self->_log->('info','%sNo match for required field %s', $context, $f->name);
387 9         5009 return;
388             }
389             # else Not required, and not found
390             }
391             # Need to have found at least one column (even if none required)
392 12 50       42 unless (keys %col_map) {
393 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
394 0         0 $self->_log->('debug','%sNo field headers found', $context);
395 0         0 return;
396             }
397             # Now, check for any of the 'follows' fields, some of which might also be 'required'.
398 12 100       31 if (@$follows_fields) {
399 2         4 my %following;
400             my %found;
401 2         7 for my $i (0 .. $#$header) {
402 17 100       34 if ($col_map{$i}) {
403 5         20 %following= ( $col_map{$i}->name => $col_map{$i} );
404             } else {
405 12         19 my $val= $header->[$i];
406 12         15 my @match;
407 12         21 for my $f (@$follows_fields) {
408 18 100       59 next unless grep $following{$_}, $f->follows_list;
409 9 100       151 push @match, $f if $val =~ $f->header_regex;
410             }
411 12 100       88 if (@match == 1) {
    50          
412 7 50 66     34 if ($found{$match[0]} && !$match[0]->array) {
413 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
414 0         0 $self->_log->('info','%sField %s matches multiple columns',
415             $context, $match[0]->name);
416 0         0 return;
417             }
418 7         14 $col_map{$i}= $match[0];
419 7         17 $found{$match[0]}= $i;
420 7         23 $following{$match[0]->name}= $match[0];
421             }
422             elsif (@match > 1) {
423 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
424 0         0 $self->_log->('info','%sField %s and %s both match column %d',
425             $context, $match[0]->name, $match[1]->name, $i+1);
426 0         0 return;
427             }
428             else {
429 5         14 %following= ();
430             }
431             }
432             }
433             # Check if any of the 'follows' fields were required
434 2 50       5 if (my @unfound= grep { !$found{$_} && $_->required } @$follows_fields) {
  3 50       17  
435 0         0 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
436             $self->_log->('info','%sNo match for required %s %s', $context,
437 0 0       0 (@unfound > 1? ('fields', join(', ', map { $_->name } sort @unfound))
  0         0  
438             : ('field', $unfound[0]->name)
439             ));
440 0         0 return;
441             }
442             }
443             # Now, if there are any un-claimed columns, handle per 'on_unknown_columns' setting.
444 12         33 my @unclaimed= grep { !$col_map{$_} } 0 .. $#$header;
  57         125  
445 12 100       39 if (@unclaimed) {
446 5         18 my $act= $self->on_unknown_columns;
447 5         18 my $unknown_list= join(', ', map $self->_fmt_header_text($header->[$_]), @unclaimed);
448 5 50       18 $act= $act->($self, $header, \@unclaimed) if ref $act eq 'CODE';
449 5 50       16 if ($act eq 'use') {
    0          
    0          
450 5         96 $self->_log->('warn','%sIgnoring unknown columns: %s', $context, $unknown_list);
451             } elsif ($act eq 'next') {
452 0         0 $self->_log->('warn','%sWould match except for unknown columns: %s',
453             $context, $unknown_list);
454             } elsif ($act eq 'die') {
455 0         0 $stash->{fatal}= "${context}Header row includes unknown columns: $unknown_list";
456             } else {
457 0         0 $stash->{fatal}= "Invalid action '$act' for 'on_unknown_columns'";
458             }
459 5         2457 $self->_log->('debug', 'search status: '._colmap_progress_str(\%col_map, $header));
460 5 50       59 return if $stash->{fatal};
461             }
462 12         96 return [ map $col_map{$_}, 0 .. $#$header ];
463             }
464             # Make header string readable for log messages
465             sub _fmt_header_text {
466 87 100   87   177 shift if ref $_[0];
467 87         117 my $x= shift;
468 87         155 $x =~ s/ ( [^[:print:]] ) / sprintf("\\x%02X", ord $1 ) /gex;
  11         44  
469 87         473 qq{"$x"};
470             }
471             # format the colmap into a string
472             sub _colmap_progress_str {
473 14     14   34 my ($colmap, $headers)= @_;
474             join(' ', map {
475 14 100       37 $colmap->{$_}? $_.'='.$colmap->{$_}->name
  95         357  
476             : $_.':'._fmt_header_text($headers->[$_])
477             } 0 .. $#$headers)
478             }
479              
480              
481             sub iterator {
482 14     14 1 6775 my $self= shift;
483 14         383 my $fields= $self->fields;
484             # Creating the record iterator consumes the data source's iterator.
485             # The first time after detecting the table, we continue with the same iterator.
486             # Every time after that we need to create a new data iterator and seek to the
487             # first record under the header.
488 14         296 my $data_iter= delete $self->_table_found->{data_iter};
489 14 100       92 unless ($data_iter) {
490 2         35 $data_iter= $self->decoder->iterator;
491 2         36 $data_iter->seek($self->_table_found->{first_record_pos});
492             }
493 14         243 my $col_map= $self->_table_found->{col_map};
494 14         280 my $field_map= $self->_table_found->{field_map};
495 14         140 my @row_slice; # one column index per field, and possibly more for array_val_map
496             my @arrayvals; # list of source index and destination index for building array values
497 14         0 my @field_names; # ordered list of field names where row slice should be assigned
498 14         0 my @trim_idx; # list of array indicies which should be whitespace-trimmed.
499 14         0 my @blank_val; # blank value per each fetched column
500 14         0 my @type_check;# list of
501 14         0 my $class; # optional object class for the resulting rows
502              
503             # If result is array, the slice of the row must match the position of the fields in the
504             # $self->fields array. If a field was not found it will get an undef for that slot.
505             # It also results in an undef for secondary fields of the same name as the first.
506 14 50       58 if ($self->record_class eq 'ARRAY') {
507 0         0 my %remaining= %$field_map;
508             @row_slice= map {
509 0         0 my $src= delete $remaining{$_->name};
  0         0  
510 0 0       0 defined $src? $src : 0x7FFFFFFF
511             } @$fields;
512             }
513             # If result is anything else, then only slice out the columns that are used for the fields
514             # that we located.
515             else {
516 14 50       60 $class= $self->record_class
517             unless 'HASH' eq $self->record_class;
518 14         55 @field_names= keys %$field_map;
519 14         38 @row_slice= values %$field_map;
520             }
521             # For any field whose value is an array of more that one source column,
522             # encode those details in @arrayvals, and update @row_slice and @trim_idx accordingly
523 14         79 for (0 .. $#row_slice) {
524 37 100       82 if (!ref $row_slice[$_]) {
525 35         58 my $field= $col_map->[$row_slice[$_]];
526 35 50       96 push @trim_idx, $_ if $field->trim;
527 35         71 push @blank_val, $field->blank;
528 35 100       103 push @type_check, $self->_make_validation_callback($field, $_)
529             if $field->type;
530             }
531             else {
532             # This field is an array-value, so add the src columns to @row_slice
533             # and list it in @arrayvals, and update @trim_idx if needed
534 2         4 my $src= $row_slice[$_];
535 2         4 $row_slice[$_]= 0x7FFFFFFF;
536 2         4 my $from= @row_slice;
537 2         7 push @row_slice, @$src;
538 2         5 push @arrayvals, [ $_, $from, scalar @$src ];
539 2         6 for ($from .. $#row_slice) {
540 10         15 my $field= $col_map->[$row_slice[$_]];
541 10 50       23 push @trim_idx, $_ if $field->trim;
542 10         19 push @blank_val, $field->blank;
543 10 50       23 push @type_check, $self->_make_validation_callback($field, $_)
544             if $field->type;
545             }
546             }
547             }
548 14         30 @arrayvals= reverse @arrayvals;
549 14         27 my ($n_blank, $first_blank, $eof);
550             my $sub= sub {
551 48 100 50 48   2556 again:
      66        
552             # Pull the specific slice of the next row that we need
553             my $row= !$eof && $data_iter->(\@row_slice)
554             or ++$eof && return undef;
555             # Apply 'trim' to any column whose field requested it
556 35         97 for (grep { defined } @{$row}[@trim_idx]) {
  89         193  
  35         71  
557 85         162 $_ =~ s/\s+$//;
558 85         161 $_ =~ s/^\s+//;
559             }
560             # Apply 'blank value' to every column which is zero length
561 35         59 $n_blank= 0;
562             $row->[$_]= $blank_val[$_]
563 35 100 100     76 for grep { (!defined $row->[$_] || !length $row->[$_]) && ++$n_blank } 0..$#$row;
  93         374  
564             # If all are blank, then handle according to $on_blank_row setting
565 35 100       133 if ($n_blank == @$row) {
    100          
566 2   33     12 $first_blank ||= $data_iter->position;
567 2         13 goto again;
568             } elsif ($first_blank) {
569 2 50       7 unless ($self->_handle_blank_row($first_blank, $data_iter->position)) {
570 0         0 $eof= 1;
571 0         0 return undef;
572             }
573 2         7 $first_blank= undef;
574             }
575             # Check type constraints, if any
576 33 100       95 if (@type_check) {
577 18 100       40 if (my @failed= map $_->($row), @type_check) {
578 6 100       22 $self->_handle_validation_fail(\@failed, $row, $data_iter->position.': ')
579             or goto again;
580             }
581             }
582             # Collect all the array-valued fields from the tail of the row
583 29         71 $row->[$_->[0]]= [ splice @$row, $_->[1], $_->[2] ] for @arrayvals;
584             # stop here if the return class is 'ARRAY'
585 29 50       59 return $row unless @field_names;
586             # Convert the row to a hashref
587 29         43 my %rec;
588 29         104 @rec{@field_names}= @$row;
589             # Construct a class, if requested, else return hashref
590 29 50       223 return $class? $class->new(\%rec) : \%rec;
591 14         110 };
592 14         124 return Data::TableReader::_RecIter->new(
593             $sub, { data_iter => $data_iter, reader => $self },
594             );
595             }
596              
597             sub _make_validation_callback {
598 4     4   9 my ($self, $field, $index)= @_;
599 4         8 my $t= $field->type;
600             ref $t eq 'CODE'? sub {
601 18     18   54 my $e= $t->($_[0][$index]);
602 18 100       135 defined $e? ([ $field, $index, $e ]) : ()
603             }
604             : $t->can('validate')? sub {
605 0     0   0 my $e= $t->validate($_[0][$index]);
606 0 0       0 defined $e? ([ $field, $index, $e ]) : ()
607             }
608 4 0       27 : croak "Invalid type constraint $t on field ".$field->name;
    50          
609             }
610              
611             sub _handle_blank_row {
612 2     2   22 my ($self, $first, $last)= @_;
613 2         11 my $act= $self->on_blank_row;
614 2 50       10 $act= $act->($self, $first, $last)
615             if ref $act eq 'CODE';
616 2 50       21 if ($act eq 'next') {
617 2         48 $self->_log->('warn', 'Skipping blank rows from %s until %s', $first, $last);
618 2         1183 return 1;
619             }
620 0 0       0 if ($act eq 'last') {
621 0         0 $self->_log->('warn', 'Ending at blank row %s', $first);
622 0         0 return 0;
623             }
624 0 0       0 if ($act eq 'die') {
625 0         0 my $msg= "Encountered blank rows at $first..$last";
626 0         0 $self->_log->('error', $msg);
627 0         0 croak $msg;
628             }
629 0         0 croak "Invalid value for 'on_blank_row': \"$act\"";
630             }
631              
632             sub _handle_validation_fail {
633 6     6   17 my ($self, $failures, $values, $context)= @_;
634 6         16 my $act= $self->on_validation_fail;
635 6 100       27 $act= $act->($self, $failures, $values, $context)
636             if ref $act eq 'CODE';
637 6         47 my $errors= join(', ', map $_->[0]->name.': '.$_->[2], @$failures);
638 6 100       15 if ($act eq 'next') {
639 3 50       63 $self->_log->('warn', "%sSkipped for data errors: %s", $context, $errors) if $errors;
640 3         1312 return 0;
641             }
642 3 100       8 if ($act eq 'use') {
643 2 100       25 $self->_log->('warn', "%sPossible data errors: %s", $context, $errors) if $errors;
644 2         7 return 1;
645             }
646 1 50       4 if ($act eq 'die') {
647 1         9 my $msg= "${context}Invalid record: $errors";
648 1         27 $self->_log->('error', $msg);
649 1         749 croak $msg;
650             }
651             }
652              
653 4     4   1084 BEGIN { @Data::TableReader::_RecIter::ISA= ( 'Data::TableReader::Iterator' ) }
654             sub Data::TableReader::_RecIter::all {
655 8     8   16 my $self= shift;
656 8         17 my (@rec, $x);
657 8         18 push @rec, $x while ($x= $self->());
658 8         226 return \@rec;
659             }
660             sub Data::TableReader::_RecIter::position {
661 0     0   0 shift->_fields->{data_iter}->position(@_);
662             }
663             sub Data::TableReader::_RecIter::progress {
664 0     0   0 shift->_fields->{data_iter}->progress(@_);
665             }
666             sub Data::TableReader::_RecIter::tell {
667 0     0   0 shift->_fields->{data_iter}->tell(@_);
668             }
669             sub Data::TableReader::_RecIter::seek {
670 0     0   0 shift->_fields->{data_iter}->seek(@_);
671             }
672             sub Data::TableReader::_RecIter::next_dataset {
673             shift->_fields->{reader}->_log
674 0     0   0 ->('warn',"Searching for supsequent table headers is not supported yet");
675 0         0 return 0;
676             }
677              
678             1;
679              
680             __END__