File Coverage

blib/lib/Spreadsheet/ExcelTableReader.pm
Criterion Covered Total %
statement 38 266 14.2
branch 3 130 2.3
condition 3 36 8.3
subroutine 12 33 36.3
pod 7 8 87.5
total 63 473 13.3


line stmt bran cond sub pod time code
1             package Spreadsheet::ExcelTableReader;
2 1     1   1699 use Moo 2;
  1         14795  
  1         6  
3 1     1   2964 use Spreadsheet::ParseExcel;
  1         73495  
  1         35  
4 1     1   20 use Spreadsheet::ParseExcel::Utility 'int2col';
  1         3  
  1         49  
5 1     1   860 use Spreadsheet::XLSX;
  1         92381  
  1         41  
6 1     1   767 use Log::Any '$log';
  1         5128  
  1         6  
7 1     1   5459 use Spreadsheet::ExcelTableReader::Field;
  1         4  
  1         34  
8 1     1   7 use Carp 'croak';
  1         2  
  1         47  
9 1     1   5 use IO::Handle;
  1         2  
  1         3456  
10              
11             our $VERSION= '0.000001_001';
12              
13             # ABSTRACT: Module to extract a table from somewhere within an Excel spreadsheet
14              
15              
16             has file => ( is => 'ro' );
17              
18              
19             has sheet => ( is => 'ro' );
20              
21             # Arrayref of all sheets we can search
22             has _sheets => ( is => 'lazy' );
23              
24              
25             has fields => ( is => 'ro', required => 1, coerce => \&_coerce_field_list );
26 1     1 1 391 sub field_list { @{ shift->fields } }
  1         8  
27              
28              
29             has find_table_args => ( is => 'rw' );
30              
31             has _table_location => ( is => 'rw', lazy_build => 1 );
32              
33             sub _build__sheets {
34 1     1   398 my $self= shift;
35            
36             # If we have ->sheet and it is a worksheet object, then no need to do anything else
37 1 50 33     26 if ($self->sheet && ref($self->sheet) && ref($self->sheet)->can('get_cell')) {
      33        
38 1         25 return [ $self->sheet ];
39             }
40            
41             # Else we need to scan sheets from the excel file. Make sure we have the file
42 0         0 my $wbook= $self->_open_workbook($self->file);
43 0         0 my @sheets= $wbook->worksheets;
44 0 0       0 @sheets or croak "No worksheets in file?";
45 0 0       0 if (defined $self->sheet) {
46 0 0       0 if (ref($self->sheet) eq 'Regexp') {
    0          
    0          
47 0         0 @sheets= grep { $_->get_name =~ $self->sheet } @sheets;
  0         0  
48             } elsif (ref($self->sheet) eq 'CODE') {
49 0         0 @sheets= grep { $self->sheet->($_) } @sheets;
  0         0  
50             } elsif (!ref $self->sheet) {
51 0         0 @sheets= grep { $_->get_name eq $self->sheet } @sheets;
  0         0  
52             } else {
53 0         0 croak "Unknown type of sheet specification: ".$self->sheet;
54             }
55             }
56 0 0       0 @sheets or croak "No worksheets match the specification";
57            
58 0         0 return \@sheets;
59             }
60              
61             sub _open_workbook {
62 0     0   0 my ($self, $f)= @_;
63            
64 0 0       0 defined $f or croak "workbook file is undefined";
65            
66 0         0 my $wbook;
67 0 0 0     0 if (ref($f) && ref($f)->can('worksheets')) {
68 0         0 $wbook= $f;
69             } else {
70 0         0 my $type= "xlsx";
71            
72             # Probe the file to determine type
73 0 0 0     0 if (ref($f) eq 'GLOB' or ref($f) && ref($f)->can('read')) {
    0 0        
74 0         0 my $fpos= $f->tell;
75 0 0       0 $fpos >= 0 or croak "File handle must be seekable";
76 0 0       0 $f->read(my $buf, 4) == 4 or croak "read($f,4): $!";
77 0 0       0 $f->seek($fpos, 0) or croak "failed to seek back to start of file";
78 0 0       0 $type= 'xls' if $buf eq "\xD0\xCF\x11\xE0";
79             }
80             elsif (-e $f) {
81 0         0 $f= "$f"; # force stringification
82 0 0       0 open my $fh, '<', $f or croak "open($f): $!";
83 0 0       0 read($fh, my $buf, 4) == 4 or croak "read($f,4): $!";
84 0 0       0 $type= 'xls' if $buf eq "\xD0\xCF\x11\xE0";
85             }
86             else {
87 0 0       0 $log->notice("Can't determine parser for '$f', guessing '$type'") if $log->is_notice;
88             }
89            
90 0 0       0 if ($type eq 'xlsx') {
91             # Spreadsheet::XLSX uses Archive::Zip which can *only* work on IO::Handle
92             # instances, not plain globrefs. (seems like a bug, hm)
93 0 0       0 if (ref($f) eq 'GLOB') {
94 0         0 require IO::File;
95 0         0 my $f_obj= IO::File->new;
96 0 0       0 $f_obj->fdopen($f, 'r') or croak "Can't convert GLOBref to IO::File";
97 0         0 $f= $f_obj;
98             }
99 0         0 $wbook= Spreadsheet::XLSX->new($f);
100             } else {
101 0         0 $wbook= Spreadsheet::ParseExcel->new->parse($f);
102             }
103 0 0       0 defined $wbook or croak "Can't parse file '".$self->file."'";
104             }
105 0         0 return $wbook;
106             }
107              
108             sub _build__table_location {
109 0     0   0 my $self= shift;
110 0         0 my $args= $self->find_table_args;
111 0 0       0 $self->find_table( !$args? () : (ref($args) eq 'ARRAY')? @$args : %$args )
    0          
    0          
112             or croak "No match for table header in excel file";
113 0         0 $self->{_table_location}; # find_table sets the value already, in a slight violation of this builder method.
114             }
115              
116             sub _coerce_field_list {
117 1     1   3487 my ($list)= @_;
118 1 50 33     12 defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
119 1         4 my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
120 1         2 for (@list) {
121 2 50       13 if (!ref $_) {
    0          
122 2         72 $_= Spreadsheet::ExcelTableReader::Field->new(
123             name => $_,
124             header => qr/^\s*\Q$_\E\s*$/i,
125             );
126             } elsif (ref $_ eq 'HASH') {
127 0         0 my %args= %$_;
128             # "isa" alias for the 'type' attribute
129 0 0 0     0 $args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
130             # default header to field name with optional whitespace
131 0 0       0 $args{header}= qr/^\s*\Q$args{name}\E\s*$/i unless defined $args{header};
132 0         0 $_= Spreadsheet::ExcelTableReader::Field->new( %args )
133             } else {
134 0         0 croak "Can't coerce '$_' to a Field object"
135             }
136             }
137 1         26 return \@list;
138             }
139              
140              
141             sub BUILD {
142 1     1 0 21 my $self= shift;
143             # Any errors getting the list of searchable worksheets should happen now, during construction time
144 1         5 $self->_sheets;
145             }
146              
147              
148             sub _cell_name {
149 0     0     my ($row, $col)= @_;
150 0           return int2col($col).($row+1);
151             }
152              
153             sub find_table {
154 0     0 1   my $self= shift;
155            
156 0           my $location;
157 0           my @sheets= @{$self->_sheets};
  0            
158 0           my @fields= $self->field_list;
159 0           my $num_required_fields= grep { $_->required } @fields;
  0            
160            
161             # Algorithm is O(N^4) in worst case, but the regex should make it more like O(N^2) in most
162             # real world cases. The worst case would be if every row of every sheet of the workbook almost
163             # matched the header row (which could happen with extremely lax field header patterns)
164 0           my $header_regex= qr/(?:@{[ join('|', map { $_->header_regex } @fields) ]})/ms;
  0            
  0            
165              
166             # Scan top-down across all sheets at once, since headers are probably at the top of the document.
167 0           my $row= 0;
168 0           my $in_range= 1; # flag turns false if we pass the bottom of all sheets
169 0           row_loop: while ($in_range) {
170 0           $in_range= 0;
171 0           for my $sheet (@sheets) {
172 0 0         $log->trace("row $row sheet $sheet") if $log->is_trace;
173 0           my %field_found;
174 0           my ($rmin, $rmax)= $sheet->row_range();
175 0           my ($cmin, $cmax)= $sheet->col_range();
176 0 0 0       next unless $row >= $rmin && $row <= $rmax;
177 0           $in_range++;
178 0 0         my @row_vals= map { my $c= $sheet->get_cell($row, $_); $c? $c->value : '' } 0..$cmax;
  0            
  0            
179 0           my $match_count= grep { $_ =~ $header_regex } @row_vals;
  0            
180 0           $log->trace("str=@row_vals, regex=$header_regex, match_count=$match_count");
181 0 0         if ($match_count >= $num_required_fields) {
182 0           my $field_col= $self->_resolve_field_columns($sheet, $row, \@row_vals);
183 0 0         if ($field_col) {
184 0           $location= {
185             sheet => $sheet,
186             header_row => $row,
187             min_row => $row+1,
188             field_col => $field_col,
189             };
190 0           last row_loop;
191             }
192             }
193             }
194 0           ++$row;
195             }
196            
197 0 0         return '' unless defined $location;
198            
199             # Calculate a few more fields for location
200 0           my @cols_used= sort { $a <=> $b } values %{ $location->{field_col} };
  0            
  0            
201 0           $location->{min_col}= $cols_used[0];
202 0           $location->{max_col}= $cols_used[-1];
203            
204             # Maybe should look for the last row containing data for our columns, but that seems expensive...
205 0           $location->{max_row}= ($location->{sheet}->row_range())[1];
206            
207 0           $location->{start_cell}= _cell_name($location->{min_row}, $location->{min_col});
208 0           $location->{end_cell}= _cell_name($location->{min_col}, $location->{max_col});
209 0           $self->_table_location($location);
210            
211 0           return 1;
212             }
213              
214             sub _resolve_field_columns {
215 0     0     my ($self, $sheet, $row, $row_vals)= @_;
216 0           my %col_map;
217             my %field_found;
218 0           my $fields= $self->fields;
219            
220             # Try each cell to see if it matches each field's header
221 0           for my $col (0..$#$row_vals) {
222 0           my $v= $row_vals->[$col];
223 0 0 0       next unless defined $v and length $v;
224 0           for my $field (@$fields) {
225 0 0         push @{ $field_found{$field->name} }, $col
  0            
226             if $v =~ $field->header_regex;
227             }
228             }
229            
230             # Is there one and only one mapping of fields to columns?
231 0           my $ambiguous= 0;
232 0           my @todo= @$fields;
233 0           while (@todo) {
234 0           my $field= shift @todo;
235 0 0         next unless defined $field_found{$field->name};
236 0           my $possible= $field_found{$field->name};
237 0           my @available= grep { !defined $col_map{$_} } @$possible;
  0            
238 0           $log->trace("ambiguous=$ambiguous : field ".$field->name." could be ".join(',', map { _cell_name($row,$_) } @$possible)
239 0           ." and ".join(',', map { _cell_name($row,$_) } @available)." are available");
  0            
240 0 0         if (!@available) {
    0          
241             # It is possible that two fields claim the same columns and one is required
242 0 0         if ($field->required) {
243 0           my $col= $possible->[0];
244 0 0         $log->debug("Field ".$field->name." and ".$col_map{$col}." would both claim "._cell_name($row, $col))
245             if $log->is_debug;
246 0           return;
247             }
248             }
249             elsif (@available > 1) {
250             # It is possible for a field to match more than one column.
251             # If so, we send it to the back of the list in case another more specific
252             # column claims one of the options.
253 0 0         if (++$ambiguous > @todo) {
254 0 0         $log->debug("Can't decide between ".join(', ', map { _cell_name($row,$_) } @available)." for field ".$field->name)
  0            
255             if $log->is_debug;
256 0           return;
257             }
258 0           push @todo, $field;
259             }
260             else {
261 0           $col_map{$available[0]}= $field->name;
262 0           $ambiguous= 0; # made progress, start counting over again
263             }
264             }
265             # Success! convert the col map to an array of col-index-per-field
266 0 0         $log->debug("Found headers at ".join(' ', map { _cell_name($row,$_) } sort keys %col_map))
  0            
267             if $log->is_debug;
268 0           return { reverse %col_map };
269             }
270              
271              
272             sub table_location {
273 0     0 1   my ($self)= @_;
274 0 0         return undef unless defined $self->{_table_location};
275             # Deep-clone the location
276 0           my %loc= %{ $self->_table_location };
  0            
277 0           $loc{field_col}= { %{ $loc{field_col} } };
  0            
278 0           return \%loc;
279             }
280              
281              
282             sub record_count {
283 0     0 1   my $self= shift;
284 0 0         return 0 unless defined $self->_table_location;
285 0           return $self->_table_location->{max_row} - $self->_table_location->{min_row} + 1;
286             }
287              
288              
289             sub records {
290 0     0 1   my ($self, %opts)= @_;
291 0           my $i= $self->iterator(%opts);
292 0           my @records;
293 0           while (my $r= $i->()) { push @records, $r; }
  0            
294 0           return \@records;
295             }
296              
297              
298 0     0 1   sub record_arrays { shift->records(as => 'array', @_) }
299              
300              
301             our %_Iterators;
302              
303             sub iterator {
304 0     0 1   my ($self, %opts)= @_;
305 0           my ($as, $blank_row, $on_error)= delete @opts{'as','blank_row','on_error'};
306 0 0         croak "Unknown option(s) to iterator: ".join(', ', keys %opts)
307             if keys %opts;
308            
309 0 0         $as= 'hash' unless defined $as;
310 0           my $hash= ($as eq 'hash');
311            
312 0 0         $blank_row= 'end' unless defined $blank_row;
313 0           my $skip_blank_row= ($blank_row eq 'skip');
314 0           my $end_blank_row= ($blank_row eq 'end');
315            
316 0           my $sheet= $self->_table_location->{sheet};
317 0           my $min_row= $self->_table_location->{min_row};
318 0           my $row= $min_row - 1;
319 0           my $col;
320 0           my $min_col= $self->_table_location->{min_col};
321 0           my $remaining= $self->_table_location->{max_row} - $self->_table_location->{min_row} + 1;
322 0           my $is_blank_row;
323 0           my %field_col= %{ $self->_table_location->{field_col} };
  0            
324 0           my (@result_keys, @field_extractors, @validations);
325 0           for my $field ($self->field_list) {
326 0           my $blank= $field->blank;
327 0           my $src_col= $field_col{$field->name};
328            
329             # Don't need an extractor for fields not found in the table if result type is hash,
330             # but if result type is array we need to pad with a null value
331 0 0         if (!defined $src_col) {
332 0 0   0     $hash or push @field_extractors, sub { undef; };
  0            
333 0           next;
334             }
335            
336 0 0         push @result_keys, $field->name if $hash;
337            
338             # If trimming, use a different implementation than if not, for a little efficiency
339             push @field_extractors, $field->trim?
340             sub {
341 0     0     my $v= $sheet->get_cell($row, $src_col);
342 0 0         return $blank unless defined $v;
343 0           $v= $v->value;
344 0           $v =~ s/^\s*(.*?)\s*$/$1/;
345 0 0         return $blank unless length $v;
346 0           $is_blank_row= 0;
347 0           $v;
348             }
349             :
350             sub {
351 0     0     my $v= $sheet->get_cell($row, $src_col);
352 0 0 0       defined $v && length($v= $v->value)
353             or return $blank;
354 0           $is_blank_row= 0;
355 0           $v;
356 0 0         };
357            
358 0 0         if (defined (my $type= $field->type)) {
359             # This sub will access the values array at the same position as the current field_extractor
360 0           my $idx= $#field_extractors;
361             push @validations, sub {
362 0 0   0     return if $type->check($_[0][$idx]);
363 0           $col= $src_col; # so the iterator->col reports the column of the error
364 0           croak "Not a ".$type->name." at cell "._cell_name($row, $col);
365 0           };
366             }
367             }
368            
369             # Closure over everything, for very fast access
370             my $sub= sub {
371 0     0     $log->trace("iterator: remaining=$remaining row=$row sheet=$sheet");
372 0 0         again:
373             return unless $remaining > 0;
374 0           ++$row;
375 0           $col= $min_col;
376 0           --$remaining;
377 0           $is_blank_row= 1; # This var is closured, and gets set to 0 by the next line
378 0           my @values= map { $_->() } @field_extractors;
  0            
379 0 0 0       goto again if $skip_blank_row && $is_blank_row;
380 0 0 0       if ($end_blank_row && $is_blank_row) {
381 0           $remaining= 0;
382 0           return;
383             }
384 0           $_->(\@values) for @validations; # This can die. It can also be an empty list.
385 0 0         return $hash? do { my %r; @r{@result_keys}= @values; \%r } : \@values;
  0            
  0            
  0            
386 0           };
387            
388             # Blessed coderef, so we can call methods on it
389 0           bless $sub, 'Spreadsheet::ExcelTableReader::Iterator';
390            
391             # Store references to all the closered variables so the methods can access them
392             $_Iterators{$sub}= {
393             r_sheet => \$sheet,
394             r_row => \$row,
395             r_col => \$col,
396             r_remaining => \$remaining,
397             min_row => $self->_table_location->{min_row},
398             max_row => $self->_table_location->{max_row},
399 0           };
400            
401 0           return $sub;
402             }
403              
404             package Spreadsheet::ExcelTableReader::Iterator;
405              
406 0     0     sub DESTROY { delete $_Iterators{$_[0]}; }
407 0     0     sub sheet { ${ $_Iterators{$_[0]}{r_sheet} } }
  0            
408 0     0     sub col { ${ $_Iterators{$_[0]}{r_col} } }
  0            
409 0     0     sub row { ${ $_Iterators{$_[0]}{r_row} } }
  0            
410 0     0     sub remaining { ${ $_Iterators{$_[0]}{r_remaining} } }
  0            
411              
412             sub rewind {
413 0     0     my $self= $_Iterators{$_[0]};
414 0           ${$self->{r_row}}= $self->{min_row} - 1;
  0            
415 0           ${$self->{r_remaining}}= $self->{max_row} - $self->{min_row} + 1;
  0            
416 0           return 1;
417             }
418              
419             1;
420              
421             __END__