File Coverage

blib/lib/Spreadsheet/ExcelTableReader.pm
Criterion Covered Total %
statement 215 266 80.8
branch 69 130 53.0
condition 19 36 52.7
subroutine 24 33 72.7
pod 7 8 87.5
total 334 473 70.6


line stmt bran cond sub pod time code
1             package Spreadsheet::ExcelTableReader;
2 4     4   6390 use Moo 2;
  4         57850  
  4         26  
3 4     4   11490 use Spreadsheet::ParseExcel;
  4         288958  
  4         150  
4 4     4   57 use Spreadsheet::ParseExcel::Utility 'int2col';
  4         6  
  4         216  
5 4     4   3290 use Spreadsheet::XLSX;
  4         346864  
  4         164  
6 4     4   877 use Log::Any '$log';
  4         5514  
  4         35  
7 4     4   9894 use Spreadsheet::ExcelTableReader::Field;
  4         15  
  4         141  
8 4     4   30 use Carp 'croak';
  4         8  
  4         199  
9 4     4   21 use IO::Handle;
  4         6  
  4         13658  
10              
11             our $VERSION= '0.000001_002';
12              
13             # ABSTRACT: Module to extract a table from somewhere within an Excel spreadsheet
14              
15              
16             has file => ( is => 'ro' );
17              
18              
19             has sheet => ( is => 'ro' );
20              
21             # Arrayref of all sheets we can search
22             has _sheets => ( is => 'lazy' );
23              
24              
25             has fields => ( is => 'ro', required => 1, coerce => \&_coerce_field_list );
26 17     17 1 582 sub field_list { @{ shift->fields } }
  17         71  
27              
28              
29             has find_table_args => ( is => 'rw' );
30              
31             has _table_location => ( is => 'rw', lazy_build => 1 );
32              
33             sub _build__sheets {
34 21     21   1853 my $self= shift;
35            
36             # If we have ->sheet and it is a worksheet object, then no need to do anything else
37 21 50 66     204 if ($self->sheet && ref($self->sheet) && ref($self->sheet)->can('get_cell')) {
      66        
38 9         195 return [ $self->sheet ];
39             }
40            
41             # Else we need to scan sheets from the excel file. Make sure we have the file
42 12         41 my $wbook= $self->_open_workbook($self->file);
43 12         72 my @sheets= $wbook->worksheets;
44 12 50       98 @sheets or croak "No worksheets in file?";
45 12 50       57 if (defined $self->sheet) {
46 0 0       0 if (ref($self->sheet) eq 'Regexp') {
    0          
    0          
47 0         0 @sheets= grep { $_->get_name =~ $self->sheet } @sheets;
  0         0  
48             } elsif (ref($self->sheet) eq 'CODE') {
49 0         0 @sheets= grep { $self->sheet->($_) } @sheets;
  0         0  
50             } elsif (!ref $self->sheet) {
51 0         0 @sheets= grep { $_->get_name eq $self->sheet } @sheets;
  0         0  
52             } else {
53 0         0 croak "Unknown type of sheet specification: ".$self->sheet;
54             }
55             }
56 12 50       28 @sheets or croak "No worksheets match the specification";
57            
58 12         398 return \@sheets;
59             }
60              
61             sub _open_workbook {
62 12     12   54 my ($self, $f)= @_;
63            
64 12 50       33 defined $f or croak "workbook file is undefined";
65            
66 12         16 my $wbook;
67 12 50 66     92 if (ref($f) && ref($f)->can('worksheets')) {
68 0         0 $wbook= $f;
69             } else {
70 12         15 my $type= "xlsx";
71            
72             # Probe the file to determine type
73 12 100 100     352 if (ref($f) eq 'GLOB' or ref($f) && ref($f)->can('read')) {
    50 66        
74 4         27 my $fpos= $f->tell;
75 4 50       37 $fpos >= 0 or croak "File handle must be seekable";
76 4 50       23 $f->read(my $buf, 4) == 4 or croak "read($f,4): $!";
77 4 50       130 $f->seek($fpos, 0) or croak "failed to seek back to start of file";
78 4 100       45 $type= 'xls' if $buf eq "\xD0\xCF\x11\xE0";
79             }
80             elsif (-e $f) {
81 8         92 $f= "$f"; # force stringification
82 8 50       377 open my $fh, '<', $f or croak "open($f): $!";
83 8 50       163 read($fh, my $buf, 4) == 4 or croak "read($f,4): $!";
84 8 100       282 $type= 'xls' if $buf eq "\xD0\xCF\x11\xE0";
85             }
86             else {
87 0 0       0 $log->notice("Can't determine parser for '$f', guessing '$type'") if $log->is_notice;
88             }
89            
90 12 100       32 if ($type eq 'xlsx') {
91             # Spreadsheet::XLSX uses Archive::Zip which can *only* work on IO::Handle
92             # instances, not plain globrefs. (seems like a bug, hm)
93 6 100       24 if (ref($f) eq 'GLOB') {
94 1         8 require IO::File;
95 1         9 my $f_obj= IO::File->new;
96 1 50       49 $f_obj->fdopen($f, 'r') or croak "Can't convert GLOBref to IO::File";
97 1         78 $f= $f_obj;
98             }
99 6         51 $wbook= Spreadsheet::XLSX->new($f);
100             } else {
101 6         50 $wbook= Spreadsheet::ParseExcel->new->parse($f);
102             }
103 12 50       317956 defined $wbook or croak "Can't parse file '".$self->file."'";
104             }
105 12         39 return $wbook;
106             }
107              
108             sub _build__table_location {
109 0     0   0 my $self= shift;
110 0         0 my $args= $self->find_table_args;
111 0 0       0 $self->find_table( !$args? () : (ref($args) eq 'ARRAY')? @$args : %$args )
    0          
    0          
112             or croak "No match for table header in excel file";
113 0         0 $self->{_table_location}; # find_table sets the value already, in a slight violation of this builder method.
114             }
115              
116             sub _coerce_field_list {
117 21     21   44840 my ($list)= @_;
118 21 50 33     160 defined $list and ref $list eq 'ARRAY' or croak "'fields' must be a non-empty arrayref";
119 21         59 my @list= @$list; # clone it, to make sure we don't unexpectedly alter the caller's data
120 21         45 for (@list) {
121 81 100       602 if (!ref $_) {
    50          
122 61         2277 $_= Spreadsheet::ExcelTableReader::Field->new(
123             name => $_,
124             header => qr/^\s*\Q$_\E\s*$/i,
125             );
126             } elsif (ref $_ eq 'HASH') {
127 20         69 my %args= %$_;
128             # "isa" alias for the 'type' attribute
129 20 50 33     64 $args{type}= delete $args{isa} if defined $args{isa} && !defined $args{type};
130             # default header to field name with optional whitespace
131 20 100       203 $args{header}= qr/^\s*\Q$args{name}\E\s*$/i unless defined $args{header};
132 20         451 $_= Spreadsheet::ExcelTableReader::Field->new( %args )
133             } else {
134 0         0 croak "Can't coerce '$_' to a Field object"
135             }
136             }
137 21         532 return \@list;
138             }
139              
140              
141             sub BUILD {
142 21     21 0 376 my $self= shift;
143             # Any errors getting the list of searchable worksheets should happen now, during construction time
144 21         359 $self->_sheets;
145             }
146              
147              
148             sub _cell_name {
149 243     243   348 my ($row, $col)= @_;
150 243         513 return int2col($col).($row+1);
151             }
152              
153             sub find_table {
154 12     12 1 127 my $self= shift;
155            
156 12         16 my $location;
157 12         17 my @sheets= @{$self->_sheets};
  12         233  
158 12         108 my @fields= $self->field_list;
159 12         24 my $num_required_fields= grep { $_->required } @fields;
  47         100  
160            
161             # Algorithm is O(N^4) in worst case, but the regex should make it more like O(N^2) in most
162             # real world cases. The worst case would be if every row of every sheet of the workbook almost
163             # matched the header row (which could happen with extremely lax field header patterns)
164 12         16 my $header_regex= qr/(?:@{[ join('|', map { $_->header_regex } @fields) ]})/ms;
  12         20  
  47         869  
165              
166             # Scan top-down across all sheets at once, since headers are probably at the top of the document.
167 12         33 my $row= 0;
168 12         20 my $in_range= 1; # flag turns false if we pass the bottom of all sheets
169 12         29 row_loop: while ($in_range) {
170 14         18 $in_range= 0;
171 14         28 for my $sheet (@sheets) {
172 14 50       63 $log->trace("row $row sheet $sheet") if $log->is_trace;
173 14         136 my %field_found;
174 14         45 my ($rmin, $rmax)= $sheet->row_range();
175 14         104 my ($cmin, $cmax)= $sheet->col_range();
176 14 100 66     136 next unless $row >= $rmin && $row <= $rmax;
177 12         14 $in_range++;
178 12 50       29 my @row_vals= map { my $c= $sheet->get_cell($row, $_); $c? $c->value : '' } 0..$cmax;
  53         265  
  53         514  
179 12         74 my $match_count= grep { $_ =~ $header_regex } @row_vals;
  53         235  
180 12         74 $log->trace("str=@row_vals, regex=$header_regex, match_count=$match_count");
181 12 50       262 if ($match_count >= $num_required_fields) {
182 12         38 my $field_col= $self->_resolve_field_columns($sheet, $row, \@row_vals);
183 12 100       40 if ($field_col) {
184 10         40 $location= {
185             sheet => $sheet,
186             header_row => $row,
187             min_row => $row+1,
188             field_col => $field_col,
189             };
190 10         39 last row_loop;
191             }
192             }
193             }
194 4         9 ++$row;
195             }
196            
197 12 100       41 return '' unless defined $location;
198            
199             # Calculate a few more fields for location
200 10         14 my @cols_used= sort { $a <=> $b } values %{ $location->{field_col} };
  53         102  
  10         55  
201 10         23 $location->{min_col}= $cols_used[0];
202 10         19 $location->{max_col}= $cols_used[-1];
203            
204             # Maybe should look for the last row containing data for our columns, but that seems expensive...
205 10         38 $location->{max_row}= ($location->{sheet}->row_range())[1];
206            
207 10         70 $location->{start_cell}= _cell_name($location->{min_row}, $location->{min_col});
208 10         122 $location->{end_cell}= _cell_name($location->{min_col}, $location->{max_col});
209 10         122 $self->_table_location($location);
210            
211 10         45 return 1;
212             }
213              
214             sub _resolve_field_columns {
215 12     12   20 my ($self, $sheet, $row, $row_vals)= @_;
216 12         15 my %col_map;
217             my %field_found;
218 12         33 my $fields= $self->fields;
219            
220             # Try each cell to see if it matches each field's header
221 12         33 for my $col (0..$#$row_vals) {
222 53         344 my $v= $row_vals->[$col];
223 53 50 33     228 next unless defined $v and length $v;
224 53         86 for my $field (@$fields) {
225 224 100       5103 push @{ $field_found{$field->name} }, $col
  68         792  
226             if $v =~ $field->header_regex;
227             }
228             }
229            
230             # Is there one and only one mapping of fields to columns?
231 12         44 my $ambiguous= 0;
232 12         35 my @todo= @$fields;
233 12         31 while (@todo) {
234 61         90 my $field= shift @todo;
235 61 100       182 next unless defined $field_found{$field->name};
236 60         110 my $possible= $field_found{$field->name};
237 60         97 my @available= grep { !defined $col_map{$_} } @$possible;
  125         329  
238 125         711 $log->trace("ambiguous=$ambiguous : field ".$field->name." could be ".join(',', map { _cell_name($row,$_) } @$possible)
239 60         191 ." and ".join(',', map { _cell_name($row,$_) } @available)." are available");
  98         983  
240 60 50       1420 if (!@available) {
    100          
241             # It is possible that two fields claim the same columns and one is required
242 0 0       0 if ($field->required) {
243 0         0 my $col= $possible->[0];
244 0 0       0 $log->debug("Field ".$field->name." and ".$col_map{$col}." would both claim "._cell_name($row, $col))
245             if $log->is_debug;
246 0         0 return;
247             }
248             }
249             elsif (@available > 1) {
250             # It is possible for a field to match more than one column.
251             # If so, we send it to the back of the list in case another more specific
252             # column claims one of the options.
253 16 100       38 if (++$ambiguous > @todo) {
254 2 50       7 $log->debug("Can't decide between ".join(', ', map { _cell_name($row,$_) } @available)." for field ".$field->name)
  0         0  
255             if $log->is_debug;
256 2         22 return;
257             }
258 14         47 push @todo, $field;
259             }
260             else {
261 44         120 $col_map{$available[0]}= $field->name;
262 44         139 $ambiguous= 0; # made progress, start counting over again
263             }
264             }
265             # Success! convert the col map to an array of col-index-per-field
266 10 50       31 $log->debug("Found headers at ".join(' ', map { _cell_name($row,$_) } sort keys %col_map))
  0         0  
267             if $log->is_debug;
268 10         136 return { reverse %col_map };
269             }
270              
271              
272             sub table_location {
273 6     6 1 31 my ($self)= @_;
274 6 50       15 return undef unless defined $self->{_table_location};
275             # Deep-clone the location
276 6         8 my %loc= %{ $self->_table_location };
  6         56  
277 6         11 $loc{field_col}= { %{ $loc{field_col} } };
  6         25  
278 6         28 return \%loc;
279             }
280              
281              
282             sub record_count {
283 0     0 1 0 my $self= shift;
284 0 0       0 return 0 unless defined $self->_table_location;
285 0         0 return $self->_table_location->{max_row} - $self->_table_location->{min_row} + 1;
286             }
287              
288              
289             sub records {
290 4     4 1 13 my ($self, %opts)= @_;
291 4         17 my $i= $self->iterator(%opts);
292 4         8 my @records;
293 4         9 while (my $r= $i->()) { push @records, $r; }
  12         32  
294 4         18 return \@records;
295             }
296              
297              
298 4     4 1 24 sub record_arrays { shift->records(as => 'array', @_) }
299              
300              
301             our %_Iterators;
302              
303             sub iterator {
304 4     4 1 9 my ($self, %opts)= @_;
305 4         15 my ($as, $blank_row, $on_error)= delete @opts{'as','blank_row','on_error'};
306 4 50       14 croak "Unknown option(s) to iterator: ".join(', ', keys %opts)
307             if keys %opts;
308            
309 4 50       11 $as= 'hash' unless defined $as;
310 4         5 my $hash= ($as eq 'hash');
311            
312 4 50       10 $blank_row= 'end' unless defined $blank_row;
313 4         7 my $skip_blank_row= ($blank_row eq 'skip');
314 4         8 my $end_blank_row= ($blank_row eq 'end');
315            
316 4         9 my $sheet= $self->_table_location->{sheet};
317 4         10 my $min_row= $self->_table_location->{min_row};
318 4         5 my $row= $min_row - 1;
319 4         4 my $col;
320 4         10 my $min_col= $self->_table_location->{min_col};
321 4         10 my $remaining= $self->_table_location->{max_row} - $self->_table_location->{min_row} + 1;
322 4         4 my $is_blank_row;
323 4         6 my %field_col= %{ $self->_table_location->{field_col} };
  4         19  
324 4         8 my (@result_keys, @field_extractors, @validations);
325 4         10 for my $field ($self->field_list) {
326 16         27 my $blank= $field->blank;
327 16         34 my $src_col= $field_col{$field->name};
328            
329             # Don't need an extractor for fields not found in the table if result type is hash,
330             # but if result type is array we need to pad with a null value
331 16 50       33 if (!defined $src_col) {
332 0 0   0   0 $hash or push @field_extractors, sub { undef; };
  0         0  
333 0         0 next;
334             }
335            
336 16 50       31 push @result_keys, $field->name if $hash;
337            
338             # If trimming, use a different implementation than if not, for a little efficiency
339             push @field_extractors, $field->trim?
340             sub {
341 24     24   63 my $v= $sheet->get_cell($row, $src_col);
342 24 50       233 return $blank unless defined $v;
343 24         52 $v= $v->value;
344 24         156 $v =~ s/^\s*(.*?)\s*$/$1/;
345 24 50       55 return $blank unless length $v;
346 24         26 $is_blank_row= 0;
347 24         53 $v;
348             }
349             :
350             sub {
351 24     24   61 my $v= $sheet->get_cell($row, $src_col);
352 24 50 33     270 defined $v && length($v= $v->value)
353             or return $blank;
354 24         148 $is_blank_row= 0;
355 24         56 $v;
356 16 100       87 };
357            
358 16 50       59 if (defined (my $type= $field->type)) {
359             # This sub will access the values array at the same position as the current field_extractor
360 0         0 my $idx= $#field_extractors;
361             push @validations, sub {
362 0 0   0   0 return if $type->check($_[0][$idx]);
363 0         0 $col= $src_col; # so the iterator->col reports the column of the error
364 0         0 croak "Not a ".$type->name." at cell "._cell_name($row, $col);
365 0         0 };
366             }
367             }
368            
369             # Closure over everything, for very fast access
370             my $sub= sub {
371 16     16   102 $log->trace("iterator: remaining=$remaining row=$row sheet=$sheet");
372 16 100       252 again:
373             return unless $remaining > 0;
374 12         13 ++$row;
375 12         17 $col= $min_col;
376 12         13 --$remaining;
377 12         14 $is_blank_row= 1; # This var is closured, and gets set to 0 by the next line
378 12         20 my @values= map { $_->() } @field_extractors;
  48         80  
379 12 50 33     34 goto again if $skip_blank_row && $is_blank_row;
380 12 50 33     54 if ($end_blank_row && $is_blank_row) {
381 0         0 $remaining= 0;
382 0         0 return;
383             }
384 12         23 $_->(\@values) for @validations; # This can die. It can also be an empty list.
385 12 50       43 return $hash? do { my %r; @r{@result_keys}= @values; \%r } : \@values;
  0         0  
  0         0  
  0         0  
386 4         19 };
387            
388             # Blessed coderef, so we can call methods on it
389 4         17 bless $sub, 'Spreadsheet::ExcelTableReader::Iterator';
390            
391             # Store references to all the closered variables so the methods can access them
392             $_Iterators{$sub}= {
393             r_sheet => \$sheet,
394             r_row => \$row,
395             r_col => \$col,
396             r_remaining => \$remaining,
397             min_row => $self->_table_location->{min_row},
398             max_row => $self->_table_location->{max_row},
399 4         38 };
400            
401 4         17 return $sub;
402             }
403              
404             package Spreadsheet::ExcelTableReader::Iterator;
405              
406 4     4   78 sub DESTROY { delete $_Iterators{$_[0]}; }
407 0     0     sub sheet { ${ $_Iterators{$_[0]}{r_sheet} } }
  0            
408 0     0     sub col { ${ $_Iterators{$_[0]}{r_col} } }
  0            
409 0     0     sub row { ${ $_Iterators{$_[0]}{r_row} } }
  0            
410 0     0     sub remaining { ${ $_Iterators{$_[0]}{r_remaining} } }
  0            
411              
412             sub rewind {
413 0     0     my $self= $_Iterators{$_[0]};
414 0           ${$self->{r_row}}= $self->{min_row} - 1;
  0            
415 0           ${$self->{r_remaining}}= $self->{max_row} - $self->{min_row} + 1;
  0            
416 0           return 1;
417             }
418              
419             1;
420              
421             __END__