File Coverage

blib/lib/Excel/ValueReader/XLSX.pm
Criterion Covered Total %
statement 108 111 97.3
branch 28 40 70.0
condition 6 11 54.5
subroutine 17 17 100.0
pod 3 5 60.0
total 162 184 88.0


line stmt bran cond sub pod time code
1             package Excel::ValueReader::XLSX;
2 1     1   179625 use utf8;
  1         2  
  1         7  
3 1     1   533 use Moose;
  1         660258  
  1         7  
4 1     1   7894 use Module::Load qw/load/;
  1         2  
  1         10  
5 1     1   605 use Date::Calc qw/Add_Delta_Days/;
  1         6793  
  1         134  
6 1     1   8 use POSIX qw/strftime modf/;
  1         3  
  1         7  
7 1     1   1554 use Carp qw/croak/;
  1         2  
  1         42  
8 1     1   7 use feature 'state';
  1         2  
  1         1953  
9              
10             our $VERSION = '1.09';
11              
12             #======================================================================
13             # ATTRIBUTES
14             #======================================================================
15              
16             # PUBLIC ATTRIBUTES
17             has 'xlsx' => (is => 'ro', isa => 'Str', required => 1); # path of xlsx file
18             has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
19             has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
20             has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
21             has 'datetime_format' => (is => 'ro', isa => 'Str',
22             builder => '_datetime_format', lazy => 1);
23             has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]',
24             builder => '_date_formatter', lazy => 1);
25              
26              
27              
28             # ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
29             has 'backend' => (is => 'ro', isa => 'Object', init_arg => undef,
30             builder => '_backend', lazy => 1,
31             handles => [qw/values base_year sheets/]);
32              
33             #======================================================================
34             # BUILDING
35             #======================================================================
36              
37             # syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
38             around BUILDARGS => sub {
39             my $orig = shift;
40             my $class = shift;
41              
42             if ( @_ == 1 && !ref $_[0] ) {
43             return $class->$orig(xlsx => $_[0]);
44             }
45             else {
46             return $class->$orig(@_);
47             }
48             };
49              
50              
51              
52             #======================================================================
53             # ATTRIBUTE CONSTRUCTORS
54             #======================================================================
55              
56              
57             sub _backend {
58 12     12   26 my $self = shift;
59              
60 12         318 my $backend_class = ref($self) . '::Backend::' . $self->using;
61 12         66 load $backend_class;
62              
63 12         945 return $backend_class->new(frontend => $self);
64             }
65              
66             sub _datetime_format {
67 10     10   20 my ($self) = @_;
68 10         236 return $self->date_format . ' ' . $self->time_format;
69             }
70              
71             sub _date_formatter {
72 10     10   26 my ($self) = @_;
73              
74             # local copies of the various formats so that we can build a closure
75 10         261 my @formats = (undef, # 0 -- error
76             $self->date_format, # 1 -- just a date
77             $self->time_format, # 2 -- just a time
78             $self->datetime_format); # 3 -- date and time
79              
80             my $strftime_formatter = sub {
81 156     156   363 my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
82              
83             # choose the proper format for strftime
84 156         197 my $ix = 0; # index into the @formats array
85 156 100       462 $ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
86 156 100       304 $ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
87 156 50       293 my $strftime_format = $formats[$ix]
88             or die "cell with unexpected Excel date format : $xl_date_format";
89              
90             # formatting through strftime
91 156         4849 my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
92              
93 156         602 return $formatted_date;
94 10         57 };
95              
96 10         260 return $strftime_formatter;
97             }
98              
99              
100             #======================================================================
101             # GENERAL METHODS
102             #======================================================================
103              
104              
105             sub sheet_names {
106 2     2 1 4026 my ($self) = @_;
107              
108 2         13 my $sheets = $self->sheets; # arrayref of shape {$name => $sheet_position}
109              
110 2         16 my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
  18         56  
111 2         11 return @sorted_names;
112             }
113              
114              
115             sub A1_to_num { # convert Excel A1 reference format to a number
116 2404     2404 0 25250 my ($self, $string) = @_;
117              
118             # ordinal number for character just before 'A'
119 2404         3384 state $base = ord('A') - 1;
120              
121             # iterate on 'digits' (letters of the A1 cell reference)
122 2404         3197 my $num = 0;
123 2404         5457 foreach my $digit (map {ord($_) - $base} split //, $string) {
  2404         6646  
124 2404         4486 $num = $num*26 + $digit;
125             }
126              
127 2404         7619 return $num;
128             }
129              
130              
131             sub formatted_date {
132 156     156 0 16606 my ($self, $val, $date_format, $date_formatter) = @_;
133              
134             # separate date (integer part) from time (fractional part)
135 156         555 my ($time, $n_days) = modf($val);
136              
137             # Convert $n_days into a date in Date::Calc format (year, month, day).
138             # The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
139             # in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
140             # Excel treats 1900 as a leap year.
141 156         482 my $base_year = $self->base_year;
142 156 100       321 if ($base_year == 1900) {
143 114         261 my $is_after_february_1900 = $n_days > 60;
144 114 100       379 $n_days -= $is_after_february_1900 ? 2 : 1;
145             }
146 156         515 my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
147              
148             # decode the fractional part (the time) into hours, minutes, seconds, milliseconds
149 156         217 my @t;
150 156         240 foreach my $subdivision (24, 60, 60, 1000) {
151 624         787 $time *= $subdivision;
152 624         1124 ($time, my $time_portion) = modf($time);
153 624         858 push @t, $time_portion;
154             }
155              
156             # dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
157 156         393 my ($h, $m, $s, $ms) = @t;
158 156 100       276 if ($ms == 999) {
159 2         7 $s += 1, $ms = 0;
160 2 50       8 if ($s == 60) {
161 0         0 $m += 1, $s = 0;
162 0 0       0 if ($m == 60) {
163 0         0 $h += 1, $m = 0;
164             }
165             }
166             }
167             # NOTE : because of this hack, theoretically we could end up with a value
168             # like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
169             # in its rendering.
170              
171             # call the date_formatter subroutine
172 156 50 33     4487 $date_formatter //= $self->date_formatter
173             or die ref($self) . " has no date_formatter subroutine";
174 156         423 my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
175              
176 156         612 return $formatted_date;
177             }
178              
179             #======================================================================
180             # METHODS FOR PARSING EXCEL TABLES
181             #======================================================================
182              
183              
184             sub table_names {
185 2     2 1 9095 my ($self) = @_;
186              
187 2         73 my $table_info = $self->backend->table_info;
188              
189             # sort on table id (field [1] in table_info arrayrefs)
190 2         15 my @table_names = sort {$table_info->{$a}[1] <=> $table_info->{$b}->[1]} keys %$table_info;
  16         40  
191              
192 2         17 return @table_names;
193             }
194              
195              
196             # info fields returned from the backend parsing methods
197             my @table_info_fields = qw/sheet table_id ref columns no_headers/;
198              
199              
200             # the same fields are also the valid args for the method call
201             my $is_valid_arg = "^(" . join("|", @table_info_fields) . ")\$";
202              
203             sub table {
204 10     10 1 9061 my $self = shift;
205              
206             # syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
207 10 50       49 my %args = @_ == 1 ? (name => $_[0]) : @_;
208              
209             # if called with a table name, derive all other args from internal workbook info
210 10 50       48 if (my $table_name = delete $args{name}) {
211 10   50     77 !$args{$_} or croak "table() : arg '$_' is incompatible with 'name'" for @table_info_fields;
212 10 50       120 @args{@table_info_fields} = @{$self->backend->table_info->{$table_name}}
  10         482  
213             or croak "no table info for table: $table_name";
214             }
215              
216             # check args
217 10         34 my @invalid_args = grep {!/$is_valid_arg/} keys %args;
  50         587  
218 10 50       31 croak "invalid args to table(): " . join ", ", @invalid_args if @invalid_args;
219              
220             # get raw values from the sheet
221 10         60 my $values = $self->values($args{sheet});
222              
223             # restrict values to the table subrange (if applicable)
224 10 50       225 $values = $self->_subrange($values, $args{ref}) if $args{ref};
225              
226             # take headers from first row if not already given in $args{columns}
227 10   33     33 $args{columns} //= $values->[0];
228              
229             # if this table has headers (which is almost always the case), drop the header row
230 10 100       26 shift @$values unless $args{no_headers};
231              
232             # build a table of hashes. This could be done with a simple map(), but using a loop
233             # avoids to store 2 copies of cell values in memory : @$values is shifted when @table is pushed.
234 10         35 my @cols = @{$args{columns}};
  10         31  
235 10 50       22 croak "table contains undefined columns" if grep {!defined $_} @cols;
  32         66  
236 10         21 my @rows;
237 10         27 while (my $vals = shift @$values) {
238 150         164 my %row;
239 150         574 @row{@cols} = @$vals;
240 150         435 push @rows, \%row;
241             }
242              
243             # in scalar context, just return the rows. In list context, also return the column names
244 10 100       74 return wantarray ? (\@cols, \@rows) : \@rows;
245             }
246              
247              
248              
249             sub _subrange {
250 10     10   32 my ($self, $values, $ref) = @_;
251              
252             # parse rows and columns from the $ref string (of shape like for example "A1:D34")
253 10 50       101 my ($col1, $row1, $col2, $row2) = $ref =~ /^([A-Z]+)(\d+):([A-Z]+)(\d+)$/
254             or croak "_subrange : invalid ref: $ref";
255              
256             # restrict to the row range
257 10 100 100     55 if ($row1 > 1 || $row2 < @$values){
258 8         32 $values = [ @$values[$row1-1 .. $row2-1] ];
259             }
260              
261             # restrict to the column range
262 10         24 my @col_nums = map {$self->A1_to_num($_) - 1} ($col1, $col2);
  20         40  
263 10 100       27 if ($col_nums[0] > 1){ # THINK : should check if $colnum2 is smaller that the max row size ??
264 6         12 my @col_range = ($col_nums[0] .. $col_nums[1]);
265 6         13 $values = [map { [ @$_[@col_range] ]} @$values];
  16         44  
266             }
267              
268 10         42 return $values;
269             }
270              
271              
272             1;
273              
274              
275             __END__
276              
277             =head1 NAME
278              
279              
280             Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
281              
282             =head1 SYNOPSIS
283              
284             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename);
285             # .. or with syntactic sugar :
286             my $reader = Excel::ValueReader::XLSX->new($filename);
287             # .. or with LibXML backend :
288             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
289             using => 'LibXML');
290            
291             foreach my $sheet_name ($reader->sheet_names) {
292             my $grid = $reader->values($sheet_name);
293             my $n_rows = @$grid;
294             print "sheet $sheet_name has $n_rows rows; ",
295             "first cell contains : ", $grid->[0][0];
296             }
297            
298             foreach my $table_name ($reader->table_names) {
299             my ($columns, $rows) = $reader->table($table_name);
300             my $n_data_rows = @$rows;
301             my $n_columns = @$columns;
302             print "table $table_name has $n_data_rows rows and $n_columns columns; ",
303             "column 'foo' in first row contains : ", $rows->[0]{foo};
304             }
305              
306             =head1 DESCRIPTION
307              
308             This module reads the contents of an Excel file in XLSX format.
309             Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Spreadsheet::XLSX>,
310             there is no support for reading formulas, formats or other Excel internal
311             information; all you get are plain values -- but you get them much
312             faster ! Besides, this module also has support for parsing Excel tables.
313              
314             This front module has two different backends for extracting values :
315              
316             =over
317              
318             =item Regex (default)
319              
320             this backend uses regular expressions to parse the XML content.
321              
322             =item LibXML
323              
324             this backend uses L<XML::LibXML::Reader> to parse the XML content.
325             It is probably safer but about three times slower than the Regex backend
326             (but still much faster than L<Spreadsheet::ParseXLSX>).
327              
328             =back
329              
330              
331             =head1 METHODS
332              
333             =head2 new
334              
335             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
336             using => $backend,
337             %date_formatting_options);
338              
339             The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed.
340             The C<using> argument is optional; it specifies the backend to be used for parsing;
341             default is 'Regex'.
342              
343             As syntactic sugar, a shorter form is admitted :
344              
345             my $reader = Excel::ValueReader::XLSX->new($filename);
346              
347             Optional parameters for formatting date and time values
348             are described in the L</DATE AND TIME FORMATS> section below.
349              
350              
351             =head2 sheet_names
352              
353             my @sheets = $reader->sheet_names;
354              
355             Returns the list of worksheet names, in the same order as in the Excel file.
356              
357             =head2 values
358              
359             my $grid = $reader->values($sheet);
360              
361             Returns a bidimensional array of scalars, corresponding to cell
362             values in the specified worksheet. The C<$sheet> argument can be either
363             a sheet name or a sheet position (starting at 1).
364              
365             Unlike the original Excel cells, positions in the grid are zero-based,
366             so for example the content of cell B3 is in C<< $grid->[1][2] >>.
367             The grid is sparse : the size of each row depends on the
368             position of the last non-empty cell in that row.
369             Thanks to Perl's auto-vivification mechanism, any attempt to access
370             a non-existent cell will automatically create the corresponding cell
371             within the grid. The number of rows and columns in the grid can be computed
372             like this :
373              
374             my $nb_rows = @$grid;
375             my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
376              
377             =head2 table_names
378              
379             my @table_names = $reader->table_names;
380              
381             Returns the list of names of table registered in this workbook.
382              
383              
384             =head2 table
385              
386             my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
387             # or
388             my ($columns, $rows) = $reader->table(name => $table_name);
389             # or
390             my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $range]
391             [, columns => \@columns]
392             [, no_headers => 1]
393             );
394              
395             In its simplest form, this method returns the content of an Excel table referenced by its table name
396             (in Excel, the table name appears and can be modified through the Table tools / Design tab).
397             The table name is passed either through the named argument C<name>, or positionally as unique argument
398             to the method.
399              
400             Rows are returned as hashrefs, where keys of the hashes correspond to column names
401             in the table. In scalar context, the method just returns an arrayref to the list of rows. In list
402             context, the method returns a pair, where the first element is an arrayref of column names, and the
403             second element is an arrayref to the list of rows.
404              
405             Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
406             By default, this considers the whole sheet content as a single table, where column names
407             are on the first row. However, additional arguments can be supplied to change the default
408             behaviour :
409              
410             =over
411              
412             =item ref
413              
414             a specific range of cells within the sheet that contain the table rows and columns.
415             The range must be expressed using traditional Excel notation,
416             like for example C<"C9:E23"> (colums 3 to 5, rows 9 to 23).
417              
418             =item columns
419              
420             an arrayref containing the list of column names.
421             If absent, column names will be taken from the first row in the table.
422              
423             =item no_headers
424              
425             if true, the first row in the table will be treated as a regular data row, instead
426             of being treated as a list of column names. In that case, since column names cannot
427             be inferred from cell values in the first row, the C<columns> argument to the method
428             must be present.
429              
430             =back
431              
432              
433             =head1 AUXILIARY METHODS
434              
435             =head1 A1_to_num
436              
437             my $col_num = $reader->A1_to_num('A'); # 1
438             $col_num = $reader->A1_to_num('AZ'); # 52
439             $col_num = $reader->A1_to_num('AA'); # 26
440             $col_num = $reader->A1_to_num('ABC'); # 731
441              
442             Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
443             into the corresponding numeric value.
444              
445              
446             =head1 formatted_date
447              
448             my $date = $reader->formatted_date($numeric_date, $excel_date_format);
449              
450             Given a numeric date, this method returns a string date formatted according
451             to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
452             argument should be the Excel format string for that specific cell; it is used
453             only for for deciding if the numeric value should be presented as a date, as a time,
454             or both. Optionally, a custom date formatter callback could be passed as third argument.
455              
456              
457             =head1 DATE AND TIME FORMATS
458              
459             =head2 Date and time handling
460              
461             In Excel, date and times values are stored as numeric values, where the integer part
462             represents the date, and the fractional part represents the time. What distinguishes
463             such numbers from ordinary numbers is the I<numeric format> applied to the cells
464             where they appear.
465              
466             Numeric formats in Excel are complex to reproduce, in particular
467             because they are locale-dependent; therefore the present module does not attempt
468             to faithfully interpret Excel formats. It just infers from formats which
469             cells should be presented as date and/or time values. All such values are then
470             presented through the same I<date_formatter> routine. The default formatter
471             is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
472             parameter (explained below).
473              
474             =head2 Parameters for the default strftime formatter
475              
476             When using the default strftime formatter, the following parameters may be passed
477             to the constructor :
478              
479             =over
480              
481             =item date_format
482              
483             The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
484              
485             =item time_format
486              
487             The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
488              
489             =item datetime_format
490              
491             The L<POSIX/strftime> format for representing date and time together.
492             The default is the concatenation of C<date_format> and C<time_format>, with
493             a space inbetween.
494              
495             =back
496              
497              
498             =head2 Writing a custom formatter
499              
500             A custom algorithm for date formatting can be specified as a parameter to the constructor
501              
502             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
503             date_formatter => sub {...});
504              
505             If this parameter is C<undef>, date formatting is canceled and therefore date and
506             time values will be presented as plain numbers.
507              
508             If not C<undef>, the date formatting routine will we called as :
509              
510             $date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
511              
512             where
513              
514             =over
515              
516             =item *
517              
518             C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
519             C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
520             This is useful to decide if the value should be presented as a date, a time, or both.
521             The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
522             be presented as a date; if the format contains C<h> or C<s>, it should be presented
523             as a time. The letter C<m> is not taken into consideration because it is ambiguous :
524             depending on the position in the format string, it may represent either a "month" or a "minute".
525              
526             =item *
527              
528             C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
529             see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
530             C<year> component wil always be above this value.
531              
532             =item *
533              
534             C<month> is the numeric value of the month, starting at 1
535              
536             =item *
537              
538             C<day> is the numeric value of the day in month, starting at 1
539              
540             =item *
541              
542             C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
543             numeric values.
544              
545             =back
546              
547              
548             =head1 CAVEATS
549              
550             =over
551              
552             =item *
553              
554             This module was optimized for speed, not for completeness of
555             OOXML-SpreadsheetML support; so there may be some edge cases where the
556             output is incorrect with respect to the original Excel data.
557              
558             =item *
559              
560             Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
561             following the old Windows convention. When retrieved through the C<Regex>
562             backend, the result contains the original C<< \r >> and C<< \n >> characters;
563             but when retrieved through the LibXML, C<< \r >> are silently removed by the
564             C<XML::LibXML> package.
565              
566             =back
567              
568             =head1 SEE ALSO
569              
570             The official reference for OOXML-SpreadsheetML format is in
571             L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
572              
573             Introductory material on XLSX file structure can be found at
574             L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
575              
576             The CPAN module L<Data::XLSX::Parser> is claimed to be in alpha stage;
577             it seems to be working but the documentation is insufficient -- I had
578             to inspect the test suite to understand how to use it.
579              
580             Another unpublished but working module for parsing Excel files in Perl
581             can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
582             Some test cases were borrowed from that distribution.
583              
584             Conversions from and to Excel internal date format can also be performed
585             through the L<DateTime::Format::Excel> module.
586              
587             =head1 BENCHMARKS
588              
589             Below are some benchmarks computed with the program C<benchmark.pl> in
590             this distribution. The task was to parse an Excel file of five worksheets
591             with about 62600 rows in total, and report the number of rows per sheet.
592             Reported figures are in seconds.
593              
594             Excel::ValueReader::XLSX::Regex 11 elapsed, 10 cpu, 0 system
595             Excel::ValueReader::XLSX::LibXML 35 elapsed, 34 cpu, 0 system
596             [unpublished] Excel::Reader::XLSX 39 elapsed, 37 cpu, 0 system
597             Spreadsheet::ParseXLSX 244 elapsed, 240 cpu, 1 system
598             Data::XLSX::Parser 37 elapsed, 35 cpu, 0 system
599              
600             These figures show that the regex version is about 3 times faster
601             than the LibXML version, and about 22 times faster than
602             L<Spreadsheet::ParseXLSX>. Tests with a bigger file of about 90000 rows
603             showed similar ratios.
604              
605             Modules
606             C<Excel::Reader::XLSX> (unpublished) and L<Data::XLSX::Parser>
607             are based on L<XML::LibXML> like L<Excel::ValueReader::XLSX::Backend::LibXML>;
608             execution times for those three modules are very close.
609              
610             =head1 ACKNOWLEDGEMENTS
611              
612             =over
613              
614             =item *
615              
616             David Flink signaled (and fixed) a bug about strings with embedded newline characters
617              
618             =back
619              
620              
621             =head1 AUTHOR
622              
623             Laurent Dami, E<lt>dami at cpan.orgE<gt>
624              
625             =head1 COPYRIGHT AND LICENSE
626              
627             Copyright 2020-2022 by Laurent Dami.
628              
629             This library is free software; you can redistribute it and/or modify
630             it under the same terms as Perl itself.