File Coverage

blib/lib/Excel/ValueReader/XLSX.pm
Criterion Covered Total %
statement 108 111 97.3
branch 28 40 70.0
condition 6 11 54.5
subroutine 17 17 100.0
pod 3 5 60.0
total 162 184 88.0


line stmt bran cond sub pod time code
1             package Excel::ValueReader::XLSX;
2 1     1   156168 use utf8;
  1         2  
  1         6  
3 1     1   677 use Moose;
  1         476794  
  1         8  
4 1     1   9062 use Module::Load qw/load/;
  1         2  
  1         11  
5 1     1   754 use Date::Calc qw/Add_Delta_Days/;
  1         7125  
  1         115  
6 1     1   7 use POSIX qw/strftime modf/;
  1         2  
  1         15  
7 1     1   1758 use Carp qw/croak/;
  1         2  
  1         51  
8 1     1   6 use feature 'state';
  1         2  
  1         1933  
9              
10             our $VERSION = '1.10';
11              
12             #======================================================================
13             # ATTRIBUTES
14             #======================================================================
15              
16             # PUBLIC ATTRIBUTES
17             has 'xlsx' => (is => 'ro', isa => 'Str', required => 1); # path of xlsx file
18             has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
19             has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
20             has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
21             has 'datetime_format' => (is => 'ro', isa => 'Str',
22             builder => '_datetime_format', lazy => 1);
23             has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]',
24             builder => '_date_formatter', lazy => 1);
25              
26              
27              
28             # ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
29             has 'backend' => (is => 'ro', isa => 'Object', init_arg => undef,
30             builder => '_backend', lazy => 1,
31             handles => [qw/values base_year sheets/]);
32              
33             #======================================================================
34             # BUILDING
35             #======================================================================
36              
37             # syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
38             around BUILDARGS => sub {
39             my $orig = shift;
40             my $class = shift;
41              
42             if ( @_ == 1 && !ref $_[0] ) {
43             return $class->$orig(xlsx => $_[0]);
44             }
45             else {
46             return $class->$orig(@_);
47             }
48             };
49              
50              
51              
52             #======================================================================
53             # ATTRIBUTE CONSTRUCTORS
54             #======================================================================
55              
56              
57             sub _backend {
58 14     14   34 my $self = shift;
59              
60 14         439 my $backend_class = ref($self) . '::Backend::' . $self->using;
61 14         71 load $backend_class;
62              
63 14         1112 return $backend_class->new(frontend => $self);
64             }
65              
66             sub _datetime_format {
67 12     12   27 my ($self) = @_;
68 12         339 return $self->date_format . ' ' . $self->time_format;
69             }
70              
71             sub _date_formatter {
72 12     12   31 my ($self) = @_;
73              
74             # local copies of the various formats so that we can build a closure
75 12         360 my @formats = (undef, # 0 -- error
76             $self->date_format, # 1 -- just a date
77             $self->time_format, # 2 -- just a time
78             $self->datetime_format); # 3 -- date and time
79              
80             my $strftime_formatter = sub {
81 156     156   353 my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
82              
83             # choose the proper format for strftime
84 156         210 my $ix = 0; # index into the @formats array
85 156 100       466 $ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
86 156 100       349 $ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
87 156 50       331 my $strftime_format = $formats[$ix]
88             or die "cell with unexpected Excel date format : $xl_date_format";
89              
90             # formatting through strftime
91 156         5036 my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
92              
93 156         630 return $formatted_date;
94 12         74 };
95              
96 12         363 return $strftime_formatter;
97             }
98              
99              
100             #======================================================================
101             # GENERAL METHODS
102             #======================================================================
103              
104              
105             sub sheet_names {
106 2     2 1 4529 my ($self) = @_;
107              
108 2         19 my $sheets = $self->sheets; # arrayref of shape {$name => $sheet_position}
109              
110 2         18 my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
  18         61  
111 2         12 return @sorted_names;
112             }
113              
114              
115             sub A1_to_num { # convert Excel A1 reference format to a number
116 2406     2406 0 22030 my ($self, $string) = @_;
117              
118             # ordinal number for character just before 'A'
119 2406         3503 state $base = ord('A') - 1;
120              
121             # iterate on 'digits' (letters of the A1 cell reference)
122 2406         3258 my $num = 0;
123 2406         5368 foreach my $digit (map {ord($_) - $base} split //, $string) {
  2406         6387  
124 2406         4373 $num = $num*26 + $digit;
125             }
126              
127 2406         6793 return $num;
128             }
129              
130              
131             sub formatted_date {
132 156     156 0 8233 my ($self, $val, $date_format, $date_formatter) = @_;
133              
134             # separate date (integer part) from time (fractional part)
135 156         524 my ($time, $n_days) = modf($val);
136              
137             # Convert $n_days into a date in Date::Calc format (year, month, day).
138             # The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
139             # in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
140             # Excel treats 1900 as a leap year.
141 156         512 my $base_year = $self->base_year;
142 156 100       402 if ($base_year == 1900) {
143 114         226 my $is_after_february_1900 = $n_days > 60;
144 114 100       245 $n_days -= $is_after_february_1900 ? 2 : 1;
145             }
146 156         577 my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
147              
148             # decode the fractional part (the time) into hours, minutes, seconds, milliseconds
149 156         240 my @t;
150 156         289 foreach my $subdivision (24, 60, 60, 1000) {
151 624         817 $time *= $subdivision;
152 624         1179 ($time, my $time_portion) = modf($time);
153 624         1051 push @t, $time_portion;
154             }
155              
156             # dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
157 156         325 my ($h, $m, $s, $ms) = @t;
158 156 100       319 if ($ms == 999) {
159 2         17 $s += 1, $ms = 0;
160 2 50       67 if ($s == 60) {
161 0         0 $m += 1, $s = 0;
162 0 0       0 if ($m == 60) {
163 0         0 $h += 1, $m = 0;
164             }
165             }
166             }
167             # NOTE : because of this hack, theoretically we could end up with a value
168             # like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
169             # in its rendering.
170              
171             # call the date_formatter subroutine
172 156 50 33     4891 $date_formatter //= $self->date_formatter
173             or die ref($self) . " has no date_formatter subroutine";
174 156         366 my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
175              
176 156         648 return $formatted_date;
177             }
178              
179             #======================================================================
180             # METHODS FOR PARSING EXCEL TABLES
181             #======================================================================
182              
183              
184             sub table_names {
185 2     2 1 5625 my ($self) = @_;
186              
187 2         82 my $table_info = $self->backend->table_info;
188              
189             # sort on table id (field [1] in table_info arrayrefs)
190 2         19 my @table_names = sort {$table_info->{$a}[1] <=> $table_info->{$b}->[1]} keys %$table_info;
  13         35  
191              
192 2         19 return @table_names;
193             }
194              
195              
196             # info fields returned from the backend parsing methods
197             my @table_info_fields = qw/sheet table_id ref columns no_headers/;
198              
199              
200             # the same fields are also the valid args for the method call
201             my $is_valid_arg = "^(" . join("|", @table_info_fields) . ")\$";
202              
203             sub table {
204 10     10 1 7119 my $self = shift;
205              
206             # syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
207 10 50       76 my %args = @_ == 1 ? (name => $_[0]) : @_;
208              
209             # if called with a table name, derive all other args from internal workbook info
210 10 50       38 if (my $table_name = delete $args{name}) {
211 10   50     72 !$args{$_} or croak "table() : arg '$_' is incompatible with 'name'" for @table_info_fields;
212 10 50       18 @args{@table_info_fields} = @{$self->backend->table_info->{$table_name}}
  10         387  
213             or croak "no table info for table: $table_name";
214             }
215              
216             # check args
217 10         38 my @invalid_args = grep {!/$is_valid_arg/} keys %args;
  50         263  
218 10 50       32 croak "invalid args to table(): " . join ", ", @invalid_args if @invalid_args;
219              
220             # get raw values from the sheet
221 10         46 my $values = $self->values($args{sheet});
222              
223             # restrict values to the table subrange (if applicable)
224 10 50       194 $values = $self->_subrange($values, $args{ref}) if $args{ref};
225              
226             # take headers from first row if not already given in $args{columns}
227 10   33     26 $args{columns} //= $values->[0];
228              
229             # if this table has headers (which is almost always the case), drop the header row
230 10 100       25 shift @$values unless $args{no_headers};
231              
232             # build a table of hashes. This could be done with a simple map(), but using a loop
233             # avoids to store 2 copies of cell values in memory : @$values is shifted when @table is pushed.
234 10         19 my @cols = @{$args{columns}};
  10         32  
235 10 50       20 croak "table contains undefined columns" if grep {!defined $_} @cols;
  32         70  
236 10         21 my @rows;
237 10         29 while (my $vals = shift @$values) {
238 150         211 my %row;
239 150         517 @row{@cols} = @$vals;
240 150         429 push @rows, \%row;
241             }
242              
243             # in scalar context, just return the rows. In list context, also return the column names
244 10 100       83 return wantarray ? (\@cols, \@rows) : \@rows;
245             }
246              
247              
248              
249             sub _subrange {
250 10     10   26 my ($self, $values, $ref) = @_;
251              
252             # parse rows and columns from the $ref string (of shape like for example "A1:D34")
253 10 50       91 my ($col1, $row1, $col2, $row2) = $ref =~ /^([A-Z]+)(\d+):([A-Z]+)(\d+)$/
254             or croak "_subrange : invalid ref: $ref";
255              
256             # restrict to the row range
257 10 100 100     46 if ($row1 > 1 || $row2 < @$values){
258 8         31 $values = [ @$values[$row1-1 .. $row2-1] ];
259             }
260              
261             # restrict to the column range
262 10         33 my @col_nums = map {$self->A1_to_num($_) - 1} ($col1, $col2);
  20         41  
263 10 100       36 if ($col_nums[0] > 1){ # THINK : should check if $colnum2 is smaller that the max row size ??
264 6         17 my @col_range = ($col_nums[0] .. $col_nums[1]);
265 6         11 $values = [map { [ @$_[@col_range] ]} @$values];
  16         49  
266             }
267              
268 10         56 return $values;
269             }
270              
271              
272             1;
273              
274              
275             __END__
276              
277             =head1 NAME
278              
279              
280             Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
281              
282             =head1 SYNOPSIS
283              
284             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename);
285             # .. or with syntactic sugar :
286             my $reader = Excel::ValueReader::XLSX->new($filename);
287             # .. or with LibXML backend :
288             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
289             using => 'LibXML');
290            
291             foreach my $sheet_name ($reader->sheet_names) {
292             my $grid = $reader->values($sheet_name);
293             my $n_rows = @$grid;
294             print "sheet $sheet_name has $n_rows rows; ",
295             "first cell contains : ", $grid->[0][0];
296             }
297            
298             foreach my $table_name ($reader->table_names) {
299             my ($columns, $rows) = $reader->table($table_name);
300             my $n_data_rows = @$rows;
301             my $n_columns = @$columns;
302             print "table $table_name has $n_data_rows rows and $n_columns columns; ",
303             "column 'foo' in first row contains : ", $rows->[0]{foo};
304             }
305              
306             =head1 DESCRIPTION
307              
308             This module reads the contents of an Excel file in XLSX format.
309             Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Spreadsheet::XLSX>,
310             there is no support for reading formulas, formats or other Excel internal
311             information; all you get are plain values -- but you get them much
312             faster ! Besides, this module also has support for parsing Excel tables.
313              
314             This front module has two different backends for extracting values :
315              
316             =over
317              
318             =item Regex (default)
319              
320             this backend uses regular expressions to parse the XML content.
321              
322             =item LibXML
323              
324             this backend uses L<XML::LibXML::Reader> to parse the XML content.
325             It is probably safer but about three times slower than the Regex backend
326             (but still much faster than L<Spreadsheet::ParseXLSX>).
327              
328             =back
329              
330              
331             =head1 METHODS
332              
333             =head2 new
334              
335             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
336             using => $backend,
337             %date_formatting_options);
338              
339             The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed.
340             The C<using> argument is optional; it specifies the backend to be used for parsing;
341             default is 'Regex'.
342              
343             As syntactic sugar, a shorter form is admitted :
344              
345             my $reader = Excel::ValueReader::XLSX->new($filename);
346              
347             Optional parameters for formatting date and time values
348             are described in the L</DATE AND TIME FORMATS> section below.
349              
350              
351             =head2 sheet_names
352              
353             my @sheets = $reader->sheet_names;
354              
355             Returns the list of worksheet names, in the same order as in the Excel file.
356              
357             =head2 values
358              
359             my $grid = $reader->values($sheet);
360              
361             Returns a bidimensional array of scalars, corresponding to cell
362             values in the specified worksheet. The C<$sheet> argument can be either
363             a sheet name or a sheet position (starting at 1).
364              
365             Unlike the original Excel cells, positions in the grid are zero-based,
366             so for example the content of cell B3 is in C<< $grid->[1][2] >>.
367             The grid is sparse : the size of each row depends on the
368             position of the last non-empty cell in that row.
369             Thanks to Perl's auto-vivification mechanism, any attempt to access
370             a non-existent cell will automatically create the corresponding cell
371             within the grid. The number of rows and columns in the grid can be computed
372             like this :
373              
374             my $nb_rows = @$grid;
375             my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
376              
377             =head2 table_names
378              
379             my @table_names = $reader->table_names;
380              
381             Returns the list of names of tables registered in this workbook.
382              
383              
384             =head2 table
385              
386             my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
387             # or
388             my ($columns, $rows) = $reader->table(name => $table_name);
389             # or
390             my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $range]
391             [, columns => \@columns]
392             [, no_headers => 1]
393             );
394              
395             In its simplest form, this method returns the content of an Excel table referenced by its table name
396             (in Excel, the table name appears and can be modified through the ribbon tab
397             entry "Table tools / Design").
398             The table name is passed either through the named argument C<name>, or positionally as unique argument
399             to the method.
400              
401             Rows are returned as hashrefs, where keys of the hashes correspond to column names
402             in the table. In scalar context, the method just returns an arrayref to the list of rows. In list
403             context, the method returns a pair, where the first element is an arrayref of column names, and the
404             second element is an arrayref to the list of rows.
405              
406             Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
407             By default, this considers the whole sheet content as a single table, where column names
408             are on the first row. However, additional arguments can be supplied to change the default
409             behaviour :
410              
411             =over
412              
413             =item ref
414              
415             a specific range of cells within the sheet that contain the table rows and columns.
416             The range must be expressed using traditional Excel notation,
417             like for example C<"C9:E23"> (colums 3 to 5, rows 9 to 23).
418              
419             =item columns
420              
421             an arrayref containing the list of column names.
422             If absent, column names will be taken from the first row in the table.
423              
424             =item no_headers
425              
426             if true, the first row in the table will be treated as a regular data row, instead
427             of being treated as a list of column names. In that case, since column names cannot
428             be inferred from cell values in the first row, the C<columns> argument to the method
429             must be present.
430              
431             =back
432              
433              
434             =head1 AUXILIARY METHODS
435              
436             =head1 A1_to_num
437              
438             my $col_num = $reader->A1_to_num('A'); # 1
439             $col_num = $reader->A1_to_num('AZ'); # 52
440             $col_num = $reader->A1_to_num('AA'); # 26
441             $col_num = $reader->A1_to_num('ABC'); # 731
442              
443             Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
444             into the corresponding numeric value.
445              
446              
447             =head1 formatted_date
448              
449             my $date = $reader->formatted_date($numeric_date, $excel_date_format);
450              
451             Given a numeric date, this method returns a string date formatted according
452             to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
453             argument should be the Excel format string for that specific cell; it is used
454             only for for deciding if the numeric value should be presented as a date, as a time,
455             or both. Optionally, a custom date formatter callback could be passed as third argument.
456              
457              
458             =head1 DATE AND TIME FORMATS
459              
460             =head2 Date and time handling
461              
462             In Excel, date and times values are stored as numeric values, where the integer part
463             represents the date, and the fractional part represents the time. What distinguishes
464             such numbers from ordinary numbers is the I<numeric format> applied to the cells
465             where they appear.
466              
467             Numeric formats in Excel are complex to reproduce, in particular
468             because they are locale-dependent; therefore the present module does not attempt
469             to faithfully interpret Excel formats. It just infers from formats which
470             cells should be presented as date and/or time values. All such values are then
471             presented through the same I<date_formatter> routine. The default formatter
472             is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
473             parameter (explained below).
474              
475             =head2 Parameters for the default strftime formatter
476              
477             When using the default strftime formatter, the following parameters may be passed
478             to the constructor :
479              
480             =over
481              
482             =item date_format
483              
484             The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
485              
486             =item time_format
487              
488             The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
489              
490             =item datetime_format
491              
492             The L<POSIX/strftime> format for representing date and time together.
493             The default is the concatenation of C<date_format> and C<time_format>, with
494             a space inbetween.
495              
496             =back
497              
498              
499             =head2 Writing a custom formatter
500              
501             A custom algorithm for date formatting can be specified as a parameter to the constructor
502              
503             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
504             date_formatter => sub {...});
505              
506             If this parameter is C<undef>, date formatting is canceled and therefore date and
507             time values will be presented as plain numbers.
508              
509             If not C<undef>, the date formatting routine will we called as :
510              
511             $date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
512              
513             where
514              
515             =over
516              
517             =item *
518              
519             C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
520             C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
521             This is useful to decide if the value should be presented as a date, a time, or both.
522             The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
523             be presented as a date; if the format contains C<h> or C<s>, it should be presented
524             as a time. The letter C<m> is not taken into consideration because it is ambiguous :
525             depending on the position in the format string, it may represent either a "month" or a "minute".
526              
527             =item *
528              
529             C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
530             see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
531             C<year> component wil always be above this value.
532              
533             =item *
534              
535             C<month> is the numeric value of the month, starting at 1
536              
537             =item *
538              
539             C<day> is the numeric value of the day in month, starting at 1
540              
541             =item *
542              
543             C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
544             numeric values.
545              
546             =back
547              
548              
549             =head1 CAVEATS
550              
551             =over
552              
553             =item *
554              
555             This module was optimized for speed, not for completeness of
556             OOXML-SpreadsheetML support; so there may be some edge cases where the
557             output is incorrect with respect to the original Excel data.
558              
559             =item *
560              
561             Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
562             following the old Windows convention. When retrieved through the C<Regex>
563             backend, the result contains the original C<< \r >> and C<< \n >> characters;
564             but when retrieved through the LibXML, C<< \r >> are silently removed by the
565             C<XML::LibXML> package.
566              
567             =back
568              
569             =head1 SEE ALSO
570              
571             The official reference for OOXML-SpreadsheetML format is in
572             L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
573              
574             Introductory material on XLSX file structure can be found at
575             L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
576              
577             The CPAN module L<Data::XLSX::Parser> is claimed to be in alpha stage;
578             it seems to be working but the documentation is insufficient -- I had
579             to inspect the test suite to understand how to use it.
580              
581             Another unpublished but working module for parsing Excel files in Perl
582             can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
583             Some test cases were borrowed from that distribution.
584              
585             Conversions from and to Excel internal date format can also be performed
586             through the L<DateTime::Format::Excel> module.
587              
588             =head1 BENCHMARKS
589              
590             Below are some benchmarks computed with the program C<benchmark.pl> in
591             this distribution. The task was to parse an Excel file of five worksheets
592             with about 62600 rows in total, and report the number of rows per sheet.
593             Reported figures are in seconds.
594              
595             Excel::ValueReader::XLSX::Regex 11 elapsed, 10 cpu, 0 system
596             Excel::ValueReader::XLSX::LibXML 35 elapsed, 34 cpu, 0 system
597             [unpublished] Excel::Reader::XLSX 39 elapsed, 37 cpu, 0 system
598             Spreadsheet::ParseXLSX 244 elapsed, 240 cpu, 1 system
599             Data::XLSX::Parser 37 elapsed, 35 cpu, 0 system
600              
601             These figures show that the regex version is about 3 times faster
602             than the LibXML version, and about 22 times faster than
603             L<Spreadsheet::ParseXLSX>. Tests with a bigger file of about 90000 rows
604             showed similar ratios.
605              
606             Modules
607             C<Excel::Reader::XLSX> (unpublished) and L<Data::XLSX::Parser>
608             are based on L<XML::LibXML> like L<Excel::ValueReader::XLSX::Backend::LibXML>;
609             execution times for those three modules are very close.
610              
611             =head1 ACKNOWLEDGEMENTS
612              
613             =over
614              
615             =item *
616              
617             David Flink signaled (and fixed) a bug about strings with embedded newline characters, and
618             signaled that the 'r' attribute in cells is optional.
619              
620             =item *
621              
622             Ulibuck signaled bugs several minor bugs on the LibXML backend
623              
624             =back
625              
626              
627             =head1 AUTHOR
628              
629             Laurent Dami, E<lt>dami at cpan.orgE<gt>
630              
631             =head1 COPYRIGHT AND LICENSE
632              
633             Copyright 2020-2023 by Laurent Dami.
634              
635             This library is free software; you can redistribute it and/or modify
636             it under the same terms as Perl itself.