File Coverage

blib/lib/Excel/ValueReader/XLSX.pm

Criterion	Covered	Total	%
statement	108	111	97.3
branch	28	40	70.0
condition	6	11	54.5
subroutine	17	17	100.0
pod	3	5	60.0
total	162	184	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX;
2	1			1		179625	use utf8;
	1					2
	1					7
3	1			1		533	use Moose;
	1					660258
	1					7
4	1			1		7894	use Module::Load qw/load/;
	1					2
	1					10
5	1			1		605	use Date::Calc qw/Add_Delta_Days/;
	1					6793
	1					134
6	1			1		8	use POSIX qw/strftime modf/;
	1					3
	1					7
7	1			1		1554	use Carp qw/croak/;
	1					2
	1					42
8	1			1		7	use feature 'state';
	1					2
	1					1953
9
10							our $VERSION = '1.09';
11
12							#======================================================================
13							# ATTRIBUTES
14							#======================================================================
15
16							# PUBLIC ATTRIBUTES
17							has 'xlsx' => (is => 'ro', isa => 'Str', required => 1); # path of xlsx file
18							has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
19							has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
20							has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
21							has 'datetime_format' => (is => 'ro', isa => 'Str',
22							builder => '_datetime_format', lazy => 1);
23							has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]',
24							builder => '_date_formatter', lazy => 1);
25
26
27
28							# ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
29							has 'backend' => (is => 'ro', isa => 'Object', init_arg => undef,
30							builder => '_backend', lazy => 1,
31							handles => [qw/values base_year sheets/]);
32
33							#======================================================================
34							# BUILDING
35							#======================================================================
36
37							# syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
38							around BUILDARGS => sub {
39							my $orig = shift;
40							my $class = shift;
41
42							if ( @_ == 1 && !ref $_[0] ) {
43							return $class->$orig(xlsx => $_[0]);
44							}
45							else {
46							return $class->$orig(@_);
47							}
48							};
49
50
51
52							#======================================================================
53							# ATTRIBUTE CONSTRUCTORS
54							#======================================================================
55
56
57							sub _backend {
58	12			12		26	my $self = shift;
59
60	12					318	my $backend_class = ref($self) . '::Backend::' . $self->using;
61	12					66	load $backend_class;
62
63	12					945	return $backend_class->new(frontend => $self);
64							}
65
66							sub _datetime_format {
67	10			10		20	my ($self) = @_;
68	10					236	return $self->date_format . ' ' . $self->time_format;
69							}
70
71							sub _date_formatter {
72	10			10		26	my ($self) = @_;
73
74							# local copies of the various formats so that we can build a closure
75	10					261	my @formats = (undef, # 0 -- error
76							$self->date_format, # 1 -- just a date
77							$self->time_format, # 2 -- just a time
78							$self->datetime_format); # 3 -- date and time
79
80							my $strftime_formatter = sub {
81	156			156		363	my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
82
83							# choose the proper format for strftime
84	156					197	my $ix = 0; # index into the @formats array
85	156	100				462	$ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
86	156	100				304	$ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
87	156	50				293	my $strftime_format = $formats[$ix]
88							or die "cell with unexpected Excel date format : $xl_date_format";
89
90							# formatting through strftime
91	156					4849	my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
92
93	156					602	return $formatted_date;
94	10					57	};
95
96	10					260	return $strftime_formatter;
97							}
98
99
100							#======================================================================
101							# GENERAL METHODS
102							#======================================================================
103
104
105							sub sheet_names {
106	2			2	1	4026	my ($self) = @_;
107
108	2					13	my $sheets = $self->sheets; # arrayref of shape {$name => $sheet_position}
109
110	2					16	my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
	18					56
111	2					11	return @sorted_names;
112							}
113
114
115							sub A1_to_num { # convert Excel A1 reference format to a number
116	2404			2404	0	25250	my ($self, $string) = @_;
117
118							# ordinal number for character just before 'A'
119	2404					3384	state $base = ord('A') - 1;
120
121							# iterate on 'digits' (letters of the A1 cell reference)
122	2404					3197	my $num = 0;
123	2404					5457	foreach my $digit (map {ord($_) - $base} split //, $string) {
	2404					6646
124	2404					4486	$num = $num*26 + $digit;
125							}
126
127	2404					7619	return $num;
128							}
129
130
131							sub formatted_date {
132	156			156	0	16606	my ($self, $val, $date_format, $date_formatter) = @_;
133
134							# separate date (integer part) from time (fractional part)
135	156					555	my ($time, $n_days) = modf($val);
136
137							# Convert $n_days into a date in Date::Calc format (year, month, day).
138							# The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
139							# in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
140							# Excel treats 1900 as a leap year.
141	156					482	my $base_year = $self->base_year;
142	156	100				321	if ($base_year == 1900) {
143	114					261	my $is_after_february_1900 = $n_days > 60;
144	114	100				379	$n_days -= $is_after_february_1900 ? 2 : 1;
145							}
146	156					515	my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
147
148							# decode the fractional part (the time) into hours, minutes, seconds, milliseconds
149	156					217	my @t;
150	156					240	foreach my $subdivision (24, 60, 60, 1000) {
151	624					787	$time *= $subdivision;
152	624					1124	($time, my $time_portion) = modf($time);
153	624					858	push @t, $time_portion;
154							}
155
156							# dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
157	156					393	my ($h, $m, $s, $ms) = @t;
158	156	100				276	if ($ms == 999) {
159	2					7	$s += 1, $ms = 0;
160	2	50				8	if ($s == 60) {
161	0					0	$m += 1, $s = 0;
162	0	0				0	if ($m == 60) {
163	0					0	$h += 1, $m = 0;
164							}
165							}
166							}
167							# NOTE : because of this hack, theoretically we could end up with a value
168							# like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
169							# in its rendering.
170
171							# call the date_formatter subroutine
172	156	50	33			4487	$date_formatter //= $self->date_formatter
173							or die ref($self) . " has no date_formatter subroutine";
174	156					423	my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
175
176	156					612	return $formatted_date;
177							}
178
179							#======================================================================
180							# METHODS FOR PARSING EXCEL TABLES
181							#======================================================================
182
183
184							sub table_names {
185	2			2	1	9095	my ($self) = @_;
186
187	2					73	my $table_info = $self->backend->table_info;
188
189							# sort on table id (field [1] in table_info arrayrefs)
190	2					15	my @table_names = sort {$table_info->{$a}[1] <=> $table_info->{$b}->[1]} keys %$table_info;
	16					40
191
192	2					17	return @table_names;
193							}
194
195
196							# info fields returned from the backend parsing methods
197							my @table_info_fields = qw/sheet table_id ref columns no_headers/;
198
199
200							# the same fields are also the valid args for the method call
201							my $is_valid_arg = "^(" . join("\|", @table_info_fields) . ")\$";
202
203							sub table {
204	10			10	1	9061	my $self = shift;
205
206							# syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
207	10	50				49	my %args = @_ == 1 ? (name => $_[0]) : @_;
208
209							# if called with a table name, derive all other args from internal workbook info
210	10	50				48	if (my $table_name = delete $args{name}) {
211	10		50			77	!$args{$_} or croak "table() : arg '$_' is incompatible with 'name'" for @table_info_fields;
212	10	50				120	@args{@table_info_fields} = @{$self->backend->table_info->{$table_name}}
	10					482
213							or croak "no table info for table: $table_name";
214							}
215
216							# check args
217	10					34	my @invalid_args = grep {!/$is_valid_arg/} keys %args;
	50					587
218	10	50				31	croak "invalid args to table(): " . join ", ", @invalid_args if @invalid_args;
219
220							# get raw values from the sheet
221	10					60	my $values = $self->values($args{sheet});
222
223							# restrict values to the table subrange (if applicable)
224	10	50				225	$values = $self->_subrange($values, $args{ref}) if $args{ref};
225
226							# take headers from first row if not already given in $args{columns}
227	10		33			33	$args{columns} //= $values->[0];
228
229							# if this table has headers (which is almost always the case), drop the header row
230	10	100				26	shift @$values unless $args{no_headers};
231
232							# build a table of hashes. This could be done with a simple map(), but using a loop
233							# avoids to store 2 copies of cell values in memory : @$values is shifted when @table is pushed.
234	10					35	my @cols = @{$args{columns}};
	10					31
235	10	50				22	croak "table contains undefined columns" if grep {!defined $_} @cols;
	32					66
236	10					21	my @rows;
237	10					27	while (my $vals = shift @$values) {
238	150					164	my %row;
239	150					574	@row{@cols} = @$vals;
240	150					435	push @rows, \%row;
241							}
242
243							# in scalar context, just return the rows. In list context, also return the column names
244	10	100				74	return wantarray ? (\@cols, \@rows) : \@rows;
245							}
246
247
248
249							sub _subrange {
250	10			10		32	my ($self, $values, $ref) = @_;
251
252							# parse rows and columns from the $ref string (of shape like for example "A1:D34")
253	10	50				101	my ($col1, $row1, $col2, $row2) = $ref =~ /^([A-Z]+)(\d+):([A-Z]+)(\d+)$/
254							or croak "_subrange : invalid ref: $ref";
255
256							# restrict to the row range
257	10	100	100			55	if ($row1 > 1 \|\| $row2 < @$values){
258	8					32	$values = [ @$values[$row1-1 .. $row2-1] ];
259							}
260
261							# restrict to the column range
262	10					24	my @col_nums = map {$self->A1_to_num($_) - 1} ($col1, $col2);
	20					40
263	10	100				27	if ($col_nums[0] > 1){ # THINK : should check if $colnum2 is smaller that the max row size ??
264	6					12	my @col_range = ($col_nums[0] .. $col_nums[1]);
265	6					13	$values = [map { [ @$_[@col_range] ]} @$values];
	16					44
266							}
267
268	10					42	return $values;
269							}
270
271
272							1;
273
274
275							__END__
276
277							=head1 NAME
278
279
280							Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
281
282							=head1 SYNOPSIS
283
284							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename);
285							# .. or with syntactic sugar :
286							my $reader = Excel::ValueReader::XLSX->new($filename);
287							# .. or with LibXML backend :
288							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
289							using => 'LibXML');
290
291							foreach my $sheet_name ($reader->sheet_names) {
292							my $grid = $reader->values($sheet_name);
293							my $n_rows = @$grid;
294							print "sheet $sheet_name has $n_rows rows; ",
295							"first cell contains : ", $grid->[0][0];
296							}
297
298							foreach my $table_name ($reader->table_names) {
299							my ($columns, $rows) = $reader->table($table_name);
300							my $n_data_rows = @$rows;
301							my $n_columns = @$columns;
302							print "table $table_name has $n_data_rows rows and $n_columns columns; ",
303							"column 'foo' in first row contains : ", $rows->[0]{foo};
304							}
305
306							=head1 DESCRIPTION
307
308							This module reads the contents of an Excel file in XLSX format.
309							Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Spreadsheet::XLSX>,
310							there is no support for reading formulas, formats or other Excel internal
311							information; all you get are plain values -- but you get them much
312							faster ! Besides, this module also has support for parsing Excel tables.
313
314							This front module has two different backends for extracting values :
315
316							=over
317
318							=item Regex (default)
319
320							this backend uses regular expressions to parse the XML content.
321
322							=item LibXML
323
324							this backend uses L<XML::LibXML::Reader> to parse the XML content.
325							It is probably safer but about three times slower than the Regex backend
326							(but still much faster than L<Spreadsheet::ParseXLSX>).
327
328							=back
329
330
331							=head1 METHODS
332
333							=head2 new
334
335							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
336							using => $backend,
337							%date_formatting_options);
338
339							The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed.
340							The C<using> argument is optional; it specifies the backend to be used for parsing;
341							default is 'Regex'.
342
343							As syntactic sugar, a shorter form is admitted :
344
345							my $reader = Excel::ValueReader::XLSX->new($filename);
346
347							Optional parameters for formatting date and time values
348							are described in the L</DATE AND TIME FORMATS> section below.
349
350
351							=head2 sheet_names
352
353							my @sheets = $reader->sheet_names;
354
355							Returns the list of worksheet names, in the same order as in the Excel file.
356
357							=head2 values
358
359							my $grid = $reader->values($sheet);
360
361							Returns a bidimensional array of scalars, corresponding to cell
362							values in the specified worksheet. The C<$sheet> argument can be either
363							a sheet name or a sheet position (starting at 1).
364
365							Unlike the original Excel cells, positions in the grid are zero-based,
366							so for example the content of cell B3 is in C<< $grid->[1][2] >>.
367							The grid is sparse : the size of each row depends on the
368							position of the last non-empty cell in that row.
369							Thanks to Perl's auto-vivification mechanism, any attempt to access
370							a non-existent cell will automatically create the corresponding cell
371							within the grid. The number of rows and columns in the grid can be computed
372							like this :
373
374							my $nb_rows = @$grid;
375							my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
376
377							=head2 table_names
378
379							my @table_names = $reader->table_names;
380
381							Returns the list of names of table registered in this workbook.
382
383
384							=head2 table
385
386							my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
387							# or
388							my ($columns, $rows) = $reader->table(name => $table_name);
389							# or
390							my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $range]
391							[, columns => \@columns]
392							[, no_headers => 1]
393							);
394
395							In its simplest form, this method returns the content of an Excel table referenced by its table name
396							(in Excel, the table name appears and can be modified through the Table tools / Design tab).
397							The table name is passed either through the named argument C<name>, or positionally as unique argument
398							to the method.
399
400							Rows are returned as hashrefs, where keys of the hashes correspond to column names
401							in the table. In scalar context, the method just returns an arrayref to the list of rows. In list
402							context, the method returns a pair, where the first element is an arrayref of column names, and the
403							second element is an arrayref to the list of rows.
404
405							Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
406							By default, this considers the whole sheet content as a single table, where column names
407							are on the first row. However, additional arguments can be supplied to change the default
408							behaviour :
409
410							=over
411
412							=item ref
413
414							a specific range of cells within the sheet that contain the table rows and columns.
415							The range must be expressed using traditional Excel notation,
416							like for example C<"C9:E23"> (colums 3 to 5, rows 9 to 23).
417
418							=item columns
419
420							an arrayref containing the list of column names.
421							If absent, column names will be taken from the first row in the table.
422
423							=item no_headers
424
425							if true, the first row in the table will be treated as a regular data row, instead
426							of being treated as a list of column names. In that case, since column names cannot
427							be inferred from cell values in the first row, the C<columns> argument to the method
428							must be present.
429
430							=back
431
432
433							=head1 AUXILIARY METHODS
434
435							=head1 A1_to_num
436
437							my $col_num = $reader->A1_to_num('A'); # 1
438							$col_num = $reader->A1_to_num('AZ'); # 52
439							$col_num = $reader->A1_to_num('AA'); # 26
440							$col_num = $reader->A1_to_num('ABC'); # 731
441
442							Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
443							into the corresponding numeric value.
444
445
446							=head1 formatted_date
447
448							my $date = $reader->formatted_date($numeric_date, $excel_date_format);
449
450							Given a numeric date, this method returns a string date formatted according
451							to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
452							argument should be the Excel format string for that specific cell; it is used
453							only for for deciding if the numeric value should be presented as a date, as a time,
454							or both. Optionally, a custom date formatter callback could be passed as third argument.
455
456
457							=head1 DATE AND TIME FORMATS
458
459							=head2 Date and time handling
460
461							In Excel, date and times values are stored as numeric values, where the integer part
462							represents the date, and the fractional part represents the time. What distinguishes
463							such numbers from ordinary numbers is the I<numeric format> applied to the cells
464							where they appear.
465
466							Numeric formats in Excel are complex to reproduce, in particular
467							because they are locale-dependent; therefore the present module does not attempt
468							to faithfully interpret Excel formats. It just infers from formats which
469							cells should be presented as date and/or time values. All such values are then
470							presented through the same I<date_formatter> routine. The default formatter
471							is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
472							parameter (explained below).
473
474							=head2 Parameters for the default strftime formatter
475
476							When using the default strftime formatter, the following parameters may be passed
477							to the constructor :
478
479							=over
480
481							=item date_format
482
483							The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
484
485							=item time_format
486
487							The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
488
489							=item datetime_format
490
491							The L<POSIX/strftime> format for representing date and time together.
492							The default is the concatenation of C<date_format> and C<time_format>, with
493							a space inbetween.
494
495							=back
496
497
498							=head2 Writing a custom formatter
499
500							A custom algorithm for date formatting can be specified as a parameter to the constructor
501
502							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
503							date_formatter => sub {...});
504
505							If this parameter is C<undef>, date formatting is canceled and therefore date and
506							time values will be presented as plain numbers.
507
508							If not C<undef>, the date formatting routine will we called as :
509
510							$date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
511
512							where
513
514							=over
515
516							=item *
517
518							C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
519							C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
520							This is useful to decide if the value should be presented as a date, a time, or both.
521							The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
522							be presented as a date; if the format contains C<h> or C<s>, it should be presented
523							as a time. The letter C<m> is not taken into consideration because it is ambiguous :
524							depending on the position in the format string, it may represent either a "month" or a "minute".
525
526							=item *
527
528							C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
529							see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
530							C<year> component wil always be above this value.
531
532							=item *
533
534							C<month> is the numeric value of the month, starting at 1
535
536							=item *
537
538							C<day> is the numeric value of the day in month, starting at 1
539
540							=item *
541
542							C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
543							numeric values.
544
545							=back
546
547
548							=head1 CAVEATS
549
550							=over
551
552							=item *
553
554							This module was optimized for speed, not for completeness of
555							OOXML-SpreadsheetML support; so there may be some edge cases where the
556							output is incorrect with respect to the original Excel data.
557
558							=item *
559
560							Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
561							following the old Windows convention. When retrieved through the C<Regex>
562							backend, the result contains the original C<< \r >> and C<< \n >> characters;
563							but when retrieved through the LibXML, C<< \r >> are silently removed by the
564							C<XML::LibXML> package.
565
566							=back
567
568							=head1 SEE ALSO
569
570							The official reference for OOXML-SpreadsheetML format is in
571							L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
572
573							Introductory material on XLSX file structure can be found at
574							L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
575
576							The CPAN module L<Data::XLSX::Parser> is claimed to be in alpha stage;
577							it seems to be working but the documentation is insufficient -- I had
578							to inspect the test suite to understand how to use it.
579
580							Another unpublished but working module for parsing Excel files in Perl
581							can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
582							Some test cases were borrowed from that distribution.
583
584							Conversions from and to Excel internal date format can also be performed
585							through the L<DateTime::Format::Excel> module.
586
587							=head1 BENCHMARKS
588
589							Below are some benchmarks computed with the program C<benchmark.pl> in
590							this distribution. The task was to parse an Excel file of five worksheets
591							with about 62600 rows in total, and report the number of rows per sheet.
592							Reported figures are in seconds.
593
594							Excel::ValueReader::XLSX::Regex 11 elapsed, 10 cpu, 0 system
595							Excel::ValueReader::XLSX::LibXML 35 elapsed, 34 cpu, 0 system
596							[unpublished] Excel::Reader::XLSX 39 elapsed, 37 cpu, 0 system
597							Spreadsheet::ParseXLSX 244 elapsed, 240 cpu, 1 system
598							Data::XLSX::Parser 37 elapsed, 35 cpu, 0 system
599
600							These figures show that the regex version is about 3 times faster
601							than the LibXML version, and about 22 times faster than
602							L<Spreadsheet::ParseXLSX>. Tests with a bigger file of about 90000 rows
603							showed similar ratios.
604
605							Modules
606							C<Excel::Reader::XLSX> (unpublished) and L<Data::XLSX::Parser>
607							are based on L<XML::LibXML> like L<Excel::ValueReader::XLSX::Backend::LibXML>;
608							execution times for those three modules are very close.
609
610							=head1 ACKNOWLEDGEMENTS
611
612							=over
613
614							=item *
615
616							David Flink signaled (and fixed) a bug about strings with embedded newline characters
617
618							=back
619
620
621							=head1 AUTHOR
622
623							Laurent Dami, E<lt>dami at cpan.orgE<gt>
624
625							=head1 COPYRIGHT AND LICENSE
626
627							Copyright 2020-2022 by Laurent Dami.
628
629							This library is free software; you can redistribute it and/or modify
630							it under the same terms as Perl itself.