File Coverage

blib/lib/Excel/ValueReader/XLSX.pm

Criterion	Covered	Total	%
statement	108	111	97.3
branch	28	40	70.0
condition	6	11	54.5
subroutine	17	17	100.0
pod	3	5	60.0
total	162	184	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX;
2	1			1		156168	use utf8;
	1					2
	1					6
3	1			1		677	use Moose;
	1					476794
	1					8
4	1			1		9062	use Module::Load qw/load/;
	1					2
	1					11
5	1			1		754	use Date::Calc qw/Add_Delta_Days/;
	1					7125
	1					115
6	1			1		7	use POSIX qw/strftime modf/;
	1					2
	1					15
7	1			1		1758	use Carp qw/croak/;
	1					2
	1					51
8	1			1		6	use feature 'state';
	1					2
	1					1933
9
10							our $VERSION = '1.10';
11
12							#======================================================================
13							# ATTRIBUTES
14							#======================================================================
15
16							# PUBLIC ATTRIBUTES
17							has 'xlsx' => (is => 'ro', isa => 'Str', required => 1); # path of xlsx file
18							has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
19							has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
20							has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
21							has 'datetime_format' => (is => 'ro', isa => 'Str',
22							builder => '_datetime_format', lazy => 1);
23							has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]',
24							builder => '_date_formatter', lazy => 1);
25
26
27
28							# ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
29							has 'backend' => (is => 'ro', isa => 'Object', init_arg => undef,
30							builder => '_backend', lazy => 1,
31							handles => [qw/values base_year sheets/]);
32
33							#======================================================================
34							# BUILDING
35							#======================================================================
36
37							# syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
38							around BUILDARGS => sub {
39							my $orig = shift;
40							my $class = shift;
41
42							if ( @_ == 1 && !ref $_[0] ) {
43							return $class->$orig(xlsx => $_[0]);
44							}
45							else {
46							return $class->$orig(@_);
47							}
48							};
49
50
51
52							#======================================================================
53							# ATTRIBUTE CONSTRUCTORS
54							#======================================================================
55
56
57							sub _backend {
58	14			14		34	my $self = shift;
59
60	14					439	my $backend_class = ref($self) . '::Backend::' . $self->using;
61	14					71	load $backend_class;
62
63	14					1112	return $backend_class->new(frontend => $self);
64							}
65
66							sub _datetime_format {
67	12			12		27	my ($self) = @_;
68	12					339	return $self->date_format . ' ' . $self->time_format;
69							}
70
71							sub _date_formatter {
72	12			12		31	my ($self) = @_;
73
74							# local copies of the various formats so that we can build a closure
75	12					360	my @formats = (undef, # 0 -- error
76							$self->date_format, # 1 -- just a date
77							$self->time_format, # 2 -- just a time
78							$self->datetime_format); # 3 -- date and time
79
80							my $strftime_formatter = sub {
81	156			156		353	my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
82
83							# choose the proper format for strftime
84	156					210	my $ix = 0; # index into the @formats array
85	156	100				466	$ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
86	156	100				349	$ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
87	156	50				331	my $strftime_format = $formats[$ix]
88							or die "cell with unexpected Excel date format : $xl_date_format";
89
90							# formatting through strftime
91	156					5036	my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
92
93	156					630	return $formatted_date;
94	12					74	};
95
96	12					363	return $strftime_formatter;
97							}
98
99
100							#======================================================================
101							# GENERAL METHODS
102							#======================================================================
103
104
105							sub sheet_names {
106	2			2	1	4529	my ($self) = @_;
107
108	2					19	my $sheets = $self->sheets; # arrayref of shape {$name => $sheet_position}
109
110	2					18	my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
	18					61
111	2					12	return @sorted_names;
112							}
113
114
115							sub A1_to_num { # convert Excel A1 reference format to a number
116	2406			2406	0	22030	my ($self, $string) = @_;
117
118							# ordinal number for character just before 'A'
119	2406					3503	state $base = ord('A') - 1;
120
121							# iterate on 'digits' (letters of the A1 cell reference)
122	2406					3258	my $num = 0;
123	2406					5368	foreach my $digit (map {ord($_) - $base} split //, $string) {
	2406					6387
124	2406					4373	$num = $num*26 + $digit;
125							}
126
127	2406					6793	return $num;
128							}
129
130
131							sub formatted_date {
132	156			156	0	8233	my ($self, $val, $date_format, $date_formatter) = @_;
133
134							# separate date (integer part) from time (fractional part)
135	156					524	my ($time, $n_days) = modf($val);
136
137							# Convert $n_days into a date in Date::Calc format (year, month, day).
138							# The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
139							# in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
140							# Excel treats 1900 as a leap year.
141	156					512	my $base_year = $self->base_year;
142	156	100				402	if ($base_year == 1900) {
143	114					226	my $is_after_february_1900 = $n_days > 60;
144	114	100				245	$n_days -= $is_after_february_1900 ? 2 : 1;
145							}
146	156					577	my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
147
148							# decode the fractional part (the time) into hours, minutes, seconds, milliseconds
149	156					240	my @t;
150	156					289	foreach my $subdivision (24, 60, 60, 1000) {
151	624					817	$time *= $subdivision;
152	624					1179	($time, my $time_portion) = modf($time);
153	624					1051	push @t, $time_portion;
154							}
155
156							# dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
157	156					325	my ($h, $m, $s, $ms) = @t;
158	156	100				319	if ($ms == 999) {
159	2					17	$s += 1, $ms = 0;
160	2	50				67	if ($s == 60) {
161	0					0	$m += 1, $s = 0;
162	0	0				0	if ($m == 60) {
163	0					0	$h += 1, $m = 0;
164							}
165							}
166							}
167							# NOTE : because of this hack, theoretically we could end up with a value
168							# like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
169							# in its rendering.
170
171							# call the date_formatter subroutine
172	156	50	33			4891	$date_formatter //= $self->date_formatter
173							or die ref($self) . " has no date_formatter subroutine";
174	156					366	my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
175
176	156					648	return $formatted_date;
177							}
178
179							#======================================================================
180							# METHODS FOR PARSING EXCEL TABLES
181							#======================================================================
182
183
184							sub table_names {
185	2			2	1	5625	my ($self) = @_;
186
187	2					82	my $table_info = $self->backend->table_info;
188
189							# sort on table id (field [1] in table_info arrayrefs)
190	2					19	my @table_names = sort {$table_info->{$a}[1] <=> $table_info->{$b}->[1]} keys %$table_info;
	13					35
191
192	2					19	return @table_names;
193							}
194
195
196							# info fields returned from the backend parsing methods
197							my @table_info_fields = qw/sheet table_id ref columns no_headers/;
198
199
200							# the same fields are also the valid args for the method call
201							my $is_valid_arg = "^(" . join("\|", @table_info_fields) . ")\$";
202
203							sub table {
204	10			10	1	7119	my $self = shift;
205
206							# syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
207	10	50				76	my %args = @_ == 1 ? (name => $_[0]) : @_;
208
209							# if called with a table name, derive all other args from internal workbook info
210	10	50				38	if (my $table_name = delete $args{name}) {
211	10		50			72	!$args{$_} or croak "table() : arg '$_' is incompatible with 'name'" for @table_info_fields;
212	10	50				18	@args{@table_info_fields} = @{$self->backend->table_info->{$table_name}}
	10					387
213							or croak "no table info for table: $table_name";
214							}
215
216							# check args
217	10					38	my @invalid_args = grep {!/$is_valid_arg/} keys %args;
	50					263
218	10	50				32	croak "invalid args to table(): " . join ", ", @invalid_args if @invalid_args;
219
220							# get raw values from the sheet
221	10					46	my $values = $self->values($args{sheet});
222
223							# restrict values to the table subrange (if applicable)
224	10	50				194	$values = $self->_subrange($values, $args{ref}) if $args{ref};
225
226							# take headers from first row if not already given in $args{columns}
227	10		33			26	$args{columns} //= $values->[0];
228
229							# if this table has headers (which is almost always the case), drop the header row
230	10	100				25	shift @$values unless $args{no_headers};
231
232							# build a table of hashes. This could be done with a simple map(), but using a loop
233							# avoids to store 2 copies of cell values in memory : @$values is shifted when @table is pushed.
234	10					19	my @cols = @{$args{columns}};
	10					32
235	10	50				20	croak "table contains undefined columns" if grep {!defined $_} @cols;
	32					70
236	10					21	my @rows;
237	10					29	while (my $vals = shift @$values) {
238	150					211	my %row;
239	150					517	@row{@cols} = @$vals;
240	150					429	push @rows, \%row;
241							}
242
243							# in scalar context, just return the rows. In list context, also return the column names
244	10	100				83	return wantarray ? (\@cols, \@rows) : \@rows;
245							}
246
247
248
249							sub _subrange {
250	10			10		26	my ($self, $values, $ref) = @_;
251
252							# parse rows and columns from the $ref string (of shape like for example "A1:D34")
253	10	50				91	my ($col1, $row1, $col2, $row2) = $ref =~ /^([A-Z]+)(\d+):([A-Z]+)(\d+)$/
254							or croak "_subrange : invalid ref: $ref";
255
256							# restrict to the row range
257	10	100	100			46	if ($row1 > 1 \|\| $row2 < @$values){
258	8					31	$values = [ @$values[$row1-1 .. $row2-1] ];
259							}
260
261							# restrict to the column range
262	10					33	my @col_nums = map {$self->A1_to_num($_) - 1} ($col1, $col2);
	20					41
263	10	100				36	if ($col_nums[0] > 1){ # THINK : should check if $colnum2 is smaller that the max row size ??
264	6					17	my @col_range = ($col_nums[0] .. $col_nums[1]);
265	6					11	$values = [map { [ @$_[@col_range] ]} @$values];
	16					49
266							}
267
268	10					56	return $values;
269							}
270
271
272							1;
273
274
275							__END__
276
277							=head1 NAME
278
279
280							Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
281
282							=head1 SYNOPSIS
283
284							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename);
285							# .. or with syntactic sugar :
286							my $reader = Excel::ValueReader::XLSX->new($filename);
287							# .. or with LibXML backend :
288							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
289							using => 'LibXML');
290
291							foreach my $sheet_name ($reader->sheet_names) {
292							my $grid = $reader->values($sheet_name);
293							my $n_rows = @$grid;
294							print "sheet $sheet_name has $n_rows rows; ",
295							"first cell contains : ", $grid->[0][0];
296							}
297
298							foreach my $table_name ($reader->table_names) {
299							my ($columns, $rows) = $reader->table($table_name);
300							my $n_data_rows = @$rows;
301							my $n_columns = @$columns;
302							print "table $table_name has $n_data_rows rows and $n_columns columns; ",
303							"column 'foo' in first row contains : ", $rows->[0]{foo};
304							}
305
306							=head1 DESCRIPTION
307
308							This module reads the contents of an Excel file in XLSX format.
309							Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Spreadsheet::XLSX>,
310							there is no support for reading formulas, formats or other Excel internal
311							information; all you get are plain values -- but you get them much
312							faster ! Besides, this module also has support for parsing Excel tables.
313
314							This front module has two different backends for extracting values :
315
316							=over
317
318							=item Regex (default)
319
320							this backend uses regular expressions to parse the XML content.
321
322							=item LibXML
323
324							this backend uses L<XML::LibXML::Reader> to parse the XML content.
325							It is probably safer but about three times slower than the Regex backend
326							(but still much faster than L<Spreadsheet::ParseXLSX>).
327
328							=back
329
330
331							=head1 METHODS
332
333							=head2 new
334
335							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
336							using => $backend,
337							%date_formatting_options);
338
339							The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed.
340							The C<using> argument is optional; it specifies the backend to be used for parsing;
341							default is 'Regex'.
342
343							As syntactic sugar, a shorter form is admitted :
344
345							my $reader = Excel::ValueReader::XLSX->new($filename);
346
347							Optional parameters for formatting date and time values
348							are described in the L</DATE AND TIME FORMATS> section below.
349
350
351							=head2 sheet_names
352
353							my @sheets = $reader->sheet_names;
354
355							Returns the list of worksheet names, in the same order as in the Excel file.
356
357							=head2 values
358
359							my $grid = $reader->values($sheet);
360
361							Returns a bidimensional array of scalars, corresponding to cell
362							values in the specified worksheet. The C<$sheet> argument can be either
363							a sheet name or a sheet position (starting at 1).
364
365							Unlike the original Excel cells, positions in the grid are zero-based,
366							so for example the content of cell B3 is in C<< $grid->[1][2] >>.
367							The grid is sparse : the size of each row depends on the
368							position of the last non-empty cell in that row.
369							Thanks to Perl's auto-vivification mechanism, any attempt to access
370							a non-existent cell will automatically create the corresponding cell
371							within the grid. The number of rows and columns in the grid can be computed
372							like this :
373
374							my $nb_rows = @$grid;
375							my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
376
377							=head2 table_names
378
379							my @table_names = $reader->table_names;
380
381							Returns the list of names of tables registered in this workbook.
382
383
384							=head2 table
385
386							my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
387							# or
388							my ($columns, $rows) = $reader->table(name => $table_name);
389							# or
390							my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $range]
391							[, columns => \@columns]
392							[, no_headers => 1]
393							);
394
395							In its simplest form, this method returns the content of an Excel table referenced by its table name
396							(in Excel, the table name appears and can be modified through the ribbon tab
397							entry "Table tools / Design").
398							The table name is passed either through the named argument C<name>, or positionally as unique argument
399							to the method.
400
401							Rows are returned as hashrefs, where keys of the hashes correspond to column names
402							in the table. In scalar context, the method just returns an arrayref to the list of rows. In list
403							context, the method returns a pair, where the first element is an arrayref of column names, and the
404							second element is an arrayref to the list of rows.
405
406							Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
407							By default, this considers the whole sheet content as a single table, where column names
408							are on the first row. However, additional arguments can be supplied to change the default
409							behaviour :
410
411							=over
412
413							=item ref
414
415							a specific range of cells within the sheet that contain the table rows and columns.
416							The range must be expressed using traditional Excel notation,
417							like for example C<"C9:E23"> (colums 3 to 5, rows 9 to 23).
418
419							=item columns
420
421							an arrayref containing the list of column names.
422							If absent, column names will be taken from the first row in the table.
423
424							=item no_headers
425
426							if true, the first row in the table will be treated as a regular data row, instead
427							of being treated as a list of column names. In that case, since column names cannot
428							be inferred from cell values in the first row, the C<columns> argument to the method
429							must be present.
430
431							=back
432
433
434							=head1 AUXILIARY METHODS
435
436							=head1 A1_to_num
437
438							my $col_num = $reader->A1_to_num('A'); # 1
439							$col_num = $reader->A1_to_num('AZ'); # 52
440							$col_num = $reader->A1_to_num('AA'); # 26
441							$col_num = $reader->A1_to_num('ABC'); # 731
442
443							Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
444							into the corresponding numeric value.
445
446
447							=head1 formatted_date
448
449							my $date = $reader->formatted_date($numeric_date, $excel_date_format);
450
451							Given a numeric date, this method returns a string date formatted according
452							to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
453							argument should be the Excel format string for that specific cell; it is used
454							only for for deciding if the numeric value should be presented as a date, as a time,
455							or both. Optionally, a custom date formatter callback could be passed as third argument.
456
457
458							=head1 DATE AND TIME FORMATS
459
460							=head2 Date and time handling
461
462							In Excel, date and times values are stored as numeric values, where the integer part
463							represents the date, and the fractional part represents the time. What distinguishes
464							such numbers from ordinary numbers is the I<numeric format> applied to the cells
465							where they appear.
466
467							Numeric formats in Excel are complex to reproduce, in particular
468							because they are locale-dependent; therefore the present module does not attempt
469							to faithfully interpret Excel formats. It just infers from formats which
470							cells should be presented as date and/or time values. All such values are then
471							presented through the same I<date_formatter> routine. The default formatter
472							is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
473							parameter (explained below).
474
475							=head2 Parameters for the default strftime formatter
476
477							When using the default strftime formatter, the following parameters may be passed
478							to the constructor :
479
480							=over
481
482							=item date_format
483
484							The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
485
486							=item time_format
487
488							The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
489
490							=item datetime_format
491
492							The L<POSIX/strftime> format for representing date and time together.
493							The default is the concatenation of C<date_format> and C<time_format>, with
494							a space inbetween.
495
496							=back
497
498
499							=head2 Writing a custom formatter
500
501							A custom algorithm for date formatting can be specified as a parameter to the constructor
502
503							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
504							date_formatter => sub {...});
505
506							If this parameter is C<undef>, date formatting is canceled and therefore date and
507							time values will be presented as plain numbers.
508
509							If not C<undef>, the date formatting routine will we called as :
510
511							$date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
512
513							where
514
515							=over
516
517							=item *
518
519							C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
520							C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
521							This is useful to decide if the value should be presented as a date, a time, or both.
522							The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
523							be presented as a date; if the format contains C<h> or C<s>, it should be presented
524							as a time. The letter C<m> is not taken into consideration because it is ambiguous :
525							depending on the position in the format string, it may represent either a "month" or a "minute".
526
527							=item *
528
529							C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
530							see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
531							C<year> component wil always be above this value.
532
533							=item *
534
535							C<month> is the numeric value of the month, starting at 1
536
537							=item *
538
539							C<day> is the numeric value of the day in month, starting at 1
540
541							=item *
542
543							C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
544							numeric values.
545
546							=back
547
548
549							=head1 CAVEATS
550
551							=over
552
553							=item *
554
555							This module was optimized for speed, not for completeness of
556							OOXML-SpreadsheetML support; so there may be some edge cases where the
557							output is incorrect with respect to the original Excel data.
558
559							=item *
560
561							Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
562							following the old Windows convention. When retrieved through the C<Regex>
563							backend, the result contains the original C<< \r >> and C<< \n >> characters;
564							but when retrieved through the LibXML, C<< \r >> are silently removed by the
565							C<XML::LibXML> package.
566
567							=back
568
569							=head1 SEE ALSO
570
571							The official reference for OOXML-SpreadsheetML format is in
572							L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
573
574							Introductory material on XLSX file structure can be found at
575							L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
576
577							The CPAN module L<Data::XLSX::Parser> is claimed to be in alpha stage;
578							it seems to be working but the documentation is insufficient -- I had
579							to inspect the test suite to understand how to use it.
580
581							Another unpublished but working module for parsing Excel files in Perl
582							can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
583							Some test cases were borrowed from that distribution.
584
585							Conversions from and to Excel internal date format can also be performed
586							through the L<DateTime::Format::Excel> module.
587
588							=head1 BENCHMARKS
589
590							Below are some benchmarks computed with the program C<benchmark.pl> in
591							this distribution. The task was to parse an Excel file of five worksheets
592							with about 62600 rows in total, and report the number of rows per sheet.
593							Reported figures are in seconds.
594
595							Excel::ValueReader::XLSX::Regex 11 elapsed, 10 cpu, 0 system
596							Excel::ValueReader::XLSX::LibXML 35 elapsed, 34 cpu, 0 system
597							[unpublished] Excel::Reader::XLSX 39 elapsed, 37 cpu, 0 system
598							Spreadsheet::ParseXLSX 244 elapsed, 240 cpu, 1 system
599							Data::XLSX::Parser 37 elapsed, 35 cpu, 0 system
600
601							These figures show that the regex version is about 3 times faster
602							than the LibXML version, and about 22 times faster than
603							L<Spreadsheet::ParseXLSX>. Tests with a bigger file of about 90000 rows
604							showed similar ratios.
605
606							Modules
607							C<Excel::Reader::XLSX> (unpublished) and L<Data::XLSX::Parser>
608							are based on L<XML::LibXML> like L<Excel::ValueReader::XLSX::Backend::LibXML>;
609							execution times for those three modules are very close.
610
611							=head1 ACKNOWLEDGEMENTS
612
613							=over
614
615							=item *
616
617							David Flink signaled (and fixed) a bug about strings with embedded newline characters, and
618							signaled that the 'r' attribute in cells is optional.
619
620							=item *
621
622							Ulibuck signaled bugs several minor bugs on the LibXML backend
623
624							=back
625
626
627							=head1 AUTHOR
628
629							Laurent Dami, E<lt>dami at cpan.orgE<gt>
630
631							=head1 COPYRIGHT AND LICENSE
632
633							Copyright 2020-2023 by Laurent Dami.
634
635							This library is free software; you can redistribute it and/or modify
636							it under the same terms as Perl itself.