File Coverage

blib/lib/Excel/ValueReader/XLSX/Backend/Regex.pm

Criterion	Covered	Total	%
statement	93	96	96.8
branch	21	26	80.7
condition	25	28	89.2
subroutine	13	13	100.0
pod	1	1	100.0
total	153	164	93.2

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX::Backend::Regex;
2	1			1		709	use utf8;
	1					3
	1					8
3	1			1		36	use 5.10.1;
	1					4
4	1			1		10	use Moose;
	1					2
	1					13
5	1			1		8745	use Scalar::Util qw/looks_like_number/;
	1					3
	1					57
6	1			1		6	use Carp qw/croak/;
	1					2
	1					1942
7
8							extends 'Excel::ValueReader::XLSX::Backend';
9
10							our $VERSION = '1.10';
11
12							#======================================================================
13							# LAZY ATTRIBUTE CONSTRUCTORS
14							#======================================================================
15
16							sub _strings {
17	4			4		11	my $self = shift;
18	4					8	my @strings;
19
20							# read from the sharedStrings zip member
21	4					16	my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');
22
23							# iterate on <si> nodes
24	4					47	while ($contents =~ m[<si>(.*?)</si>]sg) {
25	259					578	my $innerXML = $1;
26
27							# concatenate contents from all <t> nodes (usually there is only 1) and decode XML entities
28	259					992	my $string = join "", ($innerXML =~ m[<t[^>]*>(.+?)</t>]sg);
29	259					665	_decode_xml_entities($string);
30
31	259					1032	push @strings, $string;
32							}
33
34	4					156	return \@strings;
35							}
36
37
38							sub _workbook_data {
39	7			7		17	my $self = shift;
40
41							# read from the workbook.xml zip member
42	7					27	my $workbook = $self->_zip_member_contents('xl/workbook.xml');
43
44							# extract sheet names
45	7					100	my @sheet_names = ($workbook =~ m[<sheet name="(.+?)"]g);
46	7					45	my %sheets = map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names;
	24					76
47
48							# does this workbook use the 1904 calendar ?
49	7					42	my ($date1904) = $workbook =~ m[date1904="(.+?)"];
50	7	100	100			39	my $base_year = $date1904 && $date1904 =~ /^(1\|true)$/ ? 1904 : 1900;
51
52	7					311	return {sheets => \%sheets, base_year => $base_year};
53							}
54
55
56
57							sub _date_styles {
58	5			5		11	my $self = shift;
59
60	5					14	state $date_style_regex = qr{[dy]\|\bmm\b};
61
62							# read from the styles.xml zip member
63	5					17	my $styles = $self->_zip_member_contents('xl/styles.xml');
64
65							# start with Excel builtin number formats for dates and times
66	5					26	my @numFmt = $self->Excel_builtin_date_formats;
67
68							# add other date formats explicitly specified in this workbook
69	5					48	while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
70	38					111	my ($id, $code) = ($1, $2);
71	38	100				274	$numFmt[$id] = $code if $code =~ $date_style_regex;
72							}
73
74							# read all cell formats, just rembember those that involve a date number format
75	5					98	my ($cellXfs) = ($styles =~ m[<cellXfs count="\d+">(.+?)</cellXfs>]);
76	5					18	my @cell_formats = $self->_extract_xf($cellXfs);
77	5					13	my @date_styles = map {$numFmt[$_->{numFmtId}]} @cell_formats;
	90					173
78
79	5					255	return \@date_styles; # array of shape (xf_index => numFmt_code)
80							}
81
82
83							sub _extract_xf {
84	5			5		17	my ($self, $xml) = @_;
85
86	5					10	state $xf_node_regex = qr{
87							<xf # initial format tag
88							\s
89							([^>/]*+) # attributes (captured in $1)
90							(?: # non-capturing group for an alternation :
91							/> # .. either an xml closing without content
92							\| # or
93							> # .. closing for the xf tag
94							.*? # .. then some formatting content
95							</xf> # .. then the ending tag for the xf node
96							)
97							}x;
98
99	5					11	my @xf_nodes;
100	5					38	while ($xml =~ /$xf_node_regex/g) {
101	90					211	my $all_attrs = $1;
102	90					138	my %attr;
103	90					311	while ($all_attrs =~ m[(\w+)="(.+?)"]g) {
104	608					2471	$attr{$1} = $2;
105							}
106	90					546	push @xf_nodes, \%attr;
107							}
108	5					23	return @xf_nodes;
109							}
110
111
112							#======================================================================
113							# METHODS
114							#======================================================================
115
116							sub values {
117	16			16	1	192	my ($self, $sheet) = @_;
118	16					31	my @data;
119	16					24	my ($cell_type, $seen_node);
120
121							# regexes for extracting information from cell nodes
122	16					53	state $row_regex = qr(
123							<(row) # row tag ($1)
124							(?:\s+r="(\d+)")? # optional row number ($2)
125							[^>/]*? # unused attrs
126							> # end of tag
127							)x;
128	16					26	state $cell_regex = qr(
129							<(c) # cell tag ($3)
130							(?: \s+ \| (?=>) ) # either a space before attrs, or end of tag
131							(?:r="([A-Z]+)(\d+)")? # capture col ($4) and row ($5)
132							[^>/]*? # unused attrs
133							(?:s="(\d+)"\s*)? # style attribute ($6)
134							(?:t="(\w+)"\s*)? # type attribute ($7)
135							(?: # non-capturing group for an alternation :
136							/> # .. either an xml closing without content
137							\| # or
138							> # .. closing xml tag, followed by
139							(?:
140
141							<v>(.+?)</v> # .. a value ($8)
142							\| # or
143							(.+?) # .. some node content ($9)
144							)
145							</c> # followed by a closing cell tag
146							)
147							)x;
148	16					132	state $row_or_cell_regex = qr($row_regex\|$cell_regex);
149							# NOTE : these regexes uses positional capturing groups; it would be more readable with named
150							# captures instead, but it doubles the execution time on big Excel files, so I
151							# stick to plain old capturing groups.
152
153							# does this instance want date formatting ?
154	16					551	my $has_date_formatter = $self->frontend->date_formatter;
155
156							# parse worksheet XML, gathering all cells
157	16					64	my $contents = $self->_zip_member_contents($self->_zip_member_name_for_sheet($sheet));
158
159							# loop on matching nodes
160	16					42	my ($row, $col) = (0, 0);
161	16					306	while ($contents =~ /$row_or_cell_regex/g) {
162	1935	100				5560	if ($1) { # this is a 'row' tag
		50
163	730		66			1605	$row = $2 // $row+1;
164	730					4535	$col = 0;
165							}
166							elsif ($3) { # this is a 'c' tag
167	1205					3725	my ($col_A1, $given_row, $style, $cell_type, $val, $inner) = ($4, $5, $6, $7, $8, $9);
168
169							# row and column for this cell -- either given, or incremented from last cell
170	1205	100	66			5118	($col, $row) = $col_A1 && $given_row ? ($self->A1_to_num($col_A1), $given_row)
171							: ($col+1, $row);
172
173							# handle cell value according to cell type
174	1205		100			3604	$cell_type //= '';
175	1205	50				2608	if ($cell_type eq 'inlineStr') {
		100
176							# this is an inline string; gather all <t> nodes within the cell node
177	0					0	$val = join "", ($inner =~ m[<t>(.+?)</t>]g);
178	0	0				0	_decode_xml_entities($val) if $val;
179							}
180							elsif ($cell_type eq 's') {
181							# this is a string cell; $val is a pointer into the global array of shared strings
182	653					19973	$val = $self->strings->[$val];
183							}
184							else {
185							# this is a plain value
186	552	100	100			1312	($val) = ($inner =~ m[<v>(.*?)</v>]) if !defined $val && $inner;
187	552	100	100			1423	_decode_xml_entities($val) if $val && $cell_type eq 'str';
188
189							# if necessary, transform the numeric value into a formatted date
190	552	100	100			2488	if ($has_date_formatter && $style && looks_like_number($val) && $val >= 0) {
			100
			66
191	243					7807	my $date_style = $self->date_styles->[$style];
192	243	100				634	$val = $self->formatted_date($val, $date_style) if $date_style;
193							}
194							}
195
196							# insert this value into the global data array
197	1205					11254	$data[$row-1][$col-1] = $val;
198							}
199	0					0	else {die "unexpected regex match"}
200							}
201
202							# insert empty arrayrefs for empty rows
203	16		100			192	$_ //= [] foreach @data;
204
205	16					84	return \@data;
206							}
207
208
209							sub _table_targets {
210	5			5		12	my ($self, $rel_xml) = @_;
211
212	5					38	my @table_targets = $rel_xml =~ m[<Relationship .*? Target="../tables/table(\d+)\.xml"]g;
213	5					18	return @table_targets; # a list of positive integers corresponding to table ids
214							}
215
216
217							sub _parse_table_xml {
218	5			5		13	my ($self, $xml) = @_;
219
220	5					14	state $table_regex = qr{
221							<table .+? displayName="(\w+)"
222							.+? ref="([:A-Z0-9]+)"
223							.+? (headerRowCount="0")?
224							.+?>
225							}x;
226
227							# extract relevant attributes from the <table> node
228	5	50				73	my ($name, $ref, $no_headers) = $xml =~ /$table_regex/g
229							or croak "invalid table XML";
230
231							# column names. Other attributes from <tableColumn> nodes are ignored.
232	5					50	my @columns = ($xml =~ m{<tableColumn [^>]+? name="([^"]+)"}gx);
233
234							# decode entites for all string values
235	5					17	_decode_xml_entities($_) for $name, @columns;
236
237	5					38	return ($name, $ref, \@columns, $no_headers);
238							}
239
240
241							#======================================================================
242							# AUXILIARY FUNCTIONS
243							#======================================================================
244
245
246							sub _decode_xml_entities {
247	282			282		404	state $xml_entities = { amp => '&',
248							lt => '<',
249							gt => '>',
250							quot => '"',
251							apos => "'",
252							};
253	282					377	state $entity_names = join '\|', keys %$xml_entities;
254	282					428	state $regex_entities = qr/&($entity_names);/;
255
256							# substitute in-place
257	282					844	$_[0] =~ s/$regex_entities/$xml_entities->{$1}/eg;
	14					66
258							}
259
260
261							1;
262
263							__END__
264
265							=head1 NAME
266
267							Excel::ValueReader::XLSX::Backend::Regex - using regexes for extracting values from Excel workbooks
268
269							=head1 DESCRIPTION
270
271							This is one of two backend modules for L<Excel::ValueReader::XLSX>; the other
272							possible backend is L<Excel::ValueReader::XLSX::Backend::LibXML>.
273
274							This backend parses OOXML structures using regular expressions.
275
276							=head1 AUTHOR
277
278							Laurent Dami, E<lt>dami at cpan.orgE<gt>
279
280							=head1 COPYRIGHT AND LICENSE
281
282							Copyright 2020-2023 by Laurent Dami.
283
284							This library is free software; you can redistribute it and/or modify
285							it under the same terms as Perl itself.