File Coverage

blib/lib/Excel/ValueReader/XLSX/Backend/Regex.pm

Criterion	Covered	Total	%
statement	86	88	97.7
branch	16	20	80.0
condition	21	22	95.4
subroutine	13	13	100.0
pod	1	1	100.0
total	137	144	95.1

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX::Backend::Regex;
2	1			1		637	use utf8;
	1					2
	1					5
3	1			1		30	use 5.10.1;
	1					3
4	1			1		6	use Moose;
	1					2
	1					13
5	1			1		6365	use Scalar::Util qw/looks_like_number/;
	1					11
	1					71
6	1			1		6	use Carp qw/croak/;
	1					2
	1					1514
7
8							extends 'Excel::ValueReader::XLSX::Backend';
9
10							our $VERSION = '1.08';
11
12							#======================================================================
13							# LAZY ATTRIBUTE CONSTRUCTORS
14							#======================================================================
15
16							sub _strings {
17	2			2		4	my $self = shift;
18	2					5	my @strings;
19
20							# read from the sharedStrings zip member
21	2					6	my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');
22
23							# iterate on <si> nodes
24	2					34	while ($contents =~ m[<si>(.*?)</si>]sg) {
25	240					427	my $innerXML = $1;
26
27							# concatenate contents from all <t> nodes (usually there is only 1) and decode XML entities
28	240					732	my $string = join "", ($innerXML =~ m[<t[^>]*>(.+?)</t>]sg);
29	240					508	_decode_xml_entities($string);
30
31	240					818	push @strings, $string;
32							}
33
34	2					69	return \@strings;
35							}
36
37
38							sub _workbook_data {
39	5			5		10	my $self = shift;
40
41							# read from the workbook.xml zip member
42	5					15	my $workbook = $self->_zip_member_contents('xl/workbook.xml');
43
44							# extract sheet names
45	5					67	my @sheet_names = ($workbook =~ m[<sheet name="(.+?)"]g);
46	5					18	my %sheets = map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names;
	21					54
47
48							# does this workbook use the 1904 calendar ?
49	5					33	my ($date1904) = $workbook =~ m[date1904="(.+?)"];
50	5	100	100			29	my $base_year = $date1904 && $date1904 =~ /^(1\|true)$/ ? 1904 : 1900;
51
52	5					184	return {sheets => \%sheets, base_year => $base_year};
53							}
54
55
56
57							sub _date_styles {
58	4			4		7	my $self = shift;
59
60	4					10	state $date_style_regex = qr{[dy]\|\bmm\b};
61
62							# read from the styles.xml zip member
63	4					24	my $styles = $self->_zip_member_contents('xl/styles.xml');
64
65							# start with Excel builtin number formats for dates and times
66	4					25	my @numFmt = $self->Excel_builtin_date_formats;
67
68							# add other date formats explicitly specified in this workbook
69	4					39	while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
70	38					92	my ($id, $code) = ($1, $2);
71	38	100				241	$numFmt[$id] = $code if $code =~ $date_style_regex;
72							}
73
74							# read all cell formats, just rembember those that involve a date number format
75	4					71	my ($cellXfs) = ($styles =~ m[<cellXfs count="\d+">(.+?)</cellXfs>]);
76	4					16	my @cell_formats = $self->_extract_xf($cellXfs);
77	4					11	my @date_styles = map {$numFmt[$_->{numFmtId}]} @cell_formats;
	87					147
78
79	4					202	return \@date_styles; # array of shape (xf_index => numFmt_code)
80							}
81
82
83							sub _extract_xf {
84	4			4		12	my ($self, $xml) = @_;
85
86	4					9	state $xf_node_regex = qr{
87							<xf # initial format tag
88							\s
89							([^>/]*+) # attributes (captured in $1)
90							(?: # non-capturing group for an alternation :
91							/> # .. either an xml closing without content
92							\| # or
93							> # .. closing for the xf tag
94							.*? # .. then some formatting content
95							</xf> # .. then the ending tag for the xf node
96							)
97							}x;
98
99	4					7	my @xf_nodes;
100	4					27	while ($xml =~ /$xf_node_regex/g) {
101	87					179	my $all_attrs = $1;
102	87					97	my %attr;
103	87					243	while ($all_attrs =~ m[(\w+)="(.+?)"]g) {
104	591					2010	$attr{$1} = $2;
105							}
106	87					473	push @xf_nodes, \%attr;
107							}
108	4					36	return @xf_nodes;
109							}
110
111
112							#======================================================================
113							# METHODS
114							#======================================================================
115
116							sub values {
117	14			14	1	145	my ($self, $sheet) = @_;
118	14					24	my @data;
119	14					20	my ($row, $col, $cell_type, $seen_node);
120
121							# regex for extracting information from cell nodes
122	14					33	state $cell_regex = qr(
123							<c\ # initial cell tag
124							r="([A-Z]+)(\d+)" # capture col and row ($1 and $2)
125							[^>/]*? # unused attrs
126							(?:s="(\d+)"\s*)? # style attribute ($3)
127							(?:t="(\w+)"\s*)? # type attribute ($4)
128							(?: # non-capturing group for an alternation :
129							/> # .. either an xml closing without content
130							\| # or
131							> # .. closing xml tag, followed by
132							(?:
133
134							<v>(.+?)</v> # .. a value ($5)
135							\| # or
136							(.+?) # .. some node content ($6)
137							)
138							</c> # followed by a closing cell tag
139							)
140							)x;
141							# NOTE : this regex uses capturing groups; it would be more readable with named
142							# captures instead, but this doubles the execution time on big Excel files, so I
143							# came back to plain old capturing groups.
144
145							# does this instance want date formatting ?
146	14					372	my $has_date_formatter = $self->frontend->date_formatter;
147
148							# parse worksheet XML, gathering all cells
149	14					59	my $contents = $self->_zip_member_contents($self->_zip_member_name_for_sheet($sheet));
150	14					208	while ($contents =~ /$cell_regex/g) {
151	1054					2791	my ($col, $row, $style, $cell_type, $val, $inner) = ($self->A1_to_num($1), $2, $3, $4, $5, $6);
152
153							# handle cell value according to cell type
154	1054		100			2725	$cell_type //= '';
155	1054	50				1941	if ($cell_type eq 'inlineStr') {
		100
156							# this is an inline string; gather all <t> nodes within the cell node
157	0					0	$val = join "", ($inner =~ m[<t>(.+?)</t>]g);
158	0	0				0	_decode_xml_entities($val) if $val;
159							}
160							elsif ($cell_type eq 's') {
161							# this is a string cell; $val is a pointer into the global array of shared strings
162	535					13597	$val = $self->strings->[$val];
163							}
164							else {
165							# this is a plain value
166	519	100	100			1030	($val) = ($inner =~ m[<v>(.*?)</v>]) if !defined $val && $inner;
167	519	100	100			1120	_decode_xml_entities($val) if $val && $cell_type eq 'str';
168
169							# if necessary, transform the numeric value into a formatted date
170	519	100	100			1966	if ($has_date_formatter && $style && looks_like_number($val) && $val >= 0) {
			100
			66
171	238					6357	my $date_style = $self->date_styles->[$style];
172	238	100				589	$val = $self->formatted_date($val, $date_style) if $date_style;
173							}
174							}
175
176							# insert this value into the global data array
177	1054					8105	$data[$row-1][$col-1] = $val;
178							}
179
180							# insert arrayrefs for empty rows
181	14		100			141	$_ //= [] foreach @data;
182
183	14					61	return \@data;
184							}
185
186
187							sub _table_targets {
188	5			5		8	my ($self, $rel_xml) = @_;
189
190	5					30	my @table_targets = $rel_xml =~ m[<Relationship .*? Target="../tables/table(\d+)\.xml"]g;
191	5					15	return @table_targets; # a list of positive integers corresponding to table ids
192							}
193
194
195							sub _parse_table_xml {
196	5			5		11	my ($self, $xml) = @_;
197
198	5					10	state $table_regex = qr{
199							<table .+? displayName="(\w+)"
200							.+? ref="([:A-Z0-9]+)"
201							.+? (headerRowCount="0")?
202							.+?>
203							}x;
204
205							# extract relevant attributes from the <table> node
206	5	50				70	my ($name, $ref, $no_headers) = $xml =~ /$table_regex/g
207							or croak "invalid table XML";
208
209							# column names. Other attributes from <tableColumn> nodes are ignored.
210	5					42	my @columns = ($xml =~ m{<tableColumn [^>]+? name="([^"]+)"}gx);
211
212							# decode entites for all string values
213	5					16	_decode_xml_entities($_) for $name, @columns;
214
215	5					21	return ($name, $ref, \@columns, $no_headers);
216							}
217
218
219							#======================================================================
220							# AUXILIARY FUNCTIONS
221							#======================================================================
222
223
224							sub _decode_xml_entities {
225	263			263		331	state $xml_entities = { amp => '&',
226							lt => '<',
227							gt => '>',
228							quot => '"',
229							apos => "'",
230							};
231	263					285	state $entity_names = join '\|', keys %$xml_entities;
232	263					318	state $regex_entities = qr/&($entity_names);/;
233
234							# substitute in-place
235	263					692	$_[0] =~ s/$regex_entities/$xml_entities->{$1}/eg;
	14					49
236							}
237
238
239							1;
240
241							__END__
242
243							=head1 NAME
244
245							Excel::ValueReader::XLSX::Backend::Regex - using regexes for extracting values from Excel workbooks
246
247							=head1 DESCRIPTION
248
249							This is one of two backend modules for L<Excel::ValueReader::XLSX>; the other
250							possible backend is L<Excel::ValueReader::XLSX::Backend::LibXML>.
251
252							This backend parses OOXML structures using regular expressions.
253
254							=head1 AUTHOR
255
256							Laurent Dami, E<lt>dami at cpan.orgE<gt>
257
258							=head1 COPYRIGHT AND LICENSE
259
260							Copyright 2020,2021 by Laurent Dami.
261
262							This library is free software; you can redistribute it and/or modify
263							it under the same terms as Perl itself.