File Coverage

blib/lib/Spreadsheet/ReadGnumeric.pm
Criterion Covered Total %
statement 221 231 95.6
branch 77 94 81.9
condition 31 35 88.5
subroutine 38 39 97.4
pod 5 5 100.0
total 372 404 92.0


line stmt bran cond sub pod time code
1             # -*- mode: perl; -*-
2             #
3             # Spreadsheet reader for Gnumeric format that returns a
4             # Spreadsheet::Read-compatible data structure.
5             #
6             # Documentation below "__END__".
7             #
8             # [created. -- rgr, 26-Dec-22.]
9             #
10              
11             package Spreadsheet::ReadGnumeric;
12              
13 3     3   177681 use 5.010;
  3         25  
14              
15 3     3   48 use strict;
  3         5  
  3         78  
16 3     3   13 use warnings;
  3         5  
  3         136  
17              
18 3     3   1384 use XML::Parser::Lite;
  3         10741  
  3         96  
19 3     3   1128 use Spreadsheet::Gnumeric::StyleRegion;
  3         8  
  3         100  
20              
21             our $VERSION = '0.3';
22              
23 3     3   16 use parent qw(Spreadsheet::Gnumeric::Base);
  3         5  
  3         13  
24              
25             BEGIN {
26 3     3   183 Spreadsheet::ReadGnumeric->define_instance_accessors
27             (# User options to control parsing.
28             qw(gzipped_p cells rc attr merge minimal_attributes convert_colors),
29             # Intermediate data structures for sheet parsing.
30             qw(sheet style_regions style_attributes),
31             # XML parsing internals.
32             qw(current_elt current_attrs chars namespaces element_stack));
33             }
34              
35             sub new {
36 12     12 1 57894 my ($class, @options) = @_;
37              
38 12         92 my $self = $class->SUPER::new(@options);
39 12   100     55 $self->{_cells} //= 1;
40 12   100     50 $self->{_rc} //= 1;
41 12   100     55 $self->{_convert_colors} //= 1;
42 12         47 return $self;
43             }
44              
45             sub sheets {
46             # Read-only slot. Make sure the sheets slot is initialized.
47 76     76 1 143 my ($self) = @_;
48              
49 76         142 my $sheets = $self->{_sheets};
50 76 100       181 unless ($sheets) {
51             # [I'm afraid this is just cargo-culting here. -- rgr, 27-Dec-22.]
52 12         65 my $parser_data = { parser => __PACKAGE__,
53             type => 'gnumeric',
54             version => $VERSION };
55 12         70 my $attrs = { parsers => [ $parser_data ],
56             %$parser_data,
57             error => undef };
58 12         33 $sheets = [ $attrs ];
59 12         30 $self->{_sheets} = $sheets;
60             }
61 76         1963 return $sheets;
62             }
63              
64             ### XML parsing support.
65              
66             # This is because XML::Parser::Lite callbacks are called with the Expat object
67             # and not us, so we must bind $Self dynamically around the parsing operation.
68 3     3   19 use vars qw($Self);
  3         7  
  3         7409  
69              
70             sub _context_string {
71 0     0   0 my ($self) = @_;
72              
73 0         0 my $stack = $self->element_stack;
74 0 0       0 return $self->current_elt
75             unless $stack;
76 0         0 return join(' ', (map { $_->[0]; } @$stack), $self->current_elt);
  0         0  
77             }
78              
79             sub _decode_xlmns_name {
80             # Figure out whether we have a Gnumeric element name.
81 12444     12444   18685 my ($self, $elt_name) = @_;
82              
83 12444         47692 my ($ns_prefix, $base_elt_name) = $elt_name =~ /^([^:]+):([^:]+)$/;
84 12444 50       23742 if ($ns_prefix) {
85             # See if the prefix is the one we want.
86 12444         22214 my $url = $self->namespaces->{$ns_prefix};
87 12444 100 66     38467 if ($url && $url eq "http://www.gnumeric.org/v10.dtd") {
88             # Belongs to the Gnumeric schema.
89 12236         27274 return ($base_elt_name, 1);
90             }
91             else {
92             # It's something else.
93 208         417 return ($elt_name, 0);
94             }
95             }
96             else {
97             # Assume unqualified names belong to Gnumeric (even though we've never
98             # seen any).
99 0         0 return ($elt_name, 1);
100             }
101             }
102              
103             sub _handle_start {
104 6222     6222   307316 my ($expat, $elt, @attrs) = @_;
105 6222         24174 my $attrs = { @attrs };
106              
107             # Establish the new namespace scope. We do this first, because it may
108             # define the prefix used on this element.
109 6222   100     13309 my $old_ns_scope = $Self->namespaces || { };
110 6222         19983 my $new_ns_scope = { %$old_ns_scope };
111 6222 100       17877 if (grep { /^xmlns:(.*)$/ } keys(%$attrs)) {
  21584         37857  
112             # Copy so as not to clobber the outer scope.
113 24         87 $new_ns_scope = { %$old_ns_scope };
114 24         75 for my $attr (keys(%$attrs)) {
115 108 100       350 if ($attr =~ /^xmlns:(.*)$/) {
116 84         143 my $ns_prefix = $1;
117 84         223 $new_ns_scope->{$ns_prefix} = $attrs->{$attr};
118             }
119             }
120             }
121 6222         15862 $Self->namespaces($new_ns_scope);
122              
123             # Stack the outer context.
124 6222   100     10258 my $stack = $Self->element_stack || [ ];
125 6222 100       11266 push(@$stack, [ $Self->current_elt, $Self->current_attrs,
126             $Self->chars, $old_ns_scope ])
127             if $Self->current_elt;
128 6222         14732 $Self->element_stack($stack);
129              
130             # Install the new element context.
131 6222         9940 my ($decoded_name) = $Self->_decode_xlmns_name($elt);
132 6222         13846 $Self->current_elt($decoded_name);
133 6222         12841 $Self->current_attrs($attrs);
134 6222         10616 $Self->chars('');
135             }
136              
137             sub _handle_char {
138             # Just collect them in our "chars" slot.
139 10732     10732   358915 my ($expat, $chars) = @_;
140              
141 10732         17680 if (0) {
142             warn("handle_char: '$chars' in ", $Self->_context_string, "\n")
143             unless $chars =~ /^\s*$/;
144             }
145 10732         34805 $Self->{_chars} .= $chars;
146             }
147              
148             sub _handle_end {
149 6222     6222   147941 my ($expat, $raw_elt_name) = @_;
150              
151             # Process the completed element.
152 6222         12047 my ($elt_name, $gnumeric_p) = $Self->_decode_xlmns_name($raw_elt_name);
153 6222 100       10828 if ($gnumeric_p) {
154 6118         12474 my $method = "_process_${elt_name}_elt";
155 6118 100       22556 $Self->$method($Self->chars, %{$Self->current_attrs})
  4116         7414  
156             if $Self->can($method);
157             }
158              
159             # Restore the outer element context.
160 6222         13449 my $stack = $Self->element_stack;
161             return
162 6222 100       10904 unless @$stack;
163 6210         7335 my ($elt, $attrs, $chars, $old_ns_scope) = @{pop(@$stack)};
  6210         11746  
164 6210         15759 $Self->current_elt($elt);
165 6210         12358 $Self->current_attrs($attrs);
166 6210         13043 $Self->chars($chars);
167 6210         10101 $Self->namespaces($old_ns_scope);
168             }
169              
170             ### Utility routines.
171              
172             sub _decode_alpha {
173             # Note that this is one-based, so "A" is 1, "Z" is 26, and "AA" is 27. But
174             # there is no consistent zero digit, so this is not really base 26; if it
175             # were, "A" should be zero and "Z" should overflow to "BA".
176 116     116   244 my ($alpha) = @_;
177              
178 116 100       174 if (length($alpha) == 0) {
179 40         156 return 0;
180             }
181             else {
182 76         147 return (26 * _decode_alpha(substr($alpha, 0, -1))
183             + ord(lc(substr($alpha, -1))) - ord('a') + 1)
184             }
185             }
186              
187             sub _decode_cell_name {
188 8     8   12 my ($cell_name) = @_;
189              
190 8         32 my ($alpha, $digits) = $cell_name =~ /^([a-zA-Z]+)(\d+)$/;
191             return
192 8 50       17 unless $alpha;
193 8         15 return (_decode_alpha($alpha), 0 + $digits);
194             }
195              
196             sub _encode_cell_name {
197             # Note that $value is one-based, so 1 corresponds to "A". See the note
198             # under _decode_alpha.
199 1256     1256   14757 my ($value) = @_;
200              
201 1256 100       4764 return ($value <= 26
202             ? chr(ord('A') + $value - 1)
203             : (_encode_cell_name(int(($value - 1) / 26))
204             . _encode_cell_name(1 + ($value - 1) % 26)));
205             }
206              
207             ### Spreadsheet parsing
208              
209             sub _parse_stream {
210             # Create a new XML::Parser::Lite instance, use it to drive the parsing of
211             # $xml_stream (which we assume is uncompressed), and return the resulting
212             # spreadsheet object.
213 12     12   30 my ($self, $xml_stream) = @_;
214              
215 12         133 my $parser = XML::Parser::Lite->new
216             (Style => 'Stream',
217             Handlers => { Start => \&_handle_start,
218             End => \&_handle_end,
219             Char => \&_handle_char });
220 12         992 local $Self = $self;
221 12         580 $parser->parse(join('', <$xml_stream>));
222 12         2088 return $self->sheets;
223             }
224              
225             sub stream_gzipped_p {
226 12     12 1 37 my ($self, $stream, $file) = @_;
227              
228             # The point of the gzipped_p slot is to allow callers to suppress the gzip
229             # test if the stream is not seekable.
230 12         47 my $gzipped_p = $self->gzipped_p;
231 12 50       38 if (! defined($gzipped_p)) {
232 12 50       477 read($stream, my $block, 2) or do {
233 0         0 my $file_msg = 'from stream';
234 0         0 $file_msg = " from '$file'";
235 0         0 die "$self: Failed to read opening bytes$file_msg: $!";
236             };
237             # Test if gzipped (/usr/share/misc/magic).
238 12         39 $gzipped_p = $block eq "\037\213";
239 12         106 seek($stream, 0, 0);
240             }
241 12         153 return $gzipped_p;
242             }
243              
244             sub parse {
245 12     12 1 70 my ($self, $input) = @_;
246              
247 12         22 my $stream;
248 12 100       68 if (ref($input)) {
    100          
249             # Assume it's a stream.
250 2         4 $stream = $input;
251             }
252             elsif ($input =~ m/\A(\037\213|<\?xml)/) {
253             # $input is literal content, compressed and/or XML.
254 2     1   49 open($stream, '<', \$input);
  1         10  
  1         2  
  1         7  
255             }
256             else {
257 8 50       472 open($stream, '<', $input)
258             or die "$self: Failed to open '$input': $!";
259             }
260 12 100   2   801 binmode($stream, ':gzip')
  2 100       20  
  2         4  
  2         15  
261             if $self->stream_gzipped_p($stream, ref($input) ? () : $input);
262 12     2   3564 binmode($stream, ':encoding(UTF-8)');
  2         12  
  2         4  
  2         9  
263 12         20886 return $self->_parse_stream($stream);
264             }
265              
266             sub _process_Name_elt {
267             # Record the sheet name.
268 156     156   311 my ($self, $name) = @_;
269              
270             # Find the enclosing element, which needs to be "Sheet".
271 156         289 my $stack = $self->element_stack;
272             return
273 156 100       507 unless $stack->[@$stack-1][0] eq 'Sheet';
274 32         98 my $sheets = $self->sheets;
275 32         137 $sheets->[0]{sheet}{$name} = @$sheets;
276 32         102 $self->{_sheet}{label} = $name;
277             }
278              
279             sub _process_MaxCol_elt {
280 32     32   88 my ($self, $maxcol) = @_;
281              
282 32         72 $self->{_sheet}{mincol} = 1;
283 32         106 $self->{_sheet}{maxcol} = 1 + $maxcol;
284             }
285              
286             sub _process_MaxRow_elt {
287 32     32   115 my ($self, $maxrow) = @_;
288              
289             return
290 32 50       108 unless defined($maxrow);
291 32         66 $self->{_sheet}{minrow} = 1;
292 32         92 $self->{_sheet}{maxrow} = 1 + $maxrow;
293             }
294              
295             sub _process_Merge_elt {
296             # We don't care about MergedRegions, as that's just a container.
297 4     4   10 my ($self, $text) = @_;
298              
299 4         20 push(@{$self->{_sheet}{merged}},
300 4 50       9 [ map { _decode_cell_name($_) } split(':', $text) ])
  8         16  
301             if $self->attr;
302             }
303              
304             sub _process_Cell_elt {
305 2312     2312   7545 my ($self, $text, %keys) = @_;
306              
307             # Ignore empty cells.
308             return
309 2312 100       4642 unless $text;
310             # Both $row and $col are zero-based; the cell matrix is one-based.
311 1622         2956 my ($row, $col) = ($keys{Row}, $keys{Col});
312 1622 100       2884 $self->{_sheet}{cell}[$col + 1][$row + 1] = $text
313             if $self->rc;
314 1622 100       3236 $self->{_sheet}{_encode_cell_name($col + 1) . ($row + 1)} = $text
315             if $self->cells;
316 1622 100       3270 $self->{_sheet}{attr}[$col + 1][$row + 1]
317             = $self->find_style_for_cell($col, $row)
318             if $self->attr;
319             }
320              
321             sub _process_Sheet_elt {
322             # Add $self->sheet to $self->sheets.
323 32     32   278 my ($self, $text, %keys) = @_;
324              
325 32         115 my $sheets = $self->sheets;
326 32         76 my $attrs = $sheets->[0];
327 32         83 my $indx = $attrs->{sheets} = @$sheets;
328 32         95 my $sheet = $self->sheet;
329             $self->{_sheet}{cell}[0] = [ ] # for consistency.
330 32 100       122 if $self->{_sheet}{cell};
331 32         79 push(@$sheets, $sheet);
332 32         56 $sheet->{indx} = $indx;
333 32         75 my $attr = $self->attr;
334 32 100       78 if ($attr) {
335 8 100       29 $sheet->{style_regions} = $self->style_regions
336             if $attr eq 'keep';
337 8         30 $self->style_regions([ ]);
338             # Distribute non-minimal attributes.
339 8 50       24 unless ($self->minimal_attributes) {
340 8         58 for my $col (1 .. $sheet->{maxcol}) {
341 1544   50     2699 for my $row (1 .. $sheet->{maxrow} || 0) {
342 28704   66     61909 $self->{_sheet}{attr}[$col][$row]
343             ||= $self->find_style_for_cell($col - 1, $row - 1);
344             }
345             }
346             }
347             # Distribute merging information.
348 8         23 my $merged = $sheet->{merged};
349 8 100       23 if ($merged) {
350 2         4 my $attrs = $sheet->{attr};
351 2         7 my $merge_p = $self->merge;
352 2         5 for my $merge (@$merged) {
353 4         7 my ($col1, $row1, $col2, $row2) = @$merge;
354 4         7 my $base_cell_name = _encode_cell_name($col1) . $row1;
355 4 100       11 my $merge_flag = $merge_p ? $base_cell_name : 1;
356             my $base_cell_contents = ($self->rc
357             ? $sheet->{cell}[$col1][$row1]
358 4 50       9 : $sheet->{$base_cell_name});
359 4         10 for my $col ($col1 .. $col2) {
360 6         9 for my $row ($row1 .. $row2) {
361 12   100     33 my $cell_attrs
362             = ($attrs->[$col][$row]
363             || $self->find_style_for_cell($col - 1,
364             $row - 1)
365             || { });
366             # Copy the hash so we don't accidentally mark other
367             # cells as merged due to shared structure.
368 12         53 $cell_attrs = { %$cell_attrs };
369 12         24 $cell_attrs->{merged} = $merge_flag;
370 12         17 $attrs->[$col][$row] = $cell_attrs;
371             # When $merge_p, also copy cell contents.
372 12 100       23 if ($merge_p) {
373 6 50       12 $sheet->{cell}[$col][$row] = $base_cell_contents
374             if $self->rc;
375 6         10 my $cell_name = _encode_cell_name($col) . $row;
376 6 50       13 $sheet->{$cell_name} = $base_cell_contents
377             if $self->cells;
378             }
379             }
380             }
381             }
382             }
383             }
384 32         98 $self->sheet({ });
385             }
386              
387             ## Parsing cell style attributes
388              
389             sub _convert_identity {
390             # We assume this is 0 or 1.
391 1665     1665   2129 my ($value) = @_;
392              
393 1665         2366 return $value;
394             }
395              
396             sub _convert_align {
397             # This is a string which we truncate and downcase -- and hope the caller
398             # can figure it out, since Gnumeric seems to use non-standard values.
399 216     216   291 my ($value) = @_;
400              
401 216 50       995 return $value =~ /^GNM_.ALIGN_(.*)$/ ? lc($1) : lc($value);
402             }
403              
404             sub _convert_color {
405 171     171   234 my ($color) = @_;
406              
407             my $convert_primary = sub {
408             # Pad and/or truncate the value.
409 513     513   678 my ($value) = @_;
410              
411 513         722 my $len = length($value);
412 513 100       725 if ($len <= 2) {
    50          
413             # Less than #xFF we round to zero.
414 342         869 return '00';
415             }
416             elsif ($len == 3) {
417 0         0 return '0' . substr($value, 0, 1);
418             }
419             else {
420 171         553 return substr($value, 0, 2);
421             }
422 171         502 };
423              
424 171         516 return join('', '#', map { $convert_primary->($_); } split(':', $color));
  513         882  
425             }
426              
427             my %style_conversion_map
428             = (Back => [ bgcolor => \&_convert_color, 1 ],
429             Bold => [ bold => \&_convert_identity ],
430             # Font is not an attribute.
431             Fore => [ fgcolor => \&_convert_color, 1 ],
432             Format => [ format => \&_convert_identity ],
433             HAlign => [ halign => \&_convert_align ],
434             Hidden => [ hidden => \&_convert_identity ],
435             Indent => [ indent => \&_convert_identity ],
436             Italic => [ italic => \&_convert_identity ],
437             Locked => [ locked => \&_convert_identity ],
438             PatternColor => [ pattern_color => \&_convert_color, 1 ],
439             Rotation => [ rotation => \&_convert_identity ],
440             Script => [ script => \&_convert_identity ],
441             Shade => [ shade => \&_convert_identity ],
442             ShrinkToFit => [ shrink_to_fit => \&_convert_identity ],
443             Unit => [ size => \&_convert_identity ],
444             StrikeThrough => [ strike_through => \&_convert_identity ],
445             Underline => [ uline => \&_convert_identity ],
446             VAlign => [ valign => \&_convert_align ],
447             WrapText => [ wrap => \&_convert_identity ]);
448              
449             sub _convert_attributes {
450             # It is a useful coincidence that the cell metadata Spreadsheet::Read
451             # calls attributes are stored in XML attributes by Gnumeric. So we convert
452             # Gnumeric cell attributes to Spreadsheet::Read cell attributes.
453 216     216   321 my ($self, $cell_attributes, $gnumeric_attributes) = @_;
454              
455 216         401 my $convert_colors = $self->convert_colors;
456 216         773 for my $name (keys(%$gnumeric_attributes)) {
457 2052         3139 my $value = $gnumeric_attributes->{$name};
458 2052         2796 my $conversion = $style_conversion_map{$name};
459 2052 50       3555 my ($converted_name, $converter, $color_p)
460             = ($conversion ? @$conversion : (lc($name), \&_convert_identity));
461 2052 100 100     3498 $converter = \&_convert_identity
462             if $color_p && ! $convert_colors;
463 2052         2762 my $converted_value = $converter->($value);
464 2052         4127 $cell_attributes->{$converted_name} = $converted_value;
465             }
466             }
467              
468             sub _process_Font_elt {
469             # Process a font definition. This is contained inside
471             # fills it out.
472 516     516   2352 my ($self, $font_name, %keys) = @_;
473              
474             return
475 516 100       1076 unless $self->attr;
476 108         275 my $style_attributes = { font => $font_name };
477 108         289 $self->_convert_attributes($style_attributes, \%keys);
478 108         291 $self->style_attributes($style_attributes);
479             }
480              
481             sub _process_Style_elt {
482             # Process style attributes.
483 516     516   3352 my ($self, $text, %keys) = @_;
484              
485             return
486 516 100       1047 unless $self->attr;
487 108 50       221 my $style_attributes = $self->style_attributes or die;
488 108         217 $self->_convert_attributes($style_attributes, \%keys);
489             }
490              
491             sub find_style_for_cell {
492             # Assuming $self->attr is true, find the style region for ($col, $row) and
493             # return its attribute hash.
494 28712     28712 1 38095 my ($self, $col, $row) = @_;
495              
496 28712         30359 my $found;
497 28712         29941 for my $style_region (@{$self->{_style_regions}}) {
  28712         40327  
498 15746 100 100     25581 if ($style_region->start_col <= $col
      100        
      100        
499             && $col <= $style_region->end_col
500             && $style_region->start_row <= $row
501             && $row <= $style_region->end_row) {
502 342         428 die "duplicate at ($col, $row)"
503             # style regions are supposed to be disjoint; enable this for
504             # extra paranoia.
505             if 0 && $found;
506 342         574 $found = $style_region;
507             }
508             }
509 28712   66     61575 return $found && $found->style_attributes;
510             }
511              
512             sub _process_StyleRegion_elt {
513             # Collect a style region for the style attributes we have just defined.
514 516     516   1692 my ($self, $text, %keys) = @_;
515              
516             return
517 516 100       1000 unless $self->attr;
518             my $style_region = Spreadsheet::Gnumeric::StyleRegion->new
519             (start_col => $keys{startCol}, end_col => $keys{endCol},
520             start_row => $keys{startRow}, end_row => $keys{endRow},
521 108         286 style_attributes => $self->style_attributes);
522 108         138 push(@{$self->{_style_regions}}, $style_region);
  108         349  
523             }
524              
525             1;
526              
527             __END__