File Coverage

lib/Spreadsheet/Reader/ExcelXML/XMLReader/PositionSharedStrings.pm
Criterion Covered Total %
statement 87 106 82.0
branch 50 60 83.3
condition 34 41 82.9
subroutine 11 11 100.0
pod 2 2 100.0
total 184 220 83.6


line stmt bran cond sub pod time code
1             package Spreadsheet::Reader::ExcelXML::XMLReader::PositionSharedStrings;
2             our $AUTHORITY = 'cpan:JANDREW';
3 20     20   63052 use version; our $VERSION = version->declare('v0.16.8');
  20         34  
  20         146  
4             ###LogSD warn "You uncovered internal logging statements for Spreadsheet::Reader::ExcelXML::XMLReader::PositionSharedStrings-$VERSION";
5              
6 20     20   2354 use 5.010;
  20         58  
7 20     20   80 use Moose::Role;
  20         25  
  20         165  
8             requires qw(
9             set_error where_am_i has_position
10             advance_element_position start_the_file_over i_am_here
11             parse_element get_group_return_type squash_node
12             current_named_node current_node_parsed close_the_file
13             good_load
14             );#grep_node
15 20         186 use Types::Standard qw(
16             Int Bool HashRef is_HashRef ArrayRef Enum is_Int
17 20     20   76294 );
  20         29  
18 20     20   19015 use Carp qw( confess );
  20         28  
  20         1067  
19 20     20   80 use Data::Dumper;
  20         28  
  20         836  
20 20     20   81 use lib '../../../../../../lib';
  20         28  
  20         151  
21             ###LogSD use Log::Shiras::Telephone;
22              
23             #########1 Public Attributes 3#########4#########5#########6#########7#########8#########9
24              
25             has cache_positions =>(
26             isa => Bool,
27             reader => 'should_cache_positions',
28             default => 1,
29             );
30              
31             #########1 Public Methods 3#########4#########5#########6#########7#########8#########9
32              
33             sub get_shared_string{
34 267     267 1 82053 my( $self, $position ) = @_;
35             ###LogSD my $phone = Log::Shiras::Telephone->new( name_space =>
36             ###LogSD $self->get_all_space . '::get_shared_string', );
37 267 50       1072 if( !defined $position ){
    50          
38 0         0 $self->set_error( "Requested shared string position required - none passed" );
39 0         0 return undef;
40             }elsif( !is_Int( $position ) ){
41 0         0 confess "The passed position -$position- is not an integer";
42             }
43             ###LogSD $phone->talk( level => 'debug', message => [
44             ###LogSD "Getting the sharedStrings position: $position",
45             ###LogSD "From current position: " . ($self->has_position ? $self->where_am_i : '(none yet)'), ] );
46              
47             # checking if the requested position is too far
48 267 100       8925 if( $position > $self->_get_unique_count - 1 ){
49 15         385 $self->set_error( "Asking for position -$position- (from 0) but the shared string " .
50             "max cell position is: " . ($self->_get_unique_count - 1) );
51 15         57 return undef;# fail
52             }
53              
54 252         308 my ( $return, $success );
55             # handle cache retrieval
56 252 100 100     6502 if( $self->should_cache_positions and $self->_last_cache_position >= $position ){
57             ###LogSD $phone->talk( level => 'debug', message => [
58             ###LogSD "Retreiving position -$position- from cache" ] );
59 142         4160 $return = $self->_get_ss_position( $position );
60 142         178 $success = 1;
61             }
62              
63             # checking if the requested (last) position is stored (no caching)
64 252 50 100     3346 if( !$success and $self->_has_last_position and $position == $self->_get_last_position ){
      66        
65             ###LogSD $phone->talk( level => 'debug', message => [
66             ###LogSD "Already built the answer for position: $position",
67             ###LogSD $self->_get_last_position_ref ] );
68 0         0 $return = $self->_get_last_position_ref;
69 0         0 $success = 1;
70             }
71              
72             ###LogSD $phone->talk( level => 'debug', message => [
73             ###LogSD "Success: " . ($success//'not yet'), "position state: " . $self->has_position ] );
74             # reset the file if needed
75 252 100 100     3301 if( !$success and $self->has_position and $self->where_am_i > $position ){
      100        
76             ###LogSD $phone->talk( level => 'debug', message => [
77             ###LogSD "Current position: " . $self->where_am_i, "..against desired position: $position" ] );
78 4         27 $self->start_the_file_over;
79             ###LogSD $phone->talk( level => 'debug', message => [
80             ###LogSD "Finished resetting the file" ] );
81             }
82             ###LogSD $phone->talk( level => 'debug', message => [
83             ###LogSD "Reset tests complete" ] );
84              
85             # Kick start position counting for the first go-round
86 252 100 66     3036 if( !$success and !$self->has_position ){
87             ###LogSD $phone->talk( level => 'debug', message => [
88             ###LogSD "Kickstart position counting - getting first si cell" ] );
89              
90 30         44 my( $result, $node_name, $node_level, $result_ref );
91 30         101 my $current_node = $self->current_node_parsed;
92             ###LogSD $phone->talk( level => 'trace', message =>[
93             ###LogSD "The current node is:", $current_node ] );
94 30 50       125 if( (keys %$current_node)[0] eq 'ai' ){
95             ###LogSD $phone->talk( level => 'trace', message =>[
96             ###LogSD "Found the core properties node" ] );
97 0         0 $result = 2;
98 0         0 $node_name = 'si';
99             }else{
100 30         197 ( $result, $node_name, $node_level, $result_ref ) =
101             $self->advance_element_position( 'si' );
102             }
103 30 50       85 if( $result ){
104             ###LogSD $phone->talk( level => 'debug', message => [
105             ###LogSD "Successfully advanced one share string position" ] );
106 30         846 $self->i_am_here( 0 );
107             }else{
108 0         0 $self->set_error( "No sharedStrings elements available" );
109 0         0 $self->_set_unique_count( 0 );
110 0         0 return undef;
111             }
112             }
113             ###LogSD $phone->talk( level => 'debug', message => [
114             ###LogSD "Any needed kickstarting complete" ] );
115              
116             # Advance to the proper position - storing along the way as needed
117 252         522 while( !$success ){
118             ###LogSD $phone->talk( level => 'debug', message => [
119             ###LogSD "Reading the position: " . $self->where_am_i ] );
120              
121             # Build a perl ref
122 365         712 my $inital_parse = $self->parse_element;
123 365         352 my $provisional_output;
124             ###LogSD $phone->talk( level => 'debug', message => [
125             ###LogSD "Collected:", $inital_parse ] );
126              
127             # Handle unexpected end of file here
128 365 50 66     1676 if( !$inital_parse ){# Potential chopped off end of file here 20-empty_shared_strings_bug.t
    100          
129             ###LogSD $phone->talk( level => 'debug', message => [
130             ###LogSD "Handling the (empty) end of the file", ] );
131 0         0 $self->set_error( "The shared strings file ended (poorly?) before expected" );
132 0         0 $self->_set_unique_count( $self->where_am_i );
133 0         0 return undef;
134             }elsif( $inital_parse and $inital_parse eq 'EOF' ){
135             ###LogSD $phone->talk( level => 'debug', message => [
136             ###LogSD "Handling the end of the file", ] );
137 3         4 $return = $inital_parse;
138 3         75 $self->_set_unique_count( $self->where_am_i + 1 );
139 3         3 last;
140             }
141              
142             # Convert the perl ref to a styles ref
143 362         711 $inital_parse = $self->squash_node( $inital_parse );
144             ###LogSD $phone->talk( level => 'debug', message => [
145             ###LogSD "Squashed to:", $inital_parse ] );
146 362 50       600 if( is_HashRef( $inital_parse ) ){
147             ###LogSD $phone->talk( level => 'debug', message => [
148             ###LogSD "The initial parse is a hash ref" ] );
149 362 100       1131 if( exists $inital_parse->{t} ){
    50          
150 347         403 $provisional_output = $inital_parse->{t};
151             }elsif( exists $inital_parse->{list} ){
152             ###LogSD $phone->talk( level => 'debug', message => [
153             ###LogSD "The initial parse is broken up into list elements", $inital_parse->{list} ] );
154 15         24 my ( $raw_text, $rich_text );
155 15         26 for my $element( @{$inital_parse->{list}} ){
  15         42  
156             ###LogSD $phone->talk( level => 'debug', message => [
157             ###LogSD "processing element:", $element ] );
158 45 100       108 push( @$rich_text, length( $raw_text ), $element->{rPr} ) if exists $element->{rPr};
159 45         59 $raw_text .= $element->{t};
160             }
161 15         60 @$provisional_output{qw( raw_text rich_text )} = ( $raw_text, $rich_text );
162             }else{
163 0         0 confess "Couldn't find 't' or 'list' keys in: " . Dumper( $inital_parse );
164             }
165             }else{
166 0         0 confess "Found unknown parse return: " . Dumper( $inital_parse );
167             }
168             ###LogSD $phone->talk( level => 'debug', message => [
169             ###LogSD "Built position " . $self->where_am_i . " => ", $provisional_output ] );
170              
171             # Cache the position as needed
172 362 100       9954 if( $self->should_cache_positions ){
173             my $cache_value =
174             !$provisional_output ? undef :
175             !is_HashRef( $provisional_output ) ? $provisional_output :
176             (scalar( keys %$provisional_output ) == 1 or $self->_should_block_formats) ?
177 277 100 66     592 $provisional_output->{raw_text} : $provisional_output;
    100          
    100          
178             ###LogSD $phone->talk( level => 'debug', message =>[ "Caching position: " . $self->where_am_i, $cache_value ] );
179 277         7068 $self->_set_ss_position( $self->where_am_i => $cache_value );
180 277         6391 $self->_set_last_cache_position( $self->where_am_i );
181             ###LogSD $phone->talk( level => 'trace', message =>[ "Updated cache:", $self->_get_all_cache ] );
182             }
183              
184             # Determine if we have arrived
185 362 100       8190 if( $self->where_am_i == $position ){
186 107         124 $success = 1;
187 107         128 $return = $provisional_output;
188 107 100       2619 if( !$self->should_cache_positions ){
189             #~ my $cache_value = scalar( keys %$provisional_output ) == 1 ? $provisional_output->{raw_text} : $provisional_output;
190             ###LogSD $phone->talk( level => 'debug', message =>[ "Saving the last postion" ] );
191 27         602 $self->_set_last_position( $self->where_am_i );
192 27         671 $self->_set_last_position_ref( $return );
193             }
194             }
195 362         8042 $self->i_am_here( $self->where_am_i + 1 );
196             ###LogSD $phone->talk( level => 'debug', message => [
197             ###LogSD "The next position to collect is: " . $self->where_am_i ] );
198 362         789 $self->advance_element_position( 'si' )
199             }
200              
201             # Manage the output
202             $return =
203             !defined $return ? $return :
204             ( $return and $return eq 'EOF' ) ? undef :
205             ( $self->_should_block_formats and is_HashRef( $return ) ) ? $return->{raw_text} :
206 252 100 66     1459 $self->_should_block_formats ? $return :
    100 100        
    100          
    100          
    100          
207             is_HashRef( $return ) ? $return : { raw_text => $return } ;
208             ###LogSD $phone->talk( level => 'debug', message => [
209             ###LogSD "After possible format stripping: " . $self->_should_block_formats, $return ] );
210              
211             # Close the file if caching complete
212 252 100 100     7606 if( $self->should_cache_positions and $self->has_file and $self->where_am_i > $self->_get_unique_count - 1 ){
      100        
213             ###LogSD $phone->talk( level => 'debug', message => [
214             ###LogSD "Closing the file - all positions have been stored in cache" ] );
215 20         150 $self->close_the_file;
216             }
217 252         1061 return $return;
218             }
219              
220             sub load_unique_bits{
221 28     28 1 55 my( $self, ) = @_;
222             ###LogSD my $phone = Log::Shiras::Telephone->new( name_space =>
223             ###LogSD $self->get_all_space . '::load_unique_bits', );
224             ###LogSD $phone->talk( level => 'debug', message => [
225             ###LogSD "Setting the sharedStrings unique bits" ] );
226 28         45 my( $result, $node_name, $node_level, $result_ref );
227 28         149 my $current_node = $self->current_node_parsed;
228             ###LogSD $phone->talk( level => 'trace', message =>[
229             ###LogSD "The current node is:", $current_node ] );
230 28 50       121 if( (keys %$current_node)[0] eq 'sst' ){
231             ###LogSD $phone->talk( level => 'trace', message =>[
232             ###LogSD "Found the core properties node" ] );
233 28         47 $result = 2;
234 28         49 $node_name = 'sst';
235             }else{
236 0         0 ( $result, $node_name, $node_level, $result_ref ) =
237             $self->advance_element_position( 'sst' );
238 0         0 $current_node = $self->current_node_parsed;
239             }
240 28 50 33     193 if( $result and $node_name eq 'sst' ){
241 28   100     96 my $unique_count = $current_node->{sst}->{uniqueCount} // 0;
242             ###LogSD $phone->talk( level => 'debug', message => [
243             ###LogSD "Loading unique count: $unique_count" ] );
244 28         830 $self->_set_unique_count( $unique_count );
245 28         772 $self->good_load( 1 );
246             }else{
247 0         0 $self->set_error( "No 'sst' element found - can't parse this as a shared strings file" );
248 0         0 $self->_clear_unique_count;
249             }
250             }
251              
252             #########1 Private Attributes 3#########4#########5#########6#########7#########8#########9
253              
254             has _unique_count =>(
255             isa => Int,
256             writer => '_set_unique_count',
257             reader => '_get_unique_count',
258             clearer => '_clear_unique_count',
259             predicate => '_has_unique_count'
260             );
261              
262             has _last_position =>(
263             isa => Int,
264             writer => '_set_last_position',
265             reader => '_get_last_position',
266             predicate => '_has_last_position',
267             trigger => sub{
268             my ( $self ) = @_;
269             if( $self->_has_last_position_ref ){
270             $self->_clear_last_position_ref;
271             }
272             },
273             );
274              
275             has _last_position_ref =>(
276             writer => '_set_last_position_ref',
277             reader => '_get_last_position_ref',
278             clearer => '_clear_last_position_ref',
279             predicate => '_has_last_position_ref',
280             );
281              
282             has _shared_strings_positions =>(
283             isa => ArrayRef,
284             traits => ['Array'],
285             default => sub{ [] },
286             handles =>{
287             _get_ss_position => 'get',
288             _set_ss_position => 'set',
289             },
290             reader => '_get_all_cache',
291             );
292              
293             has _cache_completed =>(
294             isa => Int,
295             default => -1,
296             reader => '_last_cache_position',
297             writer => '_set_last_cache_position',
298             );
299              
300             #########1 Private Methods 3#########4#########5#########6#########7#########8#########9
301              
302             sub _should_block_formats{
303 498     498   2436 my( $self, ) = @_;
304             ###LogSD my $phone = Log::Shiras::Telephone->new( name_space =>
305             ###LogSD $self->get_all_space . '::_should_block_formats', );
306             ###LogSD $phone->talk( level => 'debug', message => [
307             ###LogSD "determining if formats should be blocked: " . $self->get_group_return_type ] );
308 498 100       1375 return ( $self->get_group_return_type =~ /(unformatted|value|xml_value)/) ? 1 : 0 ;
309             }
310              
311             #########1 Phinish 3#########4#########5#########6#########7#########8#########9
312              
313 20     20   19865 no Moose::Role;
  20         35  
  20         106  
314              
315             1;
316              
317             #########1 Documentation 3#########4#########5#########6#########7#########8#########9
318             __END__
319              
320             =head1 NAME
321              
322             Spreadsheet::Reader::ExcelXML::XMLReader::PositionSharedStrings - Position based sharedStrings Reader
323              
324             =head1 SYNOPSIS
325              
326             #!/usr/bin/env perl
327             use Data::Dumper;
328             use MooseX::ShortCut::BuildInstance qw( build_instance );
329             use Spreadsheet::Reader::ExcelXML::XMLReader;
330             use Spreadsheet::Reader::ExcelXML::XMLReader::PositionSharedStrings;
331             use Spreadsheet::Reader::ExcelXML::SharedStrings;
332              
333             my $file_instance = build_instance(
334             package => 'SharedStringsInstance',
335             workbook_inst => Spreadsheet::Reader::ExcelXML::Workbook->new,
336             superclasses =>[
337             'Spreadsheet::Reader::ExcelXML::XMLReader'
338             ],
339             add_roles_in_sequence =>[
340             'Spreadsheet::Reader::ExcelXML::XMLReader::PositionSharedStrings',
341             'Spreadsheet::Reader::ExcelXML::SharedStrings',
342             ],
343             );
344              
345             =head1 DESCRIPTION
346              
347             This documentation is written to explain ways to use this module when writing your
348             own excel parser or extending this package. To use the general package for excel
349             parsing out of the box please review the documentation for L<Workbooks
350             |Spreadsheet::Reader::ExcelXML>, L<Worksheets
351             |Spreadsheet::Reader::ExcelXML::Worksheet>, and
352             L<Cells|Spreadsheet::Reader::ExcelXML::Cell>.
353              
354             This role is written to extend L<Spreadsheet::Reader::ExcelXML::XMLReader>.
355             It adds functionality to read position based sharedStrings files. It presents this
356             functionality in compliance with the top level L<interface
357             |Spreadsheet::Reader::ExcelXML::SharedStrings>. This POD only describes the
358             functionality incrementally provided by this module. For an overview of
359             sharedStrings.xml reading see L<Spreadsheet::Reader::ExcelXML::SharedStrings>.
360              
361             =head2 Requires
362              
363             These are the methods required by this role and their default provider. All
364             methods are imported straight across with no re-naming.
365              
366             =over
367              
368             L<Spreadsheet::Reader::ExcelXML::Error/set_error>
369              
370             L<Spreadsheet::Reader::ExcelXML::XMLReader/good_load>
371              
372             L<Spreadsheet::Reader::ExcelXML::XMLReader/where_am_i>
373              
374             L<Spreadsheet::Reader::ExcelXML::XMLReader/has_position>
375              
376             L<Spreadsheet::Reader::ExcelXML::XMLReader/advance_element_position>
377              
378             L<Spreadsheet::Reader::ExcelXML::XMLReader/start_the_file_over>
379              
380             L<Spreadsheet::Reader::ExcelXML::XMLReader/i_am_here>
381              
382             L<Spreadsheet::Reader::ExcelXML::XMLReader/parse_element>
383              
384             L<Spreadsheet::Reader::ExcelXML::XMLReader/squash_node>
385              
386             L<Spreadsheet::Reader::ExcelXML::XMLReader/current_named_node>
387              
388             L<Spreadsheet::Reader::ExcelXML::XMLReader/current_node_parsed>
389              
390             L<Spreadsheet::Reader::ExcelXML::XMLReader/close_the_file>
391              
392             L<Spreadsheet::Reader::ExcelXML::Workbook/get_group_return_type>
393              
394             =back
395              
396             =head2 Methods
397              
398             These are the primary ways to use this class. For additional SharedStrings options
399             see the L<Attributes|/Attributes> section.
400              
401             =head3 get_shared_string( $positive_int )
402              
403             =over
404              
405             B<Definition:> This returns the data stored in the shared strings file
406             at the position identified by $positive_int. It is intended for position
407             based sharedStrings files.
408              
409             B<Accepts:> $positive_int ( a non-negative integer - positions are counted from 0 )
410              
411             B<Returns:> a hash ref with the key 'raw_text' and all collated text for that
412             xml node as the value. If there is associated rich text in the node and
413             L<Spreadsheet::Reader::ExcelXML/group_return_type> is set to 'instance'
414             then it will also have a 'rich_text' key with the value set as an arrayref of
415             pairs (not sub array refs) with the first value of each pair being the position
416             in the raw_text (counting from zero) where the formatting starts and the second
417             value being the settings for that format. Ex.
418              
419             {
420             raw_text => 'Hello World',
421             rich_text =>[
422             2,# Starting with the letter 'l' apply the format
423             {
424             'color' => {
425             'rgb' => 'FFFF0000'
426             },
427             'sz' => '11',
428             'b' => undef,
429             'scheme' => 'minor',
430             'rFont' => 'Calibri',
431             'family' => '2'
432             },
433             6,# Starting with the letter 'W' apply the format
434             {
435             'color' => {
436             'rgb' => 'FF0070C0'
437             },
438             'sz' => '20',
439             'b' => undef,
440             'scheme' => 'minor',
441             'rFont' => 'Calibri',
442             'family' => '2'
443             }
444             ]
445             }
446              
447             =back
448              
449             =head3 load_unique_bits
450              
451             =over
452              
453             B<Definition:> When the xml file first loads this is available to pull customized data.
454             It mostly pulls metadata and stores it in hidden attributes for use later. If all goes
455             according to plan it sets L<Spreadsheet::Reader::ExcelXML::XMLReader/good_load> to 1.
456              
457             B<Accepts:> Nothing
458              
459             B<Returns:> Nothing
460              
461             =back
462              
463             =head2 Attributes
464              
465             Data passed to new when creating an instance of this class. For
466             modification of this(ese) attribute(s) see the listed 'attribute
467             methods'. For more information on attributes see
468             L<Moose::Manual::Attributes>. The easiest way to modify this(ese)
469             attribute(s) is when a class instance is created and before it is
470             passed to the workbook or parser.
471              
472             =head3 cache_positions
473              
474             =over
475              
476             B<Definition:> Especially for sheets with lots of stored text the
477             parser can slow way down when accessing each position. This is
478             because the text is not always stored sequentially and the reader
479             is a JIT linear parser. To go back it must restart and index
480             through each position till it gets to the right place. This is
481             especially true for excel sheets that have experienced any
482             significant level of manual intervention prior to being read.
483             This attribute turns on caching for shared strings (the default) so
484             the parser only has to read through the shared strings once. When
485             the read is complete all the way to the end it will also release
486             the shared strings file in order to free up some space
487             (a small win in exchange for the space taken by the cache). The
488             trade off here is that all intermediate shared strings are
489             L<fully|/get_shared_string( $positive_int )> read
490             before reading the target string. This means early reads will be
491             slower. For sheets that only have numbers stored or at least have
492             very few strings this will likely not be an initial hit (or speed
493             improvement). In order to minimize the physical size of the cache,
494             if there is only a text string stored in the shared strings position
495             then only the string will be stored (not as a value to a raw_text
496             hash key). It will then be reconstituted into a hashref when requested.
497              
498             B<Default:> 1 = caching is on
499              
500             B<Range:> 1|0
501              
502             B<Attribute required:> yes
503              
504             B<attribute methods> Methods provided to adjust this attribute
505              
506             =over
507              
508             none - (will be autoset by L<Spreadsheet::Reader::ExcelXML/cache_positions>)
509              
510             =back
511              
512             =back
513              
514             =head1 SUPPORT
515              
516             =over
517              
518             L<github Spreadsheet::Reader::ExcelXML/issues
519             |https://github.com/jandrew/p5-spreadsheet-reader-excelxml/issues>
520              
521             =back
522              
523             =head1 TODO
524              
525             =over
526              
527             B<1.> Nothing yet
528              
529             =back
530              
531             =head1 AUTHOR
532              
533             =over
534              
535             Jed Lund
536              
537             jandrew@cpan.org
538              
539             =back
540              
541             =head1 COPYRIGHT
542              
543             This program is free software; you can redistribute
544             it and/or modify it under the same terms as Perl itself.
545              
546             The full text of the license can be found in the
547             LICENSE file included with this module.
548              
549             This software is copyrighted (c) 2016 by Jed Lund
550              
551             =head1 DEPENDENCIES
552              
553             =over
554              
555             L<Spreadsheet::Reader::ExcelXML> - the package
556              
557             =back
558              
559             =head1 SEE ALSO
560              
561             =over
562              
563             L<Spreadsheet::Read> - generic Spreadsheet reader
564              
565             L<Spreadsheet::ParseExcel> - Excel binary version 2003 and earlier (.xls files)
566              
567             L<Spreadsheet::XLSX> - Excel version 2007 and later
568              
569             L<Spreadsheet::ParseXLSX> - Excel version 2007 and later
570              
571             L<Log::Shiras|https://github.com/jandrew/Log-Shiras>
572              
573             =over
574              
575             All lines in this package that use Log::Shiras are commented out
576              
577             =back
578              
579             =back
580              
581             =cut
582              
583             #########1#########2 main pod documentation end 5#########6#########7#########8#########9