File Coverage

blib/lib/OpenOffice/Parse/SXC.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package OpenOffice::Parse::SXC;
2              
3 2     2   30901 use 5.006;
  2         8  
  2         93  
4 2     2   10 use strict;
  2         5  
  2         75  
5 2     2   11 use warnings;
  2         9  
  2         71  
6 2     2   4155 use XML::Parser;
  0            
  0            
7             use IO::File;
8             require Exporter;
9              
# import() is inherited from Exporter.
our @ISA = ('Exporter');

# Nothing is exported by default.  Callers request individual functions,
# or all of them at once with:  use OpenOffice::Parse::SXC ':all';
our %EXPORT_TAGS = (
    all => [ 'parse_sxc', 'csv_quote', 'dump_sxc_file' ],
);
our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
our @EXPORT    = ();
our $VERSION   = '0.03';

# Option names that set_options() accepts.
my %valid_options = (
    worksheets => 1,
    no_trim    => 1,
);
32              
33             ##################################################################
34             # EXPORT_OK methods:
35              
# Convert one cell value into a CSV field.
#
#   - undef becomes the empty string
#   - every newline is deleted
#   - every double quote is doubled
#   - the result is wrapped in double quotes when it contains a comma,
#     a double quote or a single quote
#
# Returns the converted string; the caller's value is not modified.
sub csv_quote {
    my ($field) = @_;
    return "" unless defined $field;

    $field =~ s/\n//g;        # newlines are dropped, not escaped
    $field =~ s/\"/\"\"/g;    # CSV escapes a quote by doubling it

    return $field =~ /[,"']/ ? "\"$field\"" : $field;
}
46              
# Convenience front end: parse an SXC file and return its rows.
#
#   $sxc_filename - path of the file, handed to parse_file()
#   %options      - passed through to new()
#
# The parser object is installed as its own data handler, so its row()
# method collects the rows that parse_sxc_rows() hands back.
# Returns whatever parse_sxc_rows() returns.
sub parse_sxc {
    my ( $sxc_filename, %options ) = @_;

    my $parser = OpenOffice::Parse::SXC->new(%options);
    $parser->set_data_handler($parser);
    $parser->parse_file($sxc_filename);

    return $parser->parse_sxc_rows;
}
57              
# Debugging aid: parse $filename with XML::Parser in "Objects" style and
# print a Data::Dumper rendering of the resulting object tree to STDOUT.
#
#   $filename - path handed unchanged to XML::Parser's parsefile()
#
# Data::Dumper is loaded here on demand because the module does not import
# it at file level; without this the call to Dumper() died at runtime with
# an undefined-subroutine error.
sub dump_sxc_file {
    my $filename = shift;

    require Data::Dumper;

    my $Parser  = XML::Parser->new( Style => "Objects" );
    my $results = $Parser->parsefile( $filename );
    print Data::Dumper::Dumper( $results );
}
68              
69             ##################################################################
70             # The data_handler routines:
71             #
72             # These are provided to provide the simple interface parse_sxc()
73             #
74             # See parse_sxc() for more details
75              
# data_handler callback: called with (handler, parser, cells).
# Appends the cells array reference to the handler's collected rows.
# The second argument (the parser) is ignored.
sub row {
    my ( $self, undef, $cells ) = @_;
    push @{ $self->{parse_sxc_rows} }, $cells;
}
83              
# data_handler callback: called with (handler, parser, worksheet name).
# Makes sure the row collection exists; rows already collected are kept.
# The parser and the worksheet name are not used.
sub worksheet {
    my ($self) = @_;
    if ( not $self->{parse_sxc_rows} ) {
        $self->{parse_sxc_rows} = [];
    }
}
93              
# data_handler callback: called with (handler, parser, workbook name).
# The default handler does nothing with it.
sub workbook {
    my $self     = $_[0];
    my $workbook = $_[2];
}
100              
# Return the rows collected by the default data handler as a list of
# array references (the count of rows in scalar context).
#
# When nothing has been collected yet (no worksheet() or row() callback
# has run) the slot is undef; that case now yields an empty list instead
# of dying on an undefined array reference.
sub parse_sxc_rows {
    my $self = shift;
    return @{ $self->{parse_sxc_rows} || [] };
}
105              
# Discard every collected row by installing a fresh, empty array.
sub clear_parse_sxc_rows {
    my ($self) = @_;
    return $self->{parse_sxc_rows} = [];
}
110              
111              
112             # End data_handler routines
113             ##################################################################
114              
115             ##################################################################
116             # Main OpenOffice::Parse::SXC methods:
117              
# Constructor.
#
#   OpenOffice::Parse::SXC->new( %options )
#
# %options is handed to set_options() (which dies on unknown names).
# The new object starts with cell and row repeat counts of 1, an empty
# cell list, and all three accept switches (rows, cells, text) off.
# When no data handler is present the object becomes its own handler, so
# its row()/worksheet()/workbook() methods receive the callbacks.
# Returns the blessed object.
sub new {
    my ( $type, %options ) = @_;

    my $self = bless { options => {} }, $type;
    $self->set_options(%options) if %options;

    # One occurrence per cell/row unless the document says otherwise.
    $self->repeat_following_cell(1);
    $self->repeat_following_row(1);
    $self->reset_cell_list;

    $self->set_data_handler($self) unless $self->get_data_handler;

    # Accept nothing until start_handler() opens a worksheet/row/cell.
    $self->accept_rows(0);
    $self->accept_cells(0);
    $self->accept_text(0);

    return $self;
}
138              
# PUBLIC: parse XML from an already opened data source.
#
#   $SXC_FH - the source handed to XML::Parser's parse()
#
# Start, Char and End events are routed to this object's start_handler,
# char_handler and end_handler through closures over $self.
# Returns the value returned by XML::Parser's parse().
sub parse {
    my ( $self, $SXC_FH ) = @_;

    my %handlers = (
        Start => sub { $self->start_handler( @_ ); },
        Char  => sub { $self->char_handler( @_ ); },
        End   => sub { $self->end_handler( @_ ); },
    );
    my $Parser  = XML::Parser->new( Handlers => \%handlers );
    my $results = $Parser->parse( $SXC_FH );
    return $results;
}
157              
# PUBLIC: extract content.xml from an SXC file with the external `unzip`
# program and parse it.
#
#   $filename - path of the SXC file; required
#
# Dies when no filename is given, when the path is not a plain file, or
# when the pipe to unzip cannot be opened.  The data handler's workbook()
# callback is invoked with the filename before parsing starts.
# Returns the result of parse().
#
# The pipe is opened in list form so the filename reaches unzip as a single
# argument without passing through a shell: names containing spaces or
# shell metacharacters work and cannot inject commands.
# NOTE(review): list-form pipe open needs perl 5.8 or later.
sub parse_file {
    my $self     = shift;
    my $filename = shift || die "No file to parse";

    if( ! -f $filename ) {
        die "Could not find file '$filename' to parse";
    }

    # Keep handing parse() an IO::File object, as before.
    my $SXC_FH = IO::File->new;
    open( $SXC_FH, '-|', 'unzip', '-p', $filename, 'content.xml' )
        || die "Could not open pipe: 'unzip -p $filename content.xml'";

    $self->get_data_handler->workbook( $self, $filename );
    return $self->parse( $SXC_FH );
}
172              
# XML::Parser End handler: called with (expat object, element name).
# Closing tags trigger the end-of-row and end-of-cell actions.
sub end_handler {
    my ( $self, $Expat, $type ) = @_;

    if ( $type eq "table:table-row" ) {
        return unless $self->accept_rows;
        $self->accept_cells( 0 );
        $self->end_row;                 # the row is complete
    }
    elsif ( $type eq "table:table-cell" ) {
        return unless $self->accept_cells;
        $self->end_cell if $self->accept_text;    # the cell is complete
        $self->accept_text( 0 );
    }
    elsif ( $type eq "text:p" ) {
        # A newline is appended after every text:p element closed inside
        # a cell; end_cell() removes the final one again when the cell is
        # closed, so newlines only remain between paragraphs.
        $self->append_cell_data( "\n" ) if $self->accept_text;
    }
    return;
}
209              
# E( $item, @set ): membership test.
# Returns 1 when $item is string-equal (eq) to some element of @set,
# otherwise 0.
sub E {
    my ( $item, @set ) = @_;
    return ( grep { $item eq $_ } @set ) ? 1 : 0;
}
220              
# XML::Parser Start handler: called with (expat object, element name,
# attribute name/value pairs).
#
# Opens and closes the row/cell/text switches as worksheets, rows and
# cells begin, records repeat counts, and expands text:s elements.
sub start_handler {
    my ( $self, $Expat, $type, %args ) = @_;

    if ( $type eq "table:table" ) {
        # With a "worksheets" option (array reference of names) only the
        # listed worksheets are processed; without it, all of them are.
        my $sheet  = $args{"table:name"};
        my $wanted = $self->get_option( "worksheets" );
        if ( ! $wanted or E( $sheet, @{$wanted} ) ) {
            $self->accept_rows( 1 );
            $self->set_current_worksheet_name( $sheet );
            $self->get_data_handler->worksheet( $self, $sheet );
        }
        else {
            $self->accept_rows( 0 );    # skip this worksheet's rows
        }
    }
    elsif ( $type eq "table:table-row" ) {
        if ( $self->accept_rows ) {
            my $times = $args{"table:number-rows-repeated"};
            $self->repeat_following_row( $times ) if $times;
            $self->accept_cells( 1 );
        }
    }
    elsif ( $type eq "table:table-cell" ) {
        if ( $self->accept_cells ) {
            my $times = $args{"table:number-columns-repeated"};
            $self->repeat_following_cell( $times ) if $times;
            $self->accept_text( 1 );
        }
    }
    elsif ( $type eq "text:s" ) {
        # text:s is treated as a run of spaces whose length is given by
        # the text:c attribute (1 when absent).  Other special text
        # elements, if any, would be handled here as well.
        if ( $self->accept_text ) {
            my $multiplier = $args{"text:c"} || 1;
            $self->append_cell_data( " " x $multiplier );
        }
    }
    elsif ( $type eq "text:p" ) {
        # Nothing to do on paragraph start; paragraph text arrives via
        # char_handler() and the end is handled in end_handler().
    }
}
281              
# XML::Parser Char handler: called with (expat object, text).
# While text is being accepted, each chunk is appended to the cell
# currently being built; otherwise the chunk is ignored.
sub char_handler {
    my ( $self, $Expat, $text ) = @_;
    return unless $self->accept_text;
    $self->append_cell_data( $text );
}
293              
294             ##################################################################
295             # These routines restrict what gets processed. They each
296             # take a boolean value, turning the switch on or off. There
297             # are 3 levels of restriction: rows, cells, and text.
298              
# Combined getter/setter for the "accept cells" switch.
# With a defined argument the switch is set to it; with no argument (or
# undef) the current value is returned.
sub accept_cells {
    my ( $self, $value ) = @_;
    return $self->{accept_cells} unless defined $value;
    return $self->{accept_cells} = $value;
}
# Combined getter/setter for the "accept rows" switch.
# With a defined argument the switch is set to it; with no argument (or
# undef) the current value is returned.
sub accept_rows {
    my ( $self, $value ) = @_;
    return $self->{accept_rows} unless defined $value;
    return $self->{accept_rows} = $value;
}
# Combined getter/setter for the "accept text" switch.
# With a defined argument the switch is set to it; with no argument (or
# undef) the current value is returned.
sub accept_text {
    my ( $self, $value ) = @_;
    return $self->{accept_text} unless defined $value;
    return $self->{accept_text} = $value;
}
329              
330             ##################################################################
331              
# Record the name of the worksheet currently being parsed.
sub set_current_worksheet_name {
    my ( $self, $name ) = @_;
    return $self->{current_worksheet_name} = $name;
}
336              
# PUBLIC: return the worksheet name most recently stored by
# set_current_worksheet_name() (undef when none has been stored).
sub get_current_worksheet_name {
    my ($self) = @_;
    return $self->{current_worksheet_name};
}
343              
344              
# Start a new, empty list of cells.  A fresh array is installed, so a
# reference to the previous list held elsewhere is left untouched.
sub reset_cell_list {
    my ($self) = @_;
    return $self->{cells} = [];
}
351              
# PUBLIC: merge options into the object.
#
#   $obj->set_options( name => value, ... )
#
# Every name must be a key of %valid_options; the first unknown name makes
# the call die with a message naming the option, its value and the
# object's class.  Valid options are merged over the existing ones, so a
# repeated name replaces the earlier value.
# Returns the new options hash reference.
sub set_options {
    my $self    = shift;
    my %options = @_;

    for my $name ( keys %options ) {
        next if $valid_options{$name};

        # Avoid an "uninitialized" warning when the value is undef.
        my $value = defined $options{$name} ? $options{$name} : 'undef';

        # ref() needs explicit parentheses: `ref $self . "..."` is parsed
        # as ref( $self . "..." ), which is always the empty string and
        # dropped the class name from the message.
        die "Invalid option: '$name' ($value) passed as an option to "
          . ref($self) . "->set_options()";
    }

    $self->{options} = { %{ $self->{options} || {} }, %options };
}
367              
# PUBLIC: return the value stored for option $opt_name (undef when the
# option has not been set).
sub get_option {
    my ( $self, $opt_name ) = @_;
    return $self->{options}{$opt_name};
}
375              
# Append a chunk of text to the cell currently being built.
sub append_cell_data {
    my ( $self, $chunk ) = @_;
    $self->{current_cell_data} .= $chunk;
}
380              
# Reset the text of the cell currently being built to the empty string.
sub clear_cell {
    my ($self) = @_;
    return $self->{current_cell_data} = "";
}
385              
# Set how many times the next finished cell is added to the row.
# The count is normally 1; end_cell() resets it to 1 after use.
sub repeat_following_cell {
    my ( $self, $count ) = @_;
    return $self->{cell_repeat} = $count;
}
392              
# Set how many times the next finished row is delivered to the data
# handler.  The count is normally 1; end_row() resets it to 1 after use.
sub repeat_following_row {
    my ( $self, $count ) = @_;
    return $self->{row_repeat} = $count;
}
399              
# Install the data handler: the object whose workbook(), worksheet() and
# row() methods receive the parse callbacks.
#
#   $data_handler - required; the call dies with "No row handler provided"
#                   when it is missing or false
#
# When an object is installed as its own handler (as new() and parse_sxc()
# do), the stored reference is weakened.  A strong self-reference would
# form a cycle that keeps the object alive forever; the weak one stays
# valid for as long as the object itself exists.
# Returns the handler.
sub set_data_handler {
    my $self         = shift;
    my $data_handler = shift || die "No row handler provided";

    $self->{data_handler} = $data_handler;

    require Scalar::Util;
    if (   ref $self
        && ref $data_handler
        && Scalar::Util::refaddr($self) == Scalar::Util::refaddr($data_handler) )
    {
        Scalar::Util::weaken( $self->{data_handler} );
    }

    return $data_handler;
}
408              
# Return the data handler installed by set_data_handler() (undef when
# none has been installed).
sub get_data_handler {
    my ($self) = @_;
    return $self->{data_handler};
}
413              
# A row has ended: deliver its cells to the data handler's row() method,
# once per repeat count, then reset the repeat count and the cell list.
#
# Rows with a repeat count of 500 or more are dropped unless the "no_trim"
# option is set: the document pads the sheet with long repeats that are
# unlikely to be followed by data.  A repeated row is delivered as the
# same array reference each time.
sub end_row {
    my ($self) = @_;

    my $cells  = $self->{cells};
    my $repeat = $self->{row_repeat};

    if ( $repeat < 500 or $self->get_option( "no_trim" ) ) {
        $self->get_data_handler->row( $self, $cells ) for 1 .. $repeat;
    }

    $self->repeat_following_row( 1 );    # back to the default
    $self->reset_cell_list;              # next row starts empty
}
432              
# A cell has ended: add its text to the current row's cell list, once per
# repeat count, then reset the repeat count and the cell text.
#
# The newline that end_handler() appended after the last paragraph is
# removed first.  The cell text is undef until the first clear_cell(), so
# an empty cell met before any text was appended used to be stored as
# undef (later empty cells as ""); it is now always stored as "".
sub end_cell {
    my $self = shift;

    my $data = $self->{current_cell_data};
    $data = "" if ! defined $data;
    chomp $data;                         # remove the last newline

    push @{ $self->{cells} }, ($data) x $self->{cell_repeat};

    $self->repeat_following_cell( 1 );   # Default to 1
    $self->clear_cell;
}
444              
445              
446             1;
447             __END__