File Coverage

blib/lib/XML/DT/Sequence.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package XML::DT::Sequence;
2              
3 1     1   21443 use XML::DT;
  0            
  0            
4              
5             use 5.006;
6             use strict;
7             use warnings;
8              
9             use base 'Exporter';
10             our @EXPORT = qw($u $c %v $q &father &gfather &ggfather &root);
11              
12             =encoding UTF-8
13              
14             =head1 NAME
15              
16             XML::DT::Sequence - Down Translator (XML::DT) for sequence XMLs
17              
18             =head1 VERSION
19              
20             Version 0.01
21              
22             =cut
23              
24             our $VERSION = '0.01';
25              
26              
27             =head1 SYNOPSIS
28              
29             A lot of XML files nowadays are just catalogues, simple sequences of
30             small chunks, that repeat, and repeat. These files can get enormous,
31             and DOM processing hard. SAX processing it interesting but not always
32             the best approach.
33              
34             This module chunks the XML file in Header, a sequence of the repeating
35             blocks, and a footer, and each one of these chunks can be processed by
36             DOM, using L technology.
37              
38              
39             use XML::DT::Sequence;
40              
41             my $dt = XML::DT::Sequence->new();
42              
43             $dt->process("file.xml",
44             -tag => 'item',
45             -head => sub {
46             my ($self, $xml) = @_;
47             # do something with $xml
48             },
49             -body => {
50             item => sub {
51             # XML::DT like handler
52             }
53             },
54             -foot => sub {
55             my ($self, $xml) = @_;
56             # do something with $xml
57             },
58             );
59              
60             =head1 EXPLANATION
61              
62             There are four options, only two mandatory: C<-tag> and
63             C<-body>. C<-tag> is the element name that repeats in the XML file,
64             and that you want to process one at a time. C<-body> is the handler to
65             process each one of these elements.
66              
67             C<-head> is the handler to process the XML that appears before the
68             first instance of the repeating element, and C<-foot> the handler to
69             process the XML that apperas after the last instance of the repeating
70             element.
71              
72             Each one of these handlers can be a code reference that receives the
73             C object and the XML string, or a hash reference,
74             with L handlers to process each XML snippet.
75              
76             Note that when processing header or footer, XML is incomplete, and the
77             parser can recover in weird ways.
78              
79             The C method returns a hash reference with three keys:
80             C<-head> is the return value of the C<-head> handler, and C<-foot> is
81             the return value of the C<-foot> handler. C<-body> is the number of
82             elements of the sequence that were processed.
83              
84             =head1 METHODS
85              
86             =head2 new
87              
88             Constructor.
89              
90             =head2 process
91              
92             Processor. Se explanation above.
93              
94             =head2 break
95              
96             Forces the process to finish. Useful when you processed enough number
97             of elements. Note that if you break the process the C<-foot> code will
98             not be run.
99              
100             If you are using a code reference as a handler, call it from the first
101             argument (reference to the object). If you are using a C
102             handler, C<< $u >> has the object, so just call C on it.
103              
104             =cut
105              
106             sub new {
107             my ($class) = @_;
108             return bless { } => $class;
109             }
110              
111             sub break {
112             my $self = shift;
113             $self->{BREAK} = 1;
114             }
115              
116             sub process {
117             my ($self, $file, %ops) = @_;
118              
119             die "Option -tag is mantatory." unless exists $ops{-tag};
120              
121             local $/ = "";
122              
123             # XXX - fixme... utf8?
124             open my $fh, "<:utf8", $file or die "Can't open file $file for reading [$!]";
125             my $firstChunk = <$fh>;
126              
127             die "No $/ tag found. Bailing out." unless $firstChunk =~ $/;
128              
129             my $head = $firstChunk;
130             $head =~ s/<$ops{-tag}.*//s;
131              
132             ## Process header if there is such a handler
133             my $headReturn = undef;
134             if (exists($ops{-head})) {
135             my $headCode = $ops{-head};
136             if (ref($headCode) eq "CODE") {
137             $headReturn = $headCode->($self, $head);
138             }
139             elsif (ref($headCode) eq "HASH") {
140             $headReturn = dtstring($head, -recover => 1, -userdata => $self, %$headCode);
141             }
142             else {
143             die "No idea what to do with -head of type ".ref($ops{-head});
144             }
145             }
146              
147             ## process the sequence
148             my $chunk = $firstChunk;
149             my $totalElems = 0;
150             my $bodyCode = $ops{-body} || undef;
151             my $code;
152              
153             if (!$bodyCode) {
154             $code = sub { };
155             } elsif (ref($bodyCode) eq "CODE") {
156             $code = sub { $bodyCode->($self, $_[0]) };
157             } elsif (ref($bodyCode) eq "HASH") {
158             $code = sub { dtstring($_[0], -userdata=> $self, %$bodyCode) }
159             } else {
160             die "No idea what to do with -body of type ".ref($ops{-body});
161             }
162              
163             do {
164             ++$totalElems;
165             $chunk =~ s/^.*(?=<$ops{-tag})//s;
166             $code->($chunk);
167             $chunk = <$fh>;
168             } while ($chunk =~ m{} and !$self->{BREAK});
169              
170             my $footReturn;
171             if (!$self->{BREAK}) {
172             if (exists($ops{-foot})) {
173             my $footCode = $ops{-foot};
174             if (ref($footCode) eq "CODE") {
175             $footReturn = $footCode->($self, $chunk);
176             }
177             elsif (ref($footCode) eq "HASH") {
178             $chunk =~ s{^\s*}{}g;
179             $footReturn = dtstring($chunk,
180             -userdata => $self,
181             -recover => 1, %$footCode);
182             }
183             else {
184             die "No idea what to do with -foot of type ".ref($ops{-foot});
185             }
186             }
187             }
188              
189             close $fh;
190              
191             return {
192             -head => $headReturn,
193             -body => $totalElems,
194             -foot => $footReturn,
195             };
196             }
197              
198             =head1 AUTHOR
199              
200             Alberto Simões, C<< >>
201              
202             =head1 BUGS
203              
204             Please report any bugs or feature requests to C
205             rt.cpan.org>, or through the web interface at
206             L. I
207             will be notified, and then you'll automatically be notified of
208             progress on your bug as I make changes.
209              
210              
211              
212              
213             =head1 SUPPORT
214              
215             You can find documentation for this module with the perldoc command.
216              
217             perldoc XML::DT::Sequence
218              
219              
220             You can also look for information at:
221              
222             =over 4
223              
224             =item * RT: CPAN's request tracker (report bugs here)
225              
226             L
227              
228             =item * AnnoCPAN: Annotated CPAN documentation
229              
230             L
231              
232             =item * CPAN Ratings
233              
234             L
235              
236             =item * Search CPAN
237              
238             L
239              
240             =back
241              
242             =head1 KNOWN BUGS AND LIMITATIONS
243              
244             =over 4
245              
246             =item * Spaced tags
247              
248             It is not usual, but XML allows the usage of spaces inside element
249             tags, for instance, between the C<< < >> and the element name. This is
250             B supported.
251              
252             =item * Multiple usage tags
253              
254             If the same tag is used in different levels of the XML hierarchy, it
255             is likely that the implemented algorithm will not work.
256              
257             =back
258              
259             =head1 ACKNOWLEDGEMENTS
260              
261              
262             =head1 LICENSE AND COPYRIGHT
263              
264             Copyright 2012 Alberto Simões.
265              
266             This program is free software; you can redistribute it and/or modify it
267             under the terms of either: the GNU General Public License as published
268             by the Free Software Foundation; or the Artistic License.
269              
270             See http://dev.perl.org/licenses/ for more information.
271              
272              
273             =cut
274              
275             1; # End of XML::DT::Sequence