File Coverage

blib/lib/XML/DT/Sequence.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package XML::DT::Sequence;
2             $XML::DT::Sequence::VERSION = '0.02';
3 1     1   14832 use XML::DT;
  0            
  0            
4              
5             use 5.006;
6             use strict;
7             use warnings;
8              
9             use base 'Exporter';
10             our @EXPORT = qw($u $c %v $q &father &gfather &ggfather &root);
11              
12             =encoding UTF-8
13              
14             =head1 NAME
15              
16             XML::DT::Sequence - Down Translator (XML::DT) for sequence XMLs
17              
18             =head1 SYNOPSIS
19              
20             A lot of XML files nowadays are just catalogues, simple sequences of
21             small chunks, that repeat, and repeat. These files can get enormous,
22             and DOM processing hard. SAX processing it interesting but not always
23             the best approach.
24              
25             This module chunks the XML file in Header, a sequence of the repeating
26             blocks, and a footer, and each one of these chunks can be processed by
27             DOM, using L technology.
28              
29              
30             use XML::DT::Sequence;
31              
32             my $dt = XML::DT::Sequence->new();
33              
34             $dt->process("file.xml",
35             -tag => 'item',
36             -head => sub {
37             my ($self, $xml) = @_;
38             # do something with $xml
39             },
40             -body => {
41             item => sub {
42             # XML::DT like handler
43             }
44             },
45             -foot => sub {
46             my ($self, $xml) = @_;
47             # do something with $xml
48             },
49             );
50              
51             =head1 EXPLANATION
52              
53             There are four options, only two mandatory: C<-tag> and
54             C<-body>. C<-tag> is the element name that repeats in the XML file,
55             and that you want to process one at a time. C<-body> is the handler to
56             process each one of these elements.
57              
58             C<-head> is the handler to process the XML that appears before the
59             first instance of the repeating element, and C<-foot> the handler to
60             process the XML that apperas after the last instance of the repeating
61             element.
62              
63             Each one of these handlers can be a code reference that receives the
64             C object and the XML string, or a hash reference,
65             with L handlers to process each XML snippet.
66              
67             Note that when processing header or footer, XML is incomplete, and the
68             parser can recover in weird ways.
69              
70             The C method returns a hash reference with three keys:
71             C<-head> is the return value of the C<-head> handler, and C<-foot> is
72             the return value of the C<-foot> handler. C<-body> is the number of
73             elements of the sequence that were processed.
74              
75             =head1 METHODS
76              
77             =head2 new
78              
79             Constructor.
80              
81             =head2 process
82              
83             Processor. Se explanation above.
84              
85             =head2 break
86              
87             Forces the process to finish. Useful when you processed enough number
88             of elements. Note that if you break the process the C<-foot> code will
89             not be run.
90              
91             If you are using a code reference as a handler, call it from the first
92             argument (reference to the object). If you are using a C
93             handler, C<< $u >> has the object, so just call C on it.
94              
95             =cut
96              
97             sub new {
98             my ($class) = @_;
99             return bless { } => $class;
100             }
101              
102             sub break {
103             my $self = shift;
104             $self->{BREAK} = 1;
105             }
106              
107             sub process {
108             my ($self, $file, %ops) = @_;
109              
110             die "Option -tag is mantatory." unless exists $ops{-tag};
111              
112             local $/ = "";
113              
114             # XXX - fixme... utf8?
115             open my $fh, "<:utf8", $file or die "Can't open file $file for reading [$!]";
116             my $firstChunk = <$fh>;
117              
118             die "No $/ tag found. Bailing out." unless $firstChunk =~ $/;
119              
120             my $head = $firstChunk;
121             $head =~ s/<$ops{-tag}.*//s;
122              
123             ## Process header if there is such a handler
124             my $headReturn = undef;
125             if (exists($ops{-head})) {
126             my $headCode = $ops{-head};
127             if (ref($headCode) eq "CODE") {
128             $headReturn = $headCode->($self, $head);
129             }
130             elsif (ref($headCode) eq "HASH") {
131             $headReturn = dtstring($head, -recover => 1, -userdata => $self, %$headCode);
132             }
133             else {
134             die "No idea what to do with -head of type ".ref($ops{-head});
135             }
136             }
137              
138             ## process the sequence
139             my $chunk = $firstChunk;
140             my $totalElems = 0;
141             my $bodyCode = $ops{-body} || undef;
142             my $code;
143              
144             if (!$bodyCode) {
145             $code = sub { };
146             } elsif (ref($bodyCode) eq "CODE") {
147             $code = sub { $bodyCode->($self, $_[0]) };
148             } elsif (ref($bodyCode) eq "HASH") {
149             $code = sub { dtstring($_[0], -userdata=> $self, %$bodyCode) }
150             } else {
151             die "No idea what to do with -body of type ".ref($ops{-body});
152             }
153              
154             do {
155             ++$totalElems;
156             $chunk =~ s/^.*(?=<$ops{-tag})//s;
157             $code->($chunk);
158             $chunk = <$fh>;
159             } while ($chunk =~ m{} and !$self->{BREAK});
160              
161             my $footReturn;
162             if (!$self->{BREAK}) {
163             if (exists($ops{-foot})) {
164             my $footCode = $ops{-foot};
165             if (ref($footCode) eq "CODE") {
166             $footReturn = $footCode->($self, $chunk);
167             }
168             elsif (ref($footCode) eq "HASH") {
169             $chunk =~ s{^\s*}{}g;
170             $footReturn = dtstring($chunk,
171             -userdata => $self,
172             -recover => 1, %$footCode);
173             }
174             else {
175             die "No idea what to do with -foot of type ".ref($ops{-foot});
176             }
177             }
178             }
179              
180             close $fh;
181              
182             return {
183             -head => $headReturn,
184             -body => $totalElems,
185             -foot => $footReturn,
186             };
187             }
188              
189             =head1 AUTHOR
190              
191             Alberto Simões, C<< >>
192              
193             =head1 BUGS
194              
195             Please report any bugs or feature requests to C
196             rt.cpan.org>, or through the web interface at
197             L. I
198             will be notified, and then you'll automatically be notified of
199             progress on your bug as I make changes.
200              
201              
202              
203              
204             =head1 SUPPORT
205              
206             You can find documentation for this module with the perldoc command.
207              
208             perldoc XML::DT::Sequence
209              
210              
211             You can also look for information at:
212              
213             =over 4
214              
215             =item * RT: CPAN's request tracker (report bugs here)
216              
217             L
218              
219             =item * AnnoCPAN: Annotated CPAN documentation
220              
221             L
222              
223             =item * CPAN Ratings
224              
225             L
226              
227             =item * Search CPAN
228              
229             L
230              
231             =back
232              
233             =head1 KNOWN BUGS AND LIMITATIONS
234              
235             =over 4
236              
237             =item * Spaced tags
238              
239             It is not usual, but XML allows the usage of spaces inside element
240             tags, for instance, between the C<< < >> and the element name. This is
241             B supported.
242              
243             =item * Multiple usage tags
244              
245             If the same tag is used in different levels of the XML hierarchy, it
246             is likely that the implemented algorithm will not work.
247              
248             =back
249              
250             =head1 ACKNOWLEDGEMENTS
251              
252              
253             =head1 LICENSE AND COPYRIGHT
254              
255             Copyright 2012 Alberto Simões.
256              
257             This program is free software; you can redistribute it and/or modify it
258             under the terms of either: the GNU General Public License as published
259             by the Free Software Foundation; or the Artistic License.
260              
261             See http://dev.perl.org/licenses/ for more information.
262              
263              
264             =cut
265              
266             1; # End of XML::DT::Sequence