File Coverage

blib/lib/XML/Struct/Reader.pm
Criterion Covered Total %
statement 120 133 90.2
branch 88 108 81.4
condition 46 59 77.9
subroutine 18 19 94.7
pod 5 6 83.3
total 277 325 85.2


line stmt bran cond sub pod time code
1             package XML::Struct::Reader;
2 5     5   37 use strict;
  5         11  
  5         149  
3              
4 5     5   2654 use Moo;
  5         56728  
  5         29  
5 5     5   7473 use Carp qw(croak);
  5         23  
  5         297  
6             our @CARP_NOT = qw(XML::Struct);
7 5     5   30 use Scalar::Util qw(blessed);
  5         13  
  5         200  
8 5     5   29 use XML::Struct;
  5         12  
  5         1781  
9              
10             our $VERSION = '0.27';
11              
12             has whitespace => (is => 'ro', default => sub { 0 });
13             has attributes => (is => 'ro', default => sub { 1 });
14             has path => (is => 'ro', default => sub { '*' }, isa => \&_checkPath);
15             has stream => (is => 'rw',
16             lazy => 1,
17             builder => 1,
18             isa => sub {
19             die 'stream must be an XML::LibXML::Reader'
20             unless blessed $_[0] && $_[0]->isa('XML::LibXML::Reader');
21             }
22             );
23             has from => (is => 'ro', trigger => 1);
24             has ns => (is => 'ro', default => sub { 'keep' }, trigger => 1);
25             has depth => (is => 'ro', coerce => sub {
26             (defined $_[0] and $_[0] =~ /^\+?\d+/) ? $_[0] : undef
27             });
28             has deep => (is => 'ro', default => sub { '' } );
29             has simple => (is => 'ro', default => sub { 0 });
30             has root => (is => 'ro', default => sub { 0 });
31             has content => (is => 'ro', default => sub { 'content' });
32              
33 5         10049 use XML::LibXML::Reader qw(
34             XML_READER_TYPE_ELEMENT
35             XML_READER_TYPE_TEXT
36             XML_READER_TYPE_CDATA
37             XML_READER_TYPE_SIGNIFICANT_WHITESPACE
38             XML_READER_TYPE_END_ELEMENT
39 5     5   39 );
  5         10  
40              
41             sub BUILD {
42 44     44 0 185 my ($self) = @_;
43            
44             # make sure that option 'deep' and 'depth' are only set if it makes sense
45            
46 44 100       257 if ($self->deep eq 'simple') {
    50          
    100          
    50          
47 3 50 33     21 if ($self->simple or (defined $self->depth and $self->depth == 0)) {
      66        
48             # (deep = simple, simple = 1) or (deep = simple, depth = 0)
49 3         7 $self->{simple} = 1;
50 3         5 delete $self->{depth};
51 3         7 $self->{deep} = '';
52             }
53             } elsif ($self->deep eq 'struct') {
54 0         0 $self->{deep} = '';
55             } elsif ($self->deep eq '') {
56 39 100       134 $self->{deep} = $self->simple ? '' : 'simple';
57             } elsif ($self->deep !~ /^(dom|raw)$/) {
58 0         0 croak "option deep must be simple, struct, dom, or raw!";
59             }
60              
61 44 100 100     374 if (($self->depth || 0) and $self->root and $self->simple) {
      100        
      66        
62 2         13 $self->{depth} = $self->{depth}-1;
63             }
64             }
65              
66             sub _build_stream {
67 0     0   0 XML::LibXML::Reader->new( { IO => \*STDIN } )
68             }
69            
70             sub _trigger_from {
71 42     42   450 my ($self, $from) = @_;
72              
73 42 100 100     247 unless (blessed $from and $from->isa('XML::LibXML::Reader')) {
74 33         56 my %options;
75              
76 33 50 66     93 if (ref $from and ref $from eq 'HASH') {
77 0         0 %options = %$from;
78 0 0       0 $from = delete $options{from} if exists $options{from};
79             }
80              
81 33 50 33     469 if (!defined $from or $from eq '-') {
    100 100        
    100 100        
    50 66        
    100 100        
    100 66        
    50          
    50          
    0          
82 0         0 $options{IO} = \*STDIN
83             } elsif( !ref $from and $from =~ /^</ ) {
84 9         21 $options{string} = $from;
85             } elsif( ref $from and ref $from eq 'SCALAR' ) {
86 1         2 $options{string} = $$from;
87             } elsif( ref $from and ref $from eq 'GLOB' ) {
88 0         0 $options{FD} = $from;
89             } elsif( blessed $from and $from->isa('XML::LibXML::Document') ) {
90 3         135 $options{DOM} = $from;
91             } elsif( blessed $from and $from->isa('XML::LibXML::Element') ) {
92 1         35 my $doc = XML::LibXML->createDocument;
93 1         21 $doc->setDocumentElement($from);
94 1         26 $options{DOM} = $doc;
95             } elsif( blessed $from ) {
96 0         0 $options{IO} = $from;
97             } elsif( !ref $from ) {
98 19         47 $options{location} = $from; # filename or URL
99 0         0 } elsif( ! grep { $_ =~ /^(IO|string|location|FD|DOM)$/} keys %options ) {
100 0         0 croak "invalid option 'from': $from";
101             }
102            
103             $from = XML::LibXML::Reader->new( %options )
104             or die "failed to create XML::LibXML::Reader with "
105 33 50       142 . join(', ',map { "$_=".$options{$_} } keys %options )."\n";
  0         0  
106             }
107              
108 42         4263 $self->stream($from);
109             }
110              
111              
112             sub _trigger_ns {
113 5     5   188 my ($self, $ns) = @_;
114              
115 5 50 33     124 if (!defined $ns or $ns eq '') {
    50          
116 0         0 $self->{ns} = 'keep';
117             } elsif ($ns !~ /^(keep|strip|disallow)?$/) {
118 0         0 croak "invalid option 'ns': $ns";
119             }
120             }
121              
122              
123             sub _checkPath {
124 50     50   479 my $path = shift;
125              
126 50 50       302 die "invalid path: $path" if $path =~ qr{\.\.|.//|^\.};
127 50 50       266 die "relative path not supported: $path" if $path =~ qr{^[^/]+/};
128              
129 50         732 return $path;
130             }
131              
132             sub _nameMatch {
133 79   100 79   355 return ($_[0] eq '*' or $_[0] eq $_[1]);
134             }
135              
136             # read to the next element
137             # TODO: use XML::LibXML->nextPatternMatch
138             sub _nextPatternMatch {
139 68     68   145 my ($self, $stream, $path) = @_;
140              
141 68         115 $path =~ s{^//}{};
142 68 100       401 $path .= '*' if $path =~ qr{^$|/$};
143              
144 68         251 my @parts = split '/', $path;
145 68         136 my $relative = $parts[0] ne '';
146              
147 68         100 while(1) {
148 157 100       1741 return if !$stream->read; # end or error
149 137 100       480 next if $stream->nodeType != XML_READER_TYPE_ELEMENT;
150              
151             # printf " %d=%d %s:%s==%s\n", $stream->depth, scalar @parts, $stream->nodePath, $stream->name, join('/', @parts);
152              
153 79         183 my $name = $self->_name($stream);
154              
155 79 100       162 if ($relative) {
156 51 100       104 if (_nameMatch($parts[0], $name)) {
157 35         73 last;
158             }
159             } else {
160 28 100       91 if (!_nameMatch($parts[$stream->depth+1], $name)) {
    100          
161 6         42 $stream->nextSibling();
162             } elsif ($stream->depth == scalar @parts - 2) {
163 13         39 last;
164             }
165             }
166             }
167              
168 48         159 return 1;
169             }
170              
171             sub readNext {
172 68     68 1 7871 my $self = shift;
173 68 100       1515 my $stream = blessed $_[0] ? shift() : $self->stream;
174 68 100       546 my $path = defined $_[0] ? _checkPath($_[0]) : $self->path;
175              
176 68 100       160 return unless $self->_nextPatternMatch($stream, $path);
177              
178 48         118 my $xml = $self->readElement($stream);
179              
180 46 100       746 return $self->simple ? XML::Struct::Simple->new(
181             root => $self->root,
182             attributes => $self->attributes,
183             depth => $self->depth,
184             content => $self->content,
185             )->transform($xml) : $xml;
186             }
187              
188             *read = \&readNext;
189              
190              
191             sub readDocument {
192 32     32 1 66 my $self = shift;
193 32         46 my @document;
194            
195 32         83 while(my $element = $self->read(@_)) {
196 32 100       177 return $element unless wantarray;
197 18         106 push @document, $element;
198             }
199              
200 16         156 return @document;
201             }
202              
203             sub _name {
204 292     292   489 my ($self, $stream) = @_;
205              
206 292 100       799 if ($self->ns eq 'strip') {
    100          
207 10         44 return $stream->localName;
208             } elsif( $self->ns eq 'disallow' ) {
209 15 100       75 if ( $stream->name =~ /^xmlns(:.*)?$/) {
210 2         502 croak "namespaces not allowed at line ".$stream->lineNumber;
211             }
212             }
213              
214 280         993 return $stream->name;
215             }
216              
217              
218             sub readElement {
219 185     185 1 263 my $self = shift;
220 185 50       323 my $stream = @_ ? shift : $self->stream;
221              
222 185         317 my @element = ($self->_name($stream));
223              
224             # TODO: dom or raw
225 185 100 100     516 if (defined $self->depth and $stream->depth >= $self->depth) {
226 28 100       78 if ($self->deep eq 'dom') {
    100          
227 1         33 my $dom = $stream->copyCurrentNode(1);
228 1         10 $stream->next;
229 1         3 return $dom;
230             } elsif ($self->deep eq 'raw') {
231 1         45 my $xml = $stream->readOuterXml();
232 1         9 $stream->next;
233 1         4 return $xml;
234             }
235             #copyCurrentNode
236             #if (defined $self->depth and $self->depth == $stream->depth ) {
237             #print $stream->depth." ".$self->deep."!".$element[0]."\n";
238             #}
239             }
240              
241 183 100       414 if ($self->attributes) {
    100          
242 133         240 my $attr = $self->readAttributes($stream);
243 131 100       377 my $children = $stream->isEmptyElement ? [ ] : $self->readContent($stream);
244 131         214 push @element, $attr, $children;
245             } elsif( !$stream->isEmptyElement ) {
246 38         107 push @element, $self->readContent($stream);
247             }
248              
249 181         339 return \@element;
250             }
251              
252              
253             sub readAttributes {
254 133     133 1 179 my $self = shift;
255 133 50       227 my $stream = @_ ? shift : $self->stream;
256              
257 133 100       401 return { } if $stream->moveToFirstAttribute != 1;
258              
259 26         44 my $attr = { };
260 26         49 do {
261 30 100 100     135 if ($self->ns ne 'strip' or $stream->name !~ /^xmlns(:.*)?$/) {
262 28         112 $attr->{ $self->_name($stream) } = $stream->value;
263             }
264             } while ($stream->moveToNextAttribute);
265 24         77 $stream->moveToElement;
266              
267 24         38 return $attr;
268             }
269              
270              
271             sub readContent {
272 136     136 1 200 my $self = shift;
273 136 50       221 my $stream = @_ ? shift : $self->stream;
274              
275 136         196 my @children;
276 136         176 while(1) {
277 505         1386 $stream->read;
278 505         951 my $type = $stream->nodeType;
279              
280 505 100 66     1539 last if !$type or $type == XML_READER_TYPE_END_ELEMENT;
281              
282 369 100 100     1281 if ($type == XML_READER_TYPE_ELEMENT) {
    100 66        
    100          
283 137         307 push @children, $self->readElement($stream);
284             } elsif ($type == XML_READER_TYPE_TEXT or $type == XML_READER_TYPE_CDATA ) {
285 60         206 push @children, $stream->value;
286             } elsif ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE && $self->whitespace) {
287 1         9 push @children, $stream->value;
288             }
289             }
290            
291 136         268 return \@children;
292             }
293              
294             1;
295             __END__
296              
297             =encoding UTF-8
298              
299             =head1 NAME
300              
301             XML::Struct::Reader - Read XML streams into XML data structures
302              
303             =head1 SYNOPSIS
304              
305             my $reader = XML::Struct::Reader->new( from => "file.xml" );
306             my $data = $reader->read;
307              
308             =head1 DESCRIPTION
309              
310             This module reads an XML stream (via L<XML::LibXML::Reader>) into
311             L<XML::Struct>/MicroXML data structures.
312              
313             =head1 METHODS
314              
315             =head2 read = readNext ( [ $stream ] [, $path ] )
316              
317             Read the next XML element from a stream. If no path option is specified, the
318             reader's path option is used ("C<*>" by default, first matching the root, then
319             every other element).
320              
321             =head2 readDocument( [ $stream ] [, $path ] )
322              
323             Read an entire XML document. In contrast to C<read>/C<readNext>, this method
324             always reads the entire stream. The return value is the first element (that is
325             the root element by default) in scalar context and a list of elements in array
326             context. Multiple elements can be returned for instance when a path was
327             specified to select document fragments.
328              
329             =head2 readElement( [ $stream ] )
330              
331             Read an XML element from a stream and return it as array reference with element name,
332             attributes, and child elements. In contrast to method C<read>, this method expects
333             the stream to be at an element node (C<< $stream->nodeType == 1 >>) or bad things
334             might happed.
335              
336             =head2 readAttributes( [ $stream ] )
337              
338             Read all XML attributes from a stream and return a (possibly empty) hash
339             reference.
340              
341             =head2 readContent( [ $stream ] )
342              
343             Read all child elements of an XML element and return the result as (possibly
344             empty) array reference. Significant whitespace is only included if option
345             C<whitespace> is enabled.
346              
347             =head1 CONFIGURATION
348              
349             =over
350              
351             =item from
352              
353             A source to read from. Possible values include a string or string reference
354             with XML data, a filename, an URL, a file handle, instances of
355             L<XML::LibXML::Document> or L<XML::LibXML::Element>, and a hash reference with
356             options passed to L<XML::LibXML::Reader>.
357              
358             =item stream
359              
360             A L<XML::LibXML::Reader> to read from. If no stream has been defined, one must
361             pass a stream parameter to the C<read...> methods. Setting a source with option
362             C<from> automatically sets a stream.
363              
364             =item attributes
365              
366             Include attributes (enabled by default). If disabled, the representation of
367             an XML element will be
368              
369             [ $name => \@children ]
370              
371             instead of
372              
373             [ $name => \%attributes, \@children ]
374              
375             =item path
376              
377             Optional path expression to be used as default value when calling C<read>.
378             Pathes must either be absolute (starting with "C</>") or consist of a single
379             element name. The special name "C<*>" matches all element names.
380              
381             A path is a very reduced form of an XPath expressions (no axes, no "C<..>", no
382             node tests, C<//> only at the start...). Namespaces are not supported yet.
383              
384             =item whitespace
385              
386             Include ignorable whitespace as text elements (disabled by default)
387              
388             =item ns
389              
390             Define how XML namespaces should be processed. By default (value 'C<keep>'),
391             this document:
392              
393             <doc>
394             <x:foo xmlns:x="http://example.org/" bar="doz" />
395             </doc>
396              
397             is transformed to this structure, keeping namespace prefixes and declarations
398             as unprocessed element names and attributes:
399              
400             [ 'doc', {}, [
401             [
402             'x:foo', {
403             'bar' => 'doz',
404             'xmlns:x' => 'http://example.org/'
405             }
406             ]
407             ]
408              
409             Setting this option to 'C<strip>' will remove all namespace prefixes and
410             namespace declaration attributes, so the result would be:
411              
412             [ 'doc', {}, [
413             [
414             'foo', {
415             'bar' => 'doz'
416             }
417             ]
418             ]
419              
420             Setting this option to 'C<disallow>' results in an error when namespace
421             prefixes or declarations are read.
422              
423             Expanding namespace URIs ('C<expand'>) is not supported yet.
424              
425             =item simple
426              
427             Convert XML to simple key-value structure (SimpleXML) with
428             L<XML::Struct::Simple>.
429              
430             =item depth
431              
432             Only transform to a given depth, starting at C<0> for the root node. Negative
433             values, non-numeric values or C<undef> are ignored (unlimited depth as
434             default).
435              
436             XML elements below the depth are converted to SimpleXML by default or to
437             MicroXML if option C<simple> is enabled. This can be configured with option
438             C<deep>.
439              
440             This option is useful for instance to access document-oriented XML embedded in
441             data oriented XML.
442              
443             =item deep
444              
445             How to transform elements below given C<depth>. This option is experimental.
446              
447             =item root
448              
449             Include root element when converting to SimpleXML. Disabled by default.
450              
451             =item content
452              
453             Name of text content when converting to SimpleXML.
454              
455             =back
456              
457             =cut