File Coverage

blib/lib/XML/CompactTree.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package XML::CompactTree;
2              
3 3     3   96764 use warnings;
  3         8  
  3         228  
4 3     3   17 use strict;
  3         4  
  3         163  
5              
6             =head1 NAME
7              
8             XML::CompactTree - builder of compact tree structures from XML documents
9              
10             =head1 VERSION
11              
12             Version 0.03
13              
14             =cut
15              
16             our $VERSION = '0.03';
17              
18 3     3   15 use base qw(Exporter);
  3         9  
  3         720  
19 3     3   16 use vars qw( @EXPORT @EXPORT_OK %EXPORT_TAGS );
  3         5  
  3         212  
20 3     3   1290 use XML::LibXML::Reader;
  0            
  0            
21              
22             # XCT_USE_QNAMES /* not yet implemented */
23             # XCT_TEXT_AS_STRING /* not yet implemented */
24             # XCT_PRESERVE_PARENT /* not yet implemented */
25             # XCT_MERGE_TEXT_NODES /* not yet implemented */
26              
# Define the XCT_* flag constants. Each name gets a distinct single-bit value
# determined by its position in the list below (first name => 1, second => 2,
# third => 4, ...), so the order of the names must not be changed.
# The list of names is also registered as the ":flags" export tag.
use constant do {
    my @flag_names = qw(
        XCT_IGNORE_WS
        XCT_IGNORE_SIGNIFICANT_WS
        XCT_IGNORE_PROCESSING_INSTRUCTIONS
        XCT_IGNORE_COMMENTS
        XCT_USE_QNAMES
        XCT_KEEP_NS_DECLS
        XCT_TEXT_AS_STRING
        XCT_ATTRIBUTE_ARRAY
        XCT_PRESERVE_PARENT
        XCT_MERGE_TEXT_NODES
        XCT_LINE_NUMBERS
        XCT_DOCUMENT_ROOT
    );
    $EXPORT_TAGS{flags} = \@flag_names;

    # Walk the names in order, handing out the next higher bit to each one.
    my %bit_for;
    my $bit = 1;
    for my $flag_name (@flag_names) {
        $bit_for{$flag_name} = $bit;
        $bit <<= 1;
    }

    # "use constant" accepts a hash reference of name => value pairs.
    \%bit_for;
};
46              
# Build the Exporter lists at compile time: every symbol mentioned in any
# export tag defined so far is exported by default and is also available on
# request; the ":all" tag refers to that same list.
BEGIN {
    @EXPORT = ();
    for my $tagged_symbols (values %EXPORT_TAGS) {
        push @EXPORT, @{$tagged_symbols};
    }
    @EXPORT_OK = @EXPORT;
    $EXPORT_TAGS{all} = \@EXPORT_OK;
}
52              
53             =head1 SYNOPSIS
54              
55             use XML::CompactTree;
56             use XML::LibXML::Reader;
57              
58             my $reader = XML::LibXML::Reader->new(location => $url);
59             ...
60             my $tree = XML::CompactTree::readSubtreeToPerl($reader);
61             ...
62              
63             =head1 DESCRIPTION
64              
65             This module provides functions that use XML::LibXML::Reader to parse
66             an XML document into a parse tree formed of nested arrays (and hashes).
67              
68             It aims to be fast in doing that and to preserve all relevant
69             information from the XML (including namespaces, document order, mixed
70             content, etc.). It sacrifices user friendliness for speed.
71              
72             IMPORTANT: There is an even more efficient XS implementation of this
73             module called XML::CompactTree::XS with 100% equivalent functionality.
74              
75             =head1 PURPOSE
76              
77             I wrote this module because I noticed that repeated calls to methods
78             implemented in C (XS) were very expensive in Perl.
79              
80             Therefore traversing a large DOM tree using XML::LibXML or iterating
81             over an XML stream using XML::LibXML::Reader was much slower than
82             traversing similarly large and structured native Perl data
83             structures.
84              
85             This module allows the user to build a document parse tree consisting
86             of native Perl data structures (arrays and optionally hashes) using
87             XML::LibXML::Reader with minimal number of XS calls.
88              
89             (Note that XML::CompactTree::XS is a 100% equivalent of this
90             module that manages the same with just one XS call.)
91              
92             It does not provide full DOM navigation but attempts to provide
93             maximum amount of information. Its memory footprint should be
94             somewhat smaller than that of a corresponding XML::LibXML DOM tree.
95              
96             =head1 EXPORT
97              
98             By default, the following constants are exported (C<:flags> export
99             tag) to be used as flags for the tree builder:
100              
101             XCT_IGNORE_WS
102             XCT_IGNORE_SIGNIFICANT_WS
103             XCT_IGNORE_PROCESSING_INSTRUCTIONS
104             XCT_IGNORE_COMMENTS
105             XCT_USE_QNAMES /* not yet implemented */
106             XCT_KEEP_NS_DECLS
107             XCT_TEXT_AS_STRING /* not yet implemented */
108             XCT_ATTRIBUTE_ARRAY
109             XCT_PRESERVE_PARENT /* not yet implemented */
110             XCT_MERGE_TEXT_NODES /* not yet implemented */
111             XCT_DOCUMENT_ROOT
112              
113             =head1 FUNCTIONS
114              
115             =head2 readSubtreeToPerl( $reader, $flags, \my %ns )
116              
117             Uses a given XML::LibXML::Reader parser object to parse a subtree at
118             the current reader position to build a tree formed of nested arrays
119             (see L</OUTPUT FORMAT>).
120              
121             =over 4
122              
123             =item reader
124              
125             A XML::LibXML::Reader object to use as the reader. While building the
126             tree, the reader moves to the next node on the current or higher
127             level.
128              
129             =item flags
130              
131             An integer consisting of 1 bit flags (see constants in the EXPORT section).
132             Use binary or (|) to combine individual flags.
133              
134             The following flags are NOT implemented yet:
135              
136             XCT_USE_QNAMES, XCT_TEXT_AS_STRING, XCT_PRESERVE_PARENT, XCT_MERGE_TEXT_NODES
137              
138             =item ns
139              
140             You may pass an empty hash reference that will be populated by a
141             namespace_uri to namespace_index map, that can be used to decode
142             namespace indexes in the resulting data structure (see L</OUTPUT
143             FORMAT>).
144              
145              
146             =back
147              
148             =cut
149              
# readSubtreeToPerl( $reader, $flags, \%ns )
#
# Parse the subtree at the current reader position and return its
# representation (the first top-level node built by _readSubtreeToPerl),
# or undef when nothing could be read.
#
#   $reader - XML::LibXML::Reader object to read from
#   $flags  - optional bitwise OR of XCT_* constants (defaults to 0)
#   $ns     - optional hash reference that receives the
#             namespace URI => namespace index mapping
sub readSubtreeToPerl {
    my ($reader, $flags, $ns) = @_;

    # The flags argument is optional (the SYNOPSIS omits it); normalise it so
    # the bitwise tests in the worker do not warn about an undefined value.
    $flags ||= 0;
    $ns    ||= {};
    $ns->{''} = 0;    # index 0 always stands for "no namespace"

    # Start numbering new namespaces after the highest index already present
    # in the map, so that a map reused across several calls never gets two
    # URIs assigned to the same index. For a fresh map this yields 1.
    my $free_ns_index = 1;
    for my $index (values %$ns) {
        next unless defined $index;
        $free_ns_index = $index + 1 if $index >= $free_ns_index;
    }

    my $ret = _readSubtreeToPerl($reader, $flags, $ns, $free_ns_index, 0);
    return $ret ? $ret->[0] : undef;
}
157              
158             =head2 readLevelToPerl( $reader, $flags, $ns )
159              
160             Like C<readSubtreeToPerl>, but reads the subtree
161             at the current reader position and all its following siblings.
162             It returns an array reference of representations of these subtrees
163             as in the format described in L</OUTPUT FORMAT>.
164              
165             =cut
166              
# readLevelToPerl( $reader, $flags, \%ns )
#
# Like readSubtreeToPerl, but also consumes the following siblings of the
# node at the current reader position. Returns whatever the worker
# _readSubtreeToPerl returns: an array reference of the top-level node
# representations.
#
#   $reader - XML::LibXML::Reader object to read from
#   $flags  - optional bitwise OR of XCT_* constants (defaults to 0)
#   $ns     - optional hash reference that receives the
#             namespace URI => namespace index mapping
sub readLevelToPerl {
    my ($reader, $flags, $ns) = @_;

    # The flags argument is optional; normalise it so the bitwise tests in
    # the worker do not warn about an undefined value.
    $flags ||= 0;
    $ns    ||= {};
    $ns->{''} = 0;    # index 0 always stands for "no namespace"

    # Start numbering new namespaces after the highest index already present
    # in the map, so that a map reused across several calls never gets two
    # URIs assigned to the same index. For a fresh map this yields 1.
    my $free_ns_index = 1;
    for my $index (values %$ns) {
        next unless defined $index;
        $free_ns_index = $index + 1 if $index >= $free_ns_index;
    }

    my $ret = _readSubtreeToPerl($reader, $flags, $ns, $free_ns_index, 1);
    return $ret;
}
174              
# _readSubtreeToPerl( $reader, $flags, $ns_map, $free_ns_index, $read_siblings )
#
# Internal worker behind readSubtreeToPerl() and readLevelToPerl(). Walks the
# reader forward node by node and assembles nested array references.
#
#   $reader         - reader object positioned on the node to start from
#   $flags          - bitwise OR of XCT_* constants (undef is treated as 0)
#   $ns_map         - hash ref mapping namespace URI => index; new namespaces
#                     encountered while reading are added to it
#   $free_ns_index  - index to assign to the next previously unseen namespace
#   $read_siblings  - when true, keep reading the following siblings of the
#                     start node instead of stopping after its subtree
#
# Returns an array reference with the top-level node representations, or
# nothing when the reader is in its initial state and the first read() does
# not succeed.
sub _readSubtreeToPerl {
    my ($reader, $flags, $ns_map, $free_ns_index, $read_siblings) = @_;
    $flags ||= 0;    # tolerate an omitted flags argument without warnings

    my @parents;     # stack of nodes that currently have an open child list
    my ($av, $prev, $kids, $ret, $type, $name);
    my $cur_depth   = $reader->depth();
    my $start_depth = $cur_depth;
    my $prev_depth  = $start_depth;
    my $top         = [];

    # Node type 0 means the reader has not been advanced to a node yet.
    if ($reader->nodeType() == 0) {
        return if $reader->read() != 1;
        if ($flags & XCT_DOCUMENT_ROOT) {
            # Synthesize a document node and treat it as the parent of
            # everything that follows by starting one level higher.
            $prev = [ XML_READER_TYPE_DOCUMENT, $reader->encoding ];
            $start_depth--;
            $prev_depth--;
            push @$top, $prev;
            push @parents, $prev;
        }
    }

    # Reading continues while the reader stays deeper than this depth; with
    # $read_siblings the nodes at the starting depth itself are included.
    my $stop_depth = $start_depth - ($read_siblings ? 1 : 0);

    do {
        $type = $reader->nodeType();
        if (   $type == XML_READER_TYPE_NONE
            or $type == XML_READER_TYPE_ATTRIBUTE
            or $type == XML_READER_TYPE_DOCUMENT_TYPE
            or $type == XML_READER_TYPE_END_ELEMENT
            or $type == XML_READER_TYPE_ENTITY
            or $type == XML_READER_TYPE_END_ENTITY
            or $type == XML_READER_TYPE_XML_DECLARATION) {
            # Node types that are never recorded in the tree.
            $ret = $reader->read();
        }
        elsif (
            (       ($flags & (XCT_IGNORE_WS | XCT_IGNORE_SIGNIFICANT_WS))
                and $type == XML_READER_TYPE_WHITESPACE)
            or (    ($flags & XCT_IGNORE_SIGNIFICANT_WS)
                and $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
            or (    ($flags & XCT_IGNORE_COMMENTS)
                and $type == XML_READER_TYPE_COMMENT)
            or (    ($flags & XCT_IGNORE_PROCESSING_INSTRUCTIONS)
                and $type == XML_READER_TYPE_PROCESSING_INSTRUCTION)
        ) {
            # Node types the caller asked to skip.
            $ret = $reader->read();
        }
        else {
            # Every recorded node starts with its type in slot 0.
            $av = [ $type ];

            if ($type == XML_READER_TYPE_ELEMENT) {
                push @$av, $reader->localName();

                # Slot 2: namespace index (0 = no namespace).
                $name = $reader->namespaceURI();
                if ($name) {
                    if (exists $ns_map->{$name}) {
                        push @$av, $ns_map->{$name} || 0;
                    }
                    else {
                        push @$av, $free_ns_index;
                        $ns_map->{$name} = $free_ns_index;
                        $free_ns_index++;
                    }
                }
                else {
                    push @$av, 0;
                }

                # Slot 3: attributes (array or hash reference), or undef.
                if ($reader->hasAttributes() && $reader->moveToFirstAttribute() == 1) {
                    my @attrs;    # flat name/value list in document order
                    do {
                        $name = $reader->name();
                        # Only "xmlns" itself and "xmlns:prefix" are namespace
                        # declarations; other names that merely begin with
                        # the letters "xmlns" are ordinary attributes.
                        my $is_ns_decl = ($name eq 'xmlns' || index($name, 'xmlns:') == 0);
                        if (($flags & XCT_KEEP_NS_DECLS) || !$is_ns_decl) {
                            push @attrs, $name, scalar $reader->value();
                        }
                    } while ($reader->moveToNextAttribute() == 1);

                    # Put the reader back on the owning element in both
                    # attribute modes, so later queries refer to the element.
                    $reader->moveToElement();

                    push @$av, ($flags & XCT_ATTRIBUTE_ARRAY) ? \@attrs : +{ @attrs };
                }
                else {
                    push @$av, undef;    # no attributes
                }

                if ($flags & XCT_LINE_NUMBERS) {
                    push @$av, $reader->lineNumber();
                }
            }
            elsif ($type == XML_READER_TYPE_TEXT
                or $type == XML_READER_TYPE_CDATA
                or $type == XML_READER_TYPE_COMMENT
                or $type == XML_READER_TYPE_WHITESPACE
                or $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
                push @$av, $reader->value();
            }
            elsif ($type == XML_READER_TYPE_ENTITY_REFERENCE
                or $type == XML_READER_TYPE_PROCESSING_INSTRUCTION
                or $type == XML_READER_TYPE_NOTATION) {
                push @$av, $reader->localName();
                push @$av, $reader->value();
            }
            elsif ($type == XML_READER_TYPE_DOCUMENT
                or $type == XML_READER_TYPE_DOCUMENT_FRAGMENT) {
                push @$av, $reader->encoding();
            }

            # Attach the new node to the tree according to how the depth
            # changed relative to the previously recorded node.
            if ($cur_depth == $start_depth) {
                # A node on the starting level goes to the result list.
                push @$top, $av;
                $prev_depth = $cur_depth;
                $kids       = undef;
            }
            elsif ($cur_depth > $prev_depth) {
                # First child of the previous node: open its child list,
                # which becomes the last slot of that node.
                $kids = [];
                push @$prev,   $kids;
                push @$kids,   $av;
                push @parents, $prev;
                $prev_depth = $cur_depth;
            }
            elsif ($cur_depth == $prev_depth) {
                # Another sibling in the currently open child list.
                push @$kids, $av if $kids;
            }
            else {
                # Moved back up: drop the parents of the levels just left and
                # continue in the child list (last slot) of the parent that
                # is now on top of the stack.
                do {
                    $prev_depth--;
                    pop @parents;
                } while ($cur_depth < $prev_depth);
                my $parent = $parents[-1];
                if ($parent) {
                    $prev = $parent;
                    my $child_list = $prev->[-1];
                    if ($child_list) {
                        $kids = $child_list;
                        push @$kids, $av;
                    }
                }
            }
            $prev = $av;
            $ret  = $reader->read();
        }
    } while ($ret == 1 && ($cur_depth = $reader->depth()) > $stop_depth);

    # If reading stopped on the end tag that closes the starting level, step
    # past it so the reader is left on the next node.
    if (   $ret == 1
        && $reader->depth() == $start_depth
        && $reader->nodeType() == XML_READER_TYPE_END_ELEMENT) {
        $reader->read();
    }
    return $top;
}
324              
325             =head1 OUTPUT FORMAT
326              
327             The result of parsing a subtree is a Perl array reference C<$node>
328             that contains a node type followed by node data whose interpretation on
329             further positions in $node depends on the node type, as described
330             below:
331              
332             =head2 Any Node
333              
334             =over 5
335              
336             =item *
337              
338             $node->[0] is an integer representing the node type. Use
339             XML::LibXML::Reader node-type constants, e.g. XML_READER_TYPE_ELEMENT
340             for an element node, XML_READER_TYPE_TEXT for text node, etc.
341              
342             =back
343              
344             =head2 Document or Document Fragment Nodes
345              
346             =over 5
347              
348             =item *
349              
350             $node->[1] contains the document encoding
351              
352             =item *
353              
354             $node->[2] is an array reference containing similar representation of
355             all the child nodes of the document (fragment).
356              
357             =back
358              
359             Note: XML::LibXML::Reader does not provide a document node by default, which
360             means that calling readSubtreeToPerl on a reader object in its initial
361             state only parses the first node in the document (which can be the
362             root element, but also a comment or a processing instruction). Use
363             XCT_DOCUMENT_ROOT flag to force creating a document node in such case.
364              
365             =head2 Element nodes
366              
367             =over 5
368              
369             =item *
370              
371             $node->[1] is the local name (UTF-8 encoded character string)
372              
373             =item *
374              
375             $node->[2] is the namespace index (see L</NAMESPACES> below)
376              
377             =item *
378              
379             $node->[3] is undef if the element has no attributes. Otherwise if
380             XCT_ATTRIBUTE_ARRAY flag was used, $node->[3] is an array reference of
381             the form C<[ name1, value1, name2, value2, ....]> of attribute names and
382             corresponding values. If XCT_ATTRIBUTE_ARRAY flag was not used, then
383             $node->[3] is a hash reference mapping attribute names to the
384             corresponding attribute values C<{ name1=>value1, name2=>value2...}>
385              
386             The flag XCT_KEEP_NS_DECLS controls whether namespace declarations
387             (xmlns=... or xmlns:prefix=...) are included along with normal
388             attributes or not.
389              
390             Note: there is no support for namespaced attributes yet, but the
391             attribute names are stored as QNames, so one can always use
392             XCT_KEEP_NS_DECLS to keep track of namespace prefix declarations and
393             do the resolving manually. Support for namespaced attributes is
394             planned.
395              
396             =item *
397              
398             If XCT_LINE_NUMBERS flag was used, $node->[4] contains the line number
399             of the element and $node->[5] contains an array reference containing
400             similar representations of the child nodes of the current node.
401              
402             =item *
403              
404             If XCT_LINE_NUMBERS flag was NOT used, $node->[4] contains an array
405             reference of similar representations of the child nodes of the current
406             node.
407              
408             =back
409              
410             =head2 Text, CDATA, Comment and White-Space Nodes
411              
412             =over 5
413              
414             =item *
415              
416             $node->[1] contains the node value (UTF-8 encoded character string)
417              
418             =back
419              
420             =head2 Unparsed Entity, Processing-Instruction, and Notation Nodes
421              
422             =over 5
423              
424             =item *
425              
426             $node->[1] contains the local name (there is no support for
427             namespaces on these types of nodes yet)
428              
429             =item *
430              
431             $node->[2] contains the node value
432              
433             =back
434              
435             =head2 Skipping Less-Significant Nodes
436              
437             White-space (non-significant or significant), processing-instruction
438             and comment nodes can be completely skipped, using the following
439             flags:
440              
441             XCT_IGNORE_WS
442             XCT_IGNORE_SIGNIFICANT_WS
443             XCT_IGNORE_PROCESSING_INSTRUCTIONS
444             XCT_IGNORE_COMMENTS
445              
446             =head1 NAMESPACES
447              
448             Namespaces of element nodes are stored in the element node as an
449             integer. 0 always represents nodes without namespace, all other
450             namespaces are assigned unique numbers in an increasing order as they
451             appear. You can pass an empty hash reference to the parsing functions
452             to obtain the mapping.
453              
454             =head2 Example
455              
456             use XML::CompactTree;
457             use XML::LibXML::Reader;
458              
459             my $reader = XML::LibXML::Reader->new(location => $ARGV[0]);
460             my %ns;
461             my $data = XML::CompactTree::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT, \%ns );
462             my @ns_map; $ns_map[$ns{$_}]=$_ for keys %ns;
463             my @nodes = ($data);
464             while (@nodes) {
465             my $node = shift @nodes;
466             my $type = $node->[0];
467             if ($type == XML_READER_TYPE_ELEMENT) {
468             print "element $node->[1] is from ns $node->[2] '$ns_map[$node->[2]]'\n";
469             push @nodes, @{$node->[4] || []}; # queue children (slot is absent for childless elements)
470             } elsif ($type == XML_READER_TYPE_DOCUMENT) {
471             push @nodes, @{$node->[2]}; # queue children
472             }
473             }
474              
475             =head1 PLANNED FEATURES
476              
477             Planned flags:
478              
479             XCT_USE_QNAMES - use QNames instead of local names for all nodes
480             XCT_TEXT_AS_STRING - put text nodes into the tree as plain scalars
481             XCT_PRESERVE_PARENT - add a slot with a weak reference to the parent node
482             XCT_MERGE_TEXT_NODES - merge adjacent text/cdata nodes together
483              
484             Features: allow blessing the array refs to default or user-specified
485             classes; the default classes would provide a very small subset of DOM
486             methods to retrieve node information, manipulate the tree, and
487             possibly serialize the parse tree back to XML.
488              
489             =head1 AUTHOR
490              
491             Petr Pajas, C<< >>
492              
493             =head1 BUGS
494              
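495             Please report any bugs or feature requests to
496             C<bug-xml-compacttree at rt.cpan.org>, or through the web interface at
497             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=XML-CompactTree>.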
498             I will be notified, and then you'll automatically be notified of progress on
499             your bug as I make changes.
500              
501             =head1 COPYRIGHT & LICENSE
502              
503             Copyright 2008-2009 Petr Pajas, All Rights Reserved.
504              
505             This program is free software; you can redistribute it and/or modify it
506             under the same terms as Perl itself.
507              
508             =head1 SEE ALSO
509              
510             XML::CompactTree::XS
511              
512             XML::LibXML::Reader
513              
514             =cut
515              
516              
517             1; # End of XML::CompactTree