| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package XML::CompactTree; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
96764
|
use warnings; |
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
228
|
|
|
4
|
3
|
|
|
3
|
|
17
|
use strict; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
163
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
XML::CompactTree - builder of compact tree structures from XML documents |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 VERSION |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Version 0.03 |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=cut |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '0.03'; |
|
17
|
|
|
|
|
|
|
|
|
18
|
3
|
|
|
3
|
|
15
|
use base qw(Exporter); |
|
|
3
|
|
|
|
|
9
|
|
|
|
3
|
|
|
|
|
720
|
|
|
19
|
3
|
|
|
3
|
|
16
|
use vars qw( @EXPORT @EXPORT_OK %EXPORT_TAGS ); |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
212
|
|
|
20
|
3
|
|
|
3
|
|
1290
|
use XML::LibXML::Reader; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# XCT_USE_QNAMES /* not yet implemented */ |
|
23
|
|
|
|
|
|
|
# XCT_TEXT_AS_STRING /* not yet implemented */ |
|
24
|
|
|
|
|
|
|
# XCT_PRESERVE_PARENT /* not yet implemented */ |
|
25
|
|
|
|
|
|
|
# XCT_MERGE_TEXT_NODES /* not yet implemented */ |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Define the XCT_* flag constants.
#
# The do-block builds a hash mapping each flag name to a distinct single-bit
# value (1 << its position in @flags) and hands a reference to that hash to
# "use constant". The ORDER of the names below therefore fixes the numeric
# value of every flag; reordering or inserting names changes those values.
# The same list of names is stored as the "flags" entry of %EXPORT_TAGS.
use constant do {
  my @flags = (qw(
    XCT_IGNORE_WS
    XCT_IGNORE_SIGNIFICANT_WS
    XCT_IGNORE_PROCESSING_INSTRUCTIONS
    XCT_IGNORE_COMMENTS
    XCT_USE_QNAMES
    XCT_KEEP_NS_DECLS
    XCT_TEXT_AS_STRING
    XCT_ATTRIBUTE_ARRAY
    XCT_PRESERVE_PARENT
    XCT_MERGE_TEXT_NODES
    XCT_LINE_NUMBERS
    XCT_DOCUMENT_ROOT
  ));
  # Publish the names as the :flags export tag.
  $EXPORT_TAGS{flags} = \@flags;
  # name => bit: first name gets 1, second 2, third 4, and so on.
  my %c = map { ($flags[$_] => (1 << $_)) } 0..$#flags;
  # The block's value: the hash reference passed to "use constant".
  \%c
};
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Populate the Exporter lists at compile time.
#
# Every name found in any tag of %EXPORT_TAGS is exported by default
# (@EXPORT) and on request (@EXPORT_OK); an "all" tag is then added that
# refers to @EXPORT_OK.
BEGIN {
  my @tagged_names = map { @{$_} } values %EXPORT_TAGS;
  @EXPORT    = @tagged_names;
  @EXPORT_OK = @EXPORT;
  $EXPORT_TAGS{all} = \@EXPORT_OK;
}
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
use XML::CompactTree; |
|
56
|
|
|
|
|
|
|
use XML::LibXML::Reader; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $reader = XML::LibXML::Reader->new(location => $url); |
|
59
|
|
|
|
|
|
|
... |
|
60
|
|
|
|
|
|
|
my $tree = XML::CompactTree::readSubtreeToPerl($reader); |
|
61
|
|
|
|
|
|
|
... |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
This module provides functions that use XML::LibXML::Reader to parse |
|
66
|
|
|
|
|
|
|
an XML document into a parse tree formed of nested arrays (and hashes). |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
It aims to be fast in doing that and to preserve all relevant |
|
69
|
|
|
|
|
|
|
information from the XML (including namespaces, document order, mixed |
|
70
|
|
|
|
|
|
|
content, etc.). It sacrifices user friendliness for speed. |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
IMPORTANT: There is an even more efficient XS implementation of this |
|
73
|
|
|
|
|
|
|
module called XML::CompactTree::XS with 100% equivalent functionality. |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head1 PURPOSE |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
I wrote this module because I noticed that repeated calls to methods |
|
78
|
|
|
|
|
|
|
implemented in C (XS) were very expensive in Perl. |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Therefore traversing a large DOM tree using XML::LibXML or iterating |
|
81
|
|
|
|
|
|
|
over an XML stream using XML::LibXML::Reader was much slower than |
|
82
|
|
|
|
|
|
|
traversing similarly large and structured native Perl data |
|
83
|
|
|
|
|
|
|
structures. |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This module allows the user to build a document parse tree consisting |
|
86
|
|
|
|
|
|
|
of native Perl data structures (arrays and optionally hashes) using |
|
87
|
|
|
|
|
|
|
XML::LibXML::Reader with minimal number of XS calls. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
(Note that XML::CompactTree::XS is a 100% equivalent of this |
|
90
|
|
|
|
|
|
|
module that manages the same with just one XS call.) |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
It does not provide full DOM navigation but attempts to provide |
|
93
|
|
|
|
|
|
|
maximum amount of information. Its memory footprint should be |
|
94
|
|
|
|
|
|
|
somewhat smaller than that of a corresponding XML::LibXML DOM tree. |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 EXPORT |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
By default, the following constants are exported (C<:flags> export |
|
99
|
|
|
|
|
|
|
tag) to be used as flags for the tree builder: |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
XCT_IGNORE_WS |
|
102
|
|
|
|
|
|
|
XCT_IGNORE_SIGNIFICANT_WS |
|
103
|
|
|
|
|
|
|
XCT_IGNORE_PROCESSING_INSTRUCTIONS |
|
104
|
|
|
|
|
|
|
XCT_IGNORE_COMMENTS |
|
105
|
|
|
|
|
|
|
XCT_USE_QNAMES /* not yet implemented */ |
|
106
|
|
|
|
|
|
|
XCT_KEEP_NS_DECLS |
|
107
|
|
|
|
|
|
|
XCT_TEXT_AS_STRING /* not yet implemented */ |
|
108
|
|
|
|
|
|
|
XCT_ATTRIBUTE_ARRAY |
|
109
|
|
|
|
|
|
|
XCT_PRESERVE_PARENT /* not yet implemented */ |
|
110
|
|
|
|
|
|
|
XCT_MERGE_TEXT_NODES /* not yet implemented */ |
XCT_LINE_NUMBERS |
|
111
|
|
|
|
|
|
|
XCT_DOCUMENT_ROOT |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 readSubtreeToPerl( $reader, $flags, \my %ns ) |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Uses a given XML::LibXML::Reader parser object to parse a subtree at |
|
118
|
|
|
|
|
|
|
the current reader position to build a tree formed of nested arrays |
|
119
|
|
|
|
|
|
|
(see L</"OUTPUT FORMAT">). |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=over 4 |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item reader |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
A XML::LibXML::Reader object to use as the reader. While building the |
|
126
|
|
|
|
|
|
|
tree, the reader moves to the next node on the current or higher |
|
127
|
|
|
|
|
|
|
level. |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=item flags |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
An integer consisting of 1 bit flags (see constants in the EXPORT section). |
|
132
|
|
|
|
|
|
|
Use binary or (|) to combine individual flags. |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
The following flags are NOT implemented yet: |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
XCT_USE_QNAMES, XCT_TEXT_AS_STRING, XCT_PRESERVE_PARENT, XCT_MERGE_TEXT_NODES |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item ns |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
You may pass an empty hash reference that will be populated by a |
|
141
|
|
|
|
|
|
|
namespace_uri to namespace_index map, that can be used to decode |
|
142
|
|
|
|
|
|
|
namespace indexes in the resulting data structure (see L</OUTPUT |
|
143
|
|
|
|
|
|
|
FORMAT>). |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=back |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=cut |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# readSubtreeToPerl( $reader, $flags, $ns )
#
# Builds the compact tree for the single subtree at the reader's current
# position and returns its node representation (the first element of the
# list produced by _readSubtreeToPerl).
#
#   $reader - XML::LibXML::Reader object to read from
#   $flags  - optional bitwise OR of XCT_* constants; defaults to 0
#   $ns     - optional hash reference; it is filled with the
#             namespace URI => namespace index map ('' is always mapped to 0)
sub readSubtreeToPerl {
  my ($reader,$flags,$ns)=@_;
  # $flags may be omitted (the SYNOPSIS calls this function with the reader
  # only). Treat a missing value as "no flags" so that the bitwise tests in
  # the builder do not emit "uninitialized value" warnings.
  $flags = 0 unless defined $flags;
  $ns||={};
  $ns->{''}=0;   # index 0 is reserved for "no namespace"
  # namespace indexes for real namespaces start at 1; 0 = do not read siblings
  my $ret = _readSubtreeToPerl($reader,$flags,$ns,1,0);
  return $ret->[0];
}
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head2 readLevelToPerl( $reader, $flags, $ns ) |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Like C<readSubtreeToPerl>, but reads the subtree |
|
161
|
|
|
|
|
|
|
at the current reader position and all its following siblings. |
|
162
|
|
|
|
|
|
|
It returns an array reference of representations of these subtrees |
|
163
|
|
|
|
|
|
|
as in the format described in L</"OUTPUT FORMAT">. |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=cut |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# readLevelToPerl( $reader, $flags, $ns )
#
# Like readSubtreeToPerl, but asks the builder to also read the following
# siblings of the current node, and returns the whole array reference
# produced by _readSubtreeToPerl instead of only its first element.
#
#   $reader - XML::LibXML::Reader object to read from
#   $flags  - optional bitwise OR of XCT_* constants; defaults to 0
#   $ns     - optional hash reference; it is filled with the
#             namespace URI => namespace index map ('' is always mapped to 0)
sub readLevelToPerl {
  my ($reader,$flags,$ns)=@_;
  # A missing $flags means "no flags"; defaulting it here avoids
  # "uninitialized value" warnings from the bitwise tests in the builder.
  $flags = 0 unless defined $flags;
  $ns||={};
  $ns->{''}=0;   # index 0 is reserved for "no namespace"
  # namespace indexes for real namespaces start at 1; 1 = read siblings too
  my $ret = _readSubtreeToPerl($reader,$flags,$ns,1,1);
  return $ret;
}
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# _readSubtreeToPerl( $reader, $flags, $ns_map, $free_ns_index, $read_siblings )
#
# Internal tree builder shared by readSubtreeToPerl and readLevelToPerl.
# Walks the reader forward with read() and converts every kept node into an
# array [ type, data... ], nesting child arrays under their parents.
#
#   $reader        - XML::LibXML::Reader object
#   $flags         - bitwise OR of XCT_* constants (undef is treated as 0)
#   $ns_map        - hash reference, namespace URI => index; new URIs are
#                    added to it as they are encountered
#   $free_ns_index - first index to assign to a not-yet-seen namespace URI
#   $read_siblings - true: keep reading nodes at the starting depth;
#                    false: stop once the reader is back at the starting depth
#
# Returns an array reference with the nodes collected at the starting depth.
# Returns nothing (bare return) when the reader reports node type 0 on entry
# and the first read() does not return 1.
sub _readSubtreeToPerl {
  my ($reader, $flags, $ns_map, $free_ns_index, $read_siblings) = @_;
  # Guard against an omitted flags argument: every "$flags & XCT_..." test
  # below would otherwise warn about an uninitialized value.
  $flags = 0 unless defined $flags;
  my @parents;                               # stack of nodes that received a child list
  my ($av,$prev,$kids,$ret,$type,$name);     # $av: node being built, $prev: previously
                                             # stored node, $kids: current child list
  my $cur_depth=$reader->depth();
  my $start_depth = $cur_depth;
  my $prev_depth = $start_depth;
  my $top = [];                              # nodes stored at $start_depth; the return value
  # Node type 0: presumably the reader has not been advanced to a node yet
  # (NOTE(review): confirm 0 is XML_READER_TYPE_NONE) - advance it once.
  if ($reader->nodeType()==0) {
    return if $reader->read()!=1;
    if ($flags & XCT_DOCUMENT_ROOT) {
      # Synthesize a document node and make it the single top-level node.
      # Lowering both depths by one makes the first real node count as
      # "deeper than the previous node", so it becomes a child of this node.
      $prev = [ XML_READER_TYPE_DOCUMENT,
                $reader->encoding,
              ];
      $start_depth --;
      $prev_depth --;
      push @$top, $prev;
      push @parents, $prev;
    }
  }
  do {{
    $type = $reader->nodeType();
    # warn("$type, $cur_depth, ".$reader->name."\n");
    # Node types that never produce a tree node: just advance the reader.
    if ($type == XML_READER_TYPE_NONE
        or $type == XML_READER_TYPE_ATTRIBUTE
        or $type == XML_READER_TYPE_DOCUMENT_TYPE
        or $type == XML_READER_TYPE_END_ELEMENT
        or $type == XML_READER_TYPE_ENTITY
        or $type == XML_READER_TYPE_END_ENTITY
        or $type == XML_READER_TYPE_XML_DECLARATION) {
      $ret = $reader->read();
    } else {
      # Node types the caller asked to skip via the XCT_IGNORE_* flags.
      # ("and" binds tighter than "or", so this is a list of flag/type pairs.)
      if (($flags & (XCT_IGNORE_WS|XCT_IGNORE_SIGNIFICANT_WS))
          and $type == XML_READER_TYPE_WHITESPACE
          or
          ($flags & XCT_IGNORE_SIGNIFICANT_WS)
          and $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE
          or
          ($flags & XCT_IGNORE_COMMENTS)
          and $type == XML_READER_TYPE_COMMENT
          or
          ($flags & XCT_IGNORE_PROCESSING_INSTRUCTIONS
           and $type == XML_READER_TYPE_PROCESSING_INSTRUCTION)) {
        $ret = $reader->read();
      } else {
        # Build the representation of the current node; slot 0 is its type.
        my @av=();
        $av=\@av;
        push @av, $type;
        if ($type == XML_READER_TYPE_ELEMENT) {
          # warn(" element\n");
          # Element: [ type, local name, namespace index, attributes, (line) ]
          push @av, $reader->localName();
          $name = $reader->namespaceURI();
          if ($name) {
            if (exists($ns_map->{$name})) {
              push(@av, $ns_map->{$name} || 0);
            } else {
              # warn("storing namespace $name as $free_ns_index)";
              # First occurrence of this URI: assign the next free index.
              push(@av, $free_ns_index);
              $ns_map->{$name}=$free_ns_index;
              $free_ns_index++;
            }
          } else {
            push(@av, 0); # no namespace
          }
          if ($reader->hasAttributes() && $reader->moveToFirstAttribute()==1) {
            if ($flags & XCT_ATTRIBUTE_ARRAY) {
              # Attributes as a flat [ name, value, name, value, ... ] list.
              my @attrs;
              do {
                $name = $reader->name();
                # Names starting with 'xmlns' are dropped unless
                # XCT_KEEP_NS_DECLS is set.
                if (($flags & XCT_KEEP_NS_DECLS) || substr($name,0,5) ne 'xmlns' ) {
                  push(@attrs, $name);
                  push(@attrs, $reader->value());
                }
              } while ($reader->moveToNextAttribute()==1);
              # $reader->moveToElement();
              push(@av, \@attrs);
            } else {
              # Attributes as a { name => value } hash.
              my %attrs;
              do {
                $name = $reader->name();
                if (($flags & XCT_KEEP_NS_DECLS) || substr($name,0,5) ne 'xmlns' ) {
                  $attrs{$name}=$reader->value();
                }
              } while ($reader->moveToNextAttribute()==1);
              $reader->moveToElement();
              push(@av, \%attrs);
            }
          } else {
            push(@av, undef); # no attributes
          }
          if ($flags & XCT_LINE_NUMBERS) {
            push(@av, $reader->lineNumber());
          }
        } elsif ($type == XML_READER_TYPE_TEXT or
                 $type == XML_READER_TYPE_CDATA or
                 $type == XML_READER_TYPE_COMMENT or
                 $type == XML_READER_TYPE_WHITESPACE or
                 $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
          # Character-data nodes: [ type, value ]
          push(@av, $reader->value());
        } elsif ($type == XML_READER_TYPE_ENTITY_REFERENCE or
                 $type == XML_READER_TYPE_PROCESSING_INSTRUCTION or
                 $type == XML_READER_TYPE_NOTATION) {
          # Named nodes: [ type, local name, value ]
          push(@av, $reader->localName());
          push(@av, $reader->value());
        } elsif ($type == XML_READER_TYPE_DOCUMENT or
                 $type == XML_READER_TYPE_DOCUMENT_FRAGMENT) {
          # Document nodes: [ type, encoding ]
          push(@av, $reader->encoding());
        }
        # Attach the new node at the right place, based on how its depth
        # compares with the starting depth and the previously stored node.
        if ($cur_depth==$start_depth) {
          # At the starting depth: a new top-level node.
          push(@$top, $av);
          $prev_depth = $cur_depth;
          $kids = undef;
        } elsif ($cur_depth > $prev_depth) {
          # Deeper than the previous node: it is that node's first child.
          # The child list is appended as the LAST slot of the parent.
          $kids=[];
          push(@$prev, $kids);
          push(@$kids, $av);
          push(@parents, $prev);
          $prev_depth = $cur_depth;
        } elsif ($cur_depth == $prev_depth) {
          # Same depth as the previous node: another child of the same parent.
          push(@$kids, $av) if $kids;
        } else {
          # Shallower than the previous node: unwind one stacked parent per
          # level climbed, then append to the child list of the parent now
          # on top (its last slot, see above).
          do {
            $prev_depth--;
            pop(@parents);
          } while ($cur_depth < $prev_depth);
          my $p = $parents[-1];
          if ($p) {
            $prev = $p;
            $p = $prev->[-1];
            if ($p) {
              $kids = $p;
              push(@$kids, $av);
            }
          }
        }
        $prev = $av;
        $ret = $reader->read();
      }
    }
    # print STDERR "$cur_depth, ",$reader->depth(),"\n";
    # Continue while read() succeeded and the reader is still below the
    # starting depth (or at/below it when siblings are to be read as well).
  }} while ($ret == 1 && ($cur_depth = $reader->depth()) > ($start_depth - ($read_siblings ? 1 : 0)));
  # If the reader stopped on an end-element marker at the starting depth,
  # step over it so the reader is left on the following node.
  if ($ret == 1) {
    if ($reader->depth() == $start_depth &&
        $reader->nodeType() == XML_READER_TYPE_END_ELEMENT) {
      $reader->read();
    }
  }
  return $top;
}
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head1 OUTPUT FORMAT |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
The result of parsing a subtree is a Perl array reference C<$node> |
|
328
|
|
|
|
|
|
|
that contains a node type followed by node data whose interpretation on |
|
329
|
|
|
|
|
|
|
further positions in $node depends on the node type, as described |
|
330
|
|
|
|
|
|
|
below: |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head2 Any Node |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=over 5 |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item * |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
$node->[0] is an integer representing the node type. Use |
|
339
|
|
|
|
|
|
|
XML::LibXML::Reader node-type constants, e.g. XML_READER_TYPE_ELEMENT |
|
340
|
|
|
|
|
|
|
for an element node, XML_READER_TYPE_TEXT for text node, etc. |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=back |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=head2 Document or Document Fragment Nodes |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=over 5 |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item * |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
$node->[1] contains the document encoding |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=item * |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
$node->[2] is an array reference containing similar representation of |
|
355
|
|
|
|
|
|
|
all the child nodes of the document (fragment). |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=back |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
Note: XML::LibXML::Reader does not provide a document node by default, which |
|
360
|
|
|
|
|
|
|
means that calling readSubtreeToPerl on a reader object in its initial |
|
361
|
|
|
|
|
|
|
state only parses the first node in the document (which can be the |
|
362
|
|
|
|
|
|
|
root element, but also a comment or a processing instruction). Use |
|
363
|
|
|
|
|
|
|
XCT_DOCUMENT_ROOT flag to force creating a document node in such case. |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head2 Element nodes |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=over 5 |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=item * |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
$node->[1] is the local name (UTF-8 encoded character string) |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=item * |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
$node->[2] is the namespace index (see L</NAMESPACES> below) |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=item * |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
$node->[3] is undef if the element has no attributes. Otherwise if |
|
380
|
|
|
|
|
|
|
XCT_ATTRIBUTE_ARRAY flag was used, $node->[3] is an array reference of |
|
381
|
|
|
|
|
|
|
the form C<[ name1, value1, name2, value2, ....]> of attribute names and |
|
382
|
|
|
|
|
|
|
corresponding values. If XCT_ATTRIBUTE_ARRAY flag was not used, then |
|
383
|
|
|
|
|
|
|
$node->[3] is a hash reference mapping attribute names to the |
|
384
|
|
|
|
|
|
|
corresponding attribute values C<{ name1=>value1, name2=>value2...}> |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
The flag XCT_KEEP_NS_DECLS controls whether namespace declarations |
|
387
|
|
|
|
|
|
|
(xmlns=... or xmlns:prefix=...) are included along with normal |
|
388
|
|
|
|
|
|
|
attributes or not. |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Note: there is no support for namespaced attributes yet, but the |
|
391
|
|
|
|
|
|
|
attribute names are stored as QNames, so one can always use |
|
392
|
|
|
|
|
|
|
XCT_KEEP_NS_DECLS to keep track of namespace prefix declarations and |
|
393
|
|
|
|
|
|
|
do the resolving manually. Support for namespaced attributes is |
|
394
|
|
|
|
|
|
|
planned. |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=item * |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
If XCT_LINE_NUMBERS flag was used, $node->[4] contains the line number |
|
399
|
|
|
|
|
|
|
of the element and $node->[5] contains an array reference containing |
|
400
|
|
|
|
|
|
|
similar representations of the child nodes of the current node. |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item * |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
If XCT_LINE_NUMBERS flag was NOT used, $node->[4] contains an array |
|
405
|
|
|
|
|
|
|
reference of similar representations of the child nodes of the current |
|
406
|
|
|
|
|
|
|
node. |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=back |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head2 Text, CDATA, Comment and White-Space Nodes |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=over 5 |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item * |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
$node->[1] contains the node value (UTF-8 encoded character string) |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=back |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 Unparsed Entity, Processing-Instruction, and Notation Nodes |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=over 5 |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item * |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
$node->[1] contains the local name (there is no support for |
|
427
|
|
|
|
|
|
|
namespaces on these types of nodes yet) |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item * |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
$node->[2] contains the node value |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=back |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=head2 Skipping Less-Significant Nodes |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
White-space (non-significant or significant), processing-instruction |
|
438
|
|
|
|
|
|
|
and comment nodes can be completely skipped, using the following |
|
439
|
|
|
|
|
|
|
flags: |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
XCT_IGNORE_WS |
|
442
|
|
|
|
|
|
|
XCT_IGNORE_SIGNIFICANT_WS |
|
443
|
|
|
|
|
|
|
XCT_IGNORE_PROCESSING_INSTRUCTIONS |
|
444
|
|
|
|
|
|
|
XCT_IGNORE_COMMENTS |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 NAMESPACES |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Namespaces of element nodes are stored in the element node as an |
|
449
|
|
|
|
|
|
|
integer. 0 always represents nodes without namespace, all other |
|
450
|
|
|
|
|
|
|
namespaces are assigned unique numbers in an increasing order as they |
|
451
|
|
|
|
|
|
|
appear. You can pass an empty hash reference to the parsing functions |
|
452
|
|
|
|
|
|
|
to obtain the mapping. |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head2 Example |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
use XML::CompactTree; |
|
457
|
|
|
|
|
|
|
use XML::LibXML::Reader; |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
my $reader = XML::LibXML::Reader->new(location => $ARGV[0]); |
|
460
|
|
|
|
|
|
|
my %ns; |
|
461
|
|
|
|
|
|
|
my $data = XML::CompactTree::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT, \%ns ); |
|
462
|
|
|
|
|
|
|
$ns_map[$ns{$_}]=$_ for keys %ns; |
|
463
|
|
|
|
|
|
|
my @nodes = ($data); |
|
464
|
|
|
|
|
|
|
while (@nodes) { |
|
465
|
|
|
|
|
|
|
my $node = shift @nodes; |
|
466
|
|
|
|
|
|
|
my $type = $node->[0]; |
|
467
|
|
|
|
|
|
|
if ($type == XML_READER_TYPE_ELEMENT) { |
|
468
|
|
|
|
|
|
|
print "element $node->[1] is from ns $node->[2] '$ns_map[$node->[2]]'\n"; |
|
469
|
|
|
|
|
|
|
push @nodes, @{$node->[4]}; # queue children |
|
470
|
|
|
|
|
|
|
} elsif ($type == XML_READER_TYPE_DOCUMENT) { |
|
471
|
|
|
|
|
|
|
push @nodes, @{$node->[2]}; # queue children |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head1 PLANNED FEATURES |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
Planned flags: |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
XCT_USE_QNAMES - use QNames instead of local names for all nodes |
|
480
|
|
|
|
|
|
|
XCT_TEXT_AS_STRING - put text nodes into the tree as plain scalars |
|
481
|
|
|
|
|
|
|
XCT_PRESERVE_PARENT - add a slot with a weak reference to the parent node |
|
482
|
|
|
|
|
|
|
XCT_MERGE_TEXT_NODES - merge adjacent text/cdata nodes together |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
Features: allow blessing the array refs to default or user-specified |
|
485
|
|
|
|
|
|
|
classes; the default classes would provide a very small subset of DOM |
|
486
|
|
|
|
|
|
|
methods to retrieve node information, manipulate the tree, and |
|
487
|
|
|
|
|
|
|
possibly serialize the parse tree back to XML. |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=head1 AUTHOR |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
Petr Pajas, C<< >> |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=head1 BUGS |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
|
496
|
|
|
|
|
|
|
C<bug-xml-compacttree at rt.cpan.org>, or through the web interface at |
|
497
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=XML-CompactTree>. |
|
498
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
|
499
|
|
|
|
|
|
|
your bug as I make changes. |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Copyright 2008-2009 Petr Pajas, All Rights Reserved. |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
506
|
|
|
|
|
|
|
under the same terms as Perl itself. |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
XML::CompactTree::XS |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
XML::LibXML::Reader |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=cut |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
1; # End of XML::CompactTree |