line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::XML; |
2
|
30
|
|
|
30
|
|
468054
|
use Moo; |
|
30
|
|
|
|
|
79880
|
|
|
30
|
|
|
|
|
135
|
|
3
|
30
|
|
|
30
|
|
15484
|
use Carp; |
|
30
|
|
|
|
|
68
|
|
|
30
|
|
|
|
|
1323
|
|
4
|
30
|
|
|
30
|
|
4217
|
use Search::Tools; # XS required |
|
30
|
|
|
|
|
79
|
|
|
30
|
|
|
|
|
597
|
|
5
|
30
|
|
|
30
|
|
4923
|
use Search::Tools::UTF8; |
|
30
|
|
|
|
|
91
|
|
|
30
|
|
|
|
|
2287
|
|
6
|
|
|
|
|
|
|
|
7
|
30
|
|
|
30
|
|
5030
|
use namespace::autoclean; |
|
30
|
|
|
|
|
137937
|
|
|
30
|
|
|
|
|
193
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=pod |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Search::Tools::XML - methods for playing nice with XML and HTML |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 SYNOPSIS |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
use Search::Tools::XML; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my $class = 'Search::Tools::XML'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $text = 'the "quick brown" fox'; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $xml = $class->start_tag('foo'); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
$xml .= $class->utf8_safe( $text ); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$xml .= $class->end_tag('foo'); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# $xml: the "quick brown" fox |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
$xml = $class->escape( $xml ); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# $xml: <foo>the "quick brown" fox</foo> |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
$xml = $class->unescape( $xml ); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# $xml: the "quick brown" fox |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $plain = $class->no_html( $xml ); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# $plain eq $text |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
B The API for escape() and unescape() has changed as of version 0.16. |
49
|
|
|
|
|
|
|
The text is no longer modified in place, as this was less intuitive. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Search::Tools::XML provides utility methods for dealing with XML and HTML. |
52
|
|
|
|
|
|
|
There isn't really anything new here that CPAN doesn't provide via HTML::Entities |
53
|
|
|
|
|
|
|
or similar modules. The difference is convenience: the most common methods you |
54
|
|
|
|
|
|
|
need for search apps are in one place with no extra dependencies. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
B To get full UTF-8 character set from chr() you must be using Perl >= 5.8. |
57
|
|
|
|
|
|
|
This affects things like the unescape* methods. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head1 VARIABLES |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head2 %HTML_ents |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
Complete map of all named HTML entities to their decimal values. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=cut |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# regexp for what constitutes whitespace in an HTML doc |
68
|
|
|
|
|
|
|
# it's not as simple as \s| so we define it separately |
69
|
|
|
|
|
|
|
my @white_hex_pts = qw( |
70
|
|
|
|
|
|
|
0009 |
71
|
|
|
|
|
|
|
000a |
72
|
|
|
|
|
|
|
000b |
73
|
|
|
|
|
|
|
000c |
74
|
|
|
|
|
|
|
000d |
75
|
|
|
|
|
|
|
0020 |
76
|
|
|
|
|
|
|
00a0 |
77
|
|
|
|
|
|
|
2000 |
78
|
|
|
|
|
|
|
2001 |
79
|
|
|
|
|
|
|
2002 |
80
|
|
|
|
|
|
|
2003 |
81
|
|
|
|
|
|
|
2004 |
82
|
|
|
|
|
|
|
2005 |
83
|
|
|
|
|
|
|
2006 |
84
|
|
|
|
|
|
|
2007 |
85
|
|
|
|
|
|
|
2008 |
86
|
|
|
|
|
|
|
2009 |
87
|
|
|
|
|
|
|
200a |
88
|
|
|
|
|
|
|
200b |
89
|
|
|
|
|
|
|
2028 |
90
|
|
|
|
|
|
|
2029 |
91
|
|
|
|
|
|
|
202f |
92
|
|
|
|
|
|
|
205f |
93
|
|
|
|
|
|
|
2060 |
94
|
|
|
|
|
|
|
3000 |
95
|
|
|
|
|
|
|
); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
my @whitesp = ( '\s', ' ' ); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# NOTE that the pound sign # needs escaping because we use |
100
|
|
|
|
|
|
|
# the 'x' flag in our regexp. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
for my $w (@white_hex_pts) { |
103
|
|
|
|
|
|
|
push @whitesp, sprintf( "&\\#x%s;", $w ); # hex entity |
104
|
|
|
|
|
|
|
push @whitesp, sprintf( "&\\#%s;", hex($w) ); # dec entity |
105
|
|
|
|
|
|
|
push @whitesp, sprintf( "\\%s", chr( hex($w) ) ); # byte value |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
my $HTML_WHITESPACE = join( '|', @whitesp ); |
109
|
|
|
|
|
|
|
my $WHITESPACE = join( '|', map { chr( hex($_) ) } @white_hex_pts ); |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# HTML entity table |
112
|
|
|
|
|
|
|
# this just removes a dependency on another module... |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
our %HTML_ents = ( |
115
|
|
|
|
|
|
|
quot => 34, |
116
|
|
|
|
|
|
|
amp => 38, |
117
|
|
|
|
|
|
|
apos => 39, |
118
|
|
|
|
|
|
|
'lt' => 60, |
119
|
|
|
|
|
|
|
'gt' => 62, |
120
|
|
|
|
|
|
|
nbsp => 160, |
121
|
|
|
|
|
|
|
iexcl => 161, |
122
|
|
|
|
|
|
|
cent => 162, |
123
|
|
|
|
|
|
|
pound => 163, |
124
|
|
|
|
|
|
|
curren => 164, |
125
|
|
|
|
|
|
|
yen => 165, |
126
|
|
|
|
|
|
|
brvbar => 166, |
127
|
|
|
|
|
|
|
sect => 167, |
128
|
|
|
|
|
|
|
uml => 168, |
129
|
|
|
|
|
|
|
copy => 169, |
130
|
|
|
|
|
|
|
ordf => 170, |
131
|
|
|
|
|
|
|
laquo => 171, |
132
|
|
|
|
|
|
|
not => 172, |
133
|
|
|
|
|
|
|
shy => 173, |
134
|
|
|
|
|
|
|
reg => 174, |
135
|
|
|
|
|
|
|
macr => 175, |
136
|
|
|
|
|
|
|
deg => 176, |
137
|
|
|
|
|
|
|
plusmn => 177, |
138
|
|
|
|
|
|
|
sup2 => 178, |
139
|
|
|
|
|
|
|
sup3 => 179, |
140
|
|
|
|
|
|
|
acute => 180, |
141
|
|
|
|
|
|
|
micro => 181, |
142
|
|
|
|
|
|
|
para => 182, |
143
|
|
|
|
|
|
|
middot => 183, |
144
|
|
|
|
|
|
|
cedil => 184, |
145
|
|
|
|
|
|
|
sup1 => 185, |
146
|
|
|
|
|
|
|
ordm => 186, |
147
|
|
|
|
|
|
|
raquo => 187, |
148
|
|
|
|
|
|
|
frac14 => 188, |
149
|
|
|
|
|
|
|
frac12 => 189, |
150
|
|
|
|
|
|
|
frac34 => 190, |
151
|
|
|
|
|
|
|
iquest => 191, |
152
|
|
|
|
|
|
|
Agrave => 192, |
153
|
|
|
|
|
|
|
Aacute => 193, |
154
|
|
|
|
|
|
|
Acirc => 194, |
155
|
|
|
|
|
|
|
Atilde => 195, |
156
|
|
|
|
|
|
|
Auml => 196, |
157
|
|
|
|
|
|
|
Aring => 197, |
158
|
|
|
|
|
|
|
AElig => 198, |
159
|
|
|
|
|
|
|
Ccedil => 199, |
160
|
|
|
|
|
|
|
Egrave => 200, |
161
|
|
|
|
|
|
|
Eacute => 201, |
162
|
|
|
|
|
|
|
Ecirc => 202, |
163
|
|
|
|
|
|
|
Euml => 203, |
164
|
|
|
|
|
|
|
Igrave => 204, |
165
|
|
|
|
|
|
|
Iacute => 205, |
166
|
|
|
|
|
|
|
Icirc => 206, |
167
|
|
|
|
|
|
|
Iuml => 207, |
168
|
|
|
|
|
|
|
ETH => 208, |
169
|
|
|
|
|
|
|
Ntilde => 209, |
170
|
|
|
|
|
|
|
Ograve => 210, |
171
|
|
|
|
|
|
|
Oacute => 211, |
172
|
|
|
|
|
|
|
Ocirc => 212, |
173
|
|
|
|
|
|
|
Otilde => 213, |
174
|
|
|
|
|
|
|
Ouml => 214, |
175
|
|
|
|
|
|
|
'times' => 215, |
176
|
|
|
|
|
|
|
Oslash => 216, |
177
|
|
|
|
|
|
|
Ugrave => 217, |
178
|
|
|
|
|
|
|
Uacute => 218, |
179
|
|
|
|
|
|
|
Ucirc => 219, |
180
|
|
|
|
|
|
|
Uuml => 220, |
181
|
|
|
|
|
|
|
Yacute => 221, |
182
|
|
|
|
|
|
|
THORN => 222, |
183
|
|
|
|
|
|
|
szlig => 223, |
184
|
|
|
|
|
|
|
agrave => 224, |
185
|
|
|
|
|
|
|
aacute => 225, |
186
|
|
|
|
|
|
|
acirc => 226, |
187
|
|
|
|
|
|
|
atilde => 227, |
188
|
|
|
|
|
|
|
auml => 228, |
189
|
|
|
|
|
|
|
aring => 229, |
190
|
|
|
|
|
|
|
aelig => 230, |
191
|
|
|
|
|
|
|
ccedil => 231, |
192
|
|
|
|
|
|
|
egrave => 232, |
193
|
|
|
|
|
|
|
eacute => 233, |
194
|
|
|
|
|
|
|
ecirc => 234, |
195
|
|
|
|
|
|
|
euml => 235, |
196
|
|
|
|
|
|
|
igrave => 236, |
197
|
|
|
|
|
|
|
iacute => 237, |
198
|
|
|
|
|
|
|
icirc => 238, |
199
|
|
|
|
|
|
|
iuml => 239, |
200
|
|
|
|
|
|
|
eth => 240, |
201
|
|
|
|
|
|
|
ntilde => 241, |
202
|
|
|
|
|
|
|
ograve => 242, |
203
|
|
|
|
|
|
|
oacute => 243, |
204
|
|
|
|
|
|
|
ocirc => 244, |
205
|
|
|
|
|
|
|
otilde => 245, |
206
|
|
|
|
|
|
|
ouml => 246, |
207
|
|
|
|
|
|
|
divide => 247, |
208
|
|
|
|
|
|
|
oslash => 248, |
209
|
|
|
|
|
|
|
ugrave => 249, |
210
|
|
|
|
|
|
|
uacute => 250, |
211
|
|
|
|
|
|
|
ucirc => 251, |
212
|
|
|
|
|
|
|
uuml => 252, |
213
|
|
|
|
|
|
|
yacute => 253, |
214
|
|
|
|
|
|
|
thorn => 254, |
215
|
|
|
|
|
|
|
yuml => 255, |
216
|
|
|
|
|
|
|
OElig => 338, |
217
|
|
|
|
|
|
|
oelig => 339, |
218
|
|
|
|
|
|
|
Scaron => 352, |
219
|
|
|
|
|
|
|
scaron => 353, |
220
|
|
|
|
|
|
|
Yuml => 376, |
221
|
|
|
|
|
|
|
fnof => 402, |
222
|
|
|
|
|
|
|
circ => 710, |
223
|
|
|
|
|
|
|
tilde => 732, |
224
|
|
|
|
|
|
|
Alpha => 913, |
225
|
|
|
|
|
|
|
Beta => 914, |
226
|
|
|
|
|
|
|
Gamma => 915, |
227
|
|
|
|
|
|
|
Delta => 916, |
228
|
|
|
|
|
|
|
Epsilon => 917, |
229
|
|
|
|
|
|
|
Zeta => 918, |
230
|
|
|
|
|
|
|
Eta => 919, |
231
|
|
|
|
|
|
|
Theta => 920, |
232
|
|
|
|
|
|
|
Iota => 921, |
233
|
|
|
|
|
|
|
Kappa => 922, |
234
|
|
|
|
|
|
|
Lambda => 923, |
235
|
|
|
|
|
|
|
Mu => 924, |
236
|
|
|
|
|
|
|
Nu => 925, |
237
|
|
|
|
|
|
|
Xi => 926, |
238
|
|
|
|
|
|
|
Omicron => 927, |
239
|
|
|
|
|
|
|
Pi => 928, |
240
|
|
|
|
|
|
|
Rho => 929, |
241
|
|
|
|
|
|
|
Sigma => 931, |
242
|
|
|
|
|
|
|
Tau => 932, |
243
|
|
|
|
|
|
|
Upsilon => 933, |
244
|
|
|
|
|
|
|
Phi => 934, |
245
|
|
|
|
|
|
|
Chi => 935, |
246
|
|
|
|
|
|
|
Psi => 936, |
247
|
|
|
|
|
|
|
Omega => 937, |
248
|
|
|
|
|
|
|
alpha => 945, |
249
|
|
|
|
|
|
|
beta => 946, |
250
|
|
|
|
|
|
|
gamma => 947, |
251
|
|
|
|
|
|
|
delta => 948, |
252
|
|
|
|
|
|
|
epsilon => 949, |
253
|
|
|
|
|
|
|
zeta => 950, |
254
|
|
|
|
|
|
|
eta => 951, |
255
|
|
|
|
|
|
|
theta => 952, |
256
|
|
|
|
|
|
|
iota => 953, |
257
|
|
|
|
|
|
|
kappa => 954, |
258
|
|
|
|
|
|
|
lambda => 955, |
259
|
|
|
|
|
|
|
mu => 956, |
260
|
|
|
|
|
|
|
nu => 957, |
261
|
|
|
|
|
|
|
xi => 958, |
262
|
|
|
|
|
|
|
omicron => 959, |
263
|
|
|
|
|
|
|
pi => 960, |
264
|
|
|
|
|
|
|
rho => 961, |
265
|
|
|
|
|
|
|
sigmaf => 962, |
266
|
|
|
|
|
|
|
sigma => 963, |
267
|
|
|
|
|
|
|
tau => 964, |
268
|
|
|
|
|
|
|
upsilon => 965, |
269
|
|
|
|
|
|
|
phi => 966, |
270
|
|
|
|
|
|
|
chi => 967, |
271
|
|
|
|
|
|
|
psi => 968, |
272
|
|
|
|
|
|
|
omega => 969, |
273
|
|
|
|
|
|
|
thetasym => 977, |
274
|
|
|
|
|
|
|
upsih => 978, |
275
|
|
|
|
|
|
|
piv => 982, |
276
|
|
|
|
|
|
|
ensp => 8194, |
277
|
|
|
|
|
|
|
emsp => 8195, |
278
|
|
|
|
|
|
|
thinsp => 8201, |
279
|
|
|
|
|
|
|
zwnj => 8204, |
280
|
|
|
|
|
|
|
zwj => 8205, |
281
|
|
|
|
|
|
|
lrm => 8206, |
282
|
|
|
|
|
|
|
rlm => 8207, |
283
|
|
|
|
|
|
|
ndash => 8211, |
284
|
|
|
|
|
|
|
mdash => 8212, |
285
|
|
|
|
|
|
|
lsquo => 8216, |
286
|
|
|
|
|
|
|
rsquo => 8217, |
287
|
|
|
|
|
|
|
sbquo => 8218, |
288
|
|
|
|
|
|
|
ldquo => 8220, |
289
|
|
|
|
|
|
|
rdquo => 8221, |
290
|
|
|
|
|
|
|
bdquo => 8222, |
291
|
|
|
|
|
|
|
dagger => 8224, |
292
|
|
|
|
|
|
|
Dagger => 8225, |
293
|
|
|
|
|
|
|
bull => 8226, |
294
|
|
|
|
|
|
|
hellip => 8230, |
295
|
|
|
|
|
|
|
permil => 8240, |
296
|
|
|
|
|
|
|
prime => 8242, |
297
|
|
|
|
|
|
|
Prime => 8243, |
298
|
|
|
|
|
|
|
lsaquo => 8249, |
299
|
|
|
|
|
|
|
rsaquo => 8250, |
300
|
|
|
|
|
|
|
oline => 8254, |
301
|
|
|
|
|
|
|
frasl => 8260, |
302
|
|
|
|
|
|
|
euro => 8364, |
303
|
|
|
|
|
|
|
image => 8465, |
304
|
|
|
|
|
|
|
weierp => 8472, |
305
|
|
|
|
|
|
|
real => 8476, |
306
|
|
|
|
|
|
|
trade => 8482, |
307
|
|
|
|
|
|
|
alefsym => 8501, |
308
|
|
|
|
|
|
|
larr => 8592, |
309
|
|
|
|
|
|
|
uarr => 8593, |
310
|
|
|
|
|
|
|
rarr => 8594, |
311
|
|
|
|
|
|
|
darr => 8595, |
312
|
|
|
|
|
|
|
harr => 8596, |
313
|
|
|
|
|
|
|
crarr => 8629, |
314
|
|
|
|
|
|
|
lArr => 8656, |
315
|
|
|
|
|
|
|
uArr => 8657, |
316
|
|
|
|
|
|
|
rArr => 8658, |
317
|
|
|
|
|
|
|
dArr => 8659, |
318
|
|
|
|
|
|
|
hArr => 8660, |
319
|
|
|
|
|
|
|
forall => 8704, |
320
|
|
|
|
|
|
|
part => 8706, |
321
|
|
|
|
|
|
|
exist => 8707, |
322
|
|
|
|
|
|
|
empty => 8709, |
323
|
|
|
|
|
|
|
nabla => 8711, |
324
|
|
|
|
|
|
|
isin => 8712, |
325
|
|
|
|
|
|
|
notin => 8713, |
326
|
|
|
|
|
|
|
ni => 8715, |
327
|
|
|
|
|
|
|
prod => 8719, |
328
|
|
|
|
|
|
|
'sum' => 8721, |
329
|
|
|
|
|
|
|
'minus' => 8722, |
330
|
|
|
|
|
|
|
lowast => 8727, |
331
|
|
|
|
|
|
|
radic => 8730, |
332
|
|
|
|
|
|
|
prop => 8733, |
333
|
|
|
|
|
|
|
infin => 8734, |
334
|
|
|
|
|
|
|
ang => 8736, |
335
|
|
|
|
|
|
|
'and' => 8743, |
336
|
|
|
|
|
|
|
'or' => 8744, |
337
|
|
|
|
|
|
|
cap => 8745, |
338
|
|
|
|
|
|
|
cup => 8746, |
339
|
|
|
|
|
|
|
int => 8747, |
340
|
|
|
|
|
|
|
there4 => 8756, |
341
|
|
|
|
|
|
|
sim => 8764, |
342
|
|
|
|
|
|
|
cong => 8773, |
343
|
|
|
|
|
|
|
asymp => 8776, |
344
|
|
|
|
|
|
|
ne => 8800, |
345
|
|
|
|
|
|
|
equiv => 8801, |
346
|
|
|
|
|
|
|
le => 8804, |
347
|
|
|
|
|
|
|
ge => 8805, |
348
|
|
|
|
|
|
|
sub => 8834, |
349
|
|
|
|
|
|
|
sup => 8835, |
350
|
|
|
|
|
|
|
nsub => 8836, |
351
|
|
|
|
|
|
|
sube => 8838, |
352
|
|
|
|
|
|
|
supe => 8839, |
353
|
|
|
|
|
|
|
oplus => 8853, |
354
|
|
|
|
|
|
|
otimes => 8855, |
355
|
|
|
|
|
|
|
perp => 8869, |
356
|
|
|
|
|
|
|
sdot => 8901, |
357
|
|
|
|
|
|
|
lceil => 8968, |
358
|
|
|
|
|
|
|
rceil => 8969, |
359
|
|
|
|
|
|
|
lfloor => 8970, |
360
|
|
|
|
|
|
|
rfloor => 8971, |
361
|
|
|
|
|
|
|
lang => 9001, |
362
|
|
|
|
|
|
|
rang => 9002, |
363
|
|
|
|
|
|
|
loz => 9674, |
364
|
|
|
|
|
|
|
spades => 9824, |
365
|
|
|
|
|
|
|
clubs => 9827, |
366
|
|
|
|
|
|
|
hearts => 9829, |
367
|
|
|
|
|
|
|
diams => 9830, |
368
|
|
|
|
|
|
|
); |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
my %char2entity = (); |
371
|
|
|
|
|
|
|
while ( my ( $e, $n ) = each(%HTML_ents) ) { |
372
|
|
|
|
|
|
|
my $char = chr($n); |
373
|
|
|
|
|
|
|
$char2entity{$char} = "&$e;"; |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
delete $char2entity{q/'/}; # only one-way decoding |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# Fill in missing entities |
378
|
|
|
|
|
|
|
# TODO does this only work under latin1 locale? |
379
|
|
|
|
|
|
|
for ( 0 .. 255 ) { |
380
|
|
|
|
|
|
|
next if exists $char2entity{ chr($_) }; |
381
|
|
|
|
|
|
|
$char2entity{ chr($_) } = "$_;"; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head1 METHODS |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
The following methods may be accessed either as object or class methods. |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head2 new |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Create a Search::Tools::XML object. |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=cut |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head2 tag_re |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
Returns a qr// regex for matching a SGML (XML, HTML, etc) tag. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=cut |
399
|
|
|
|
|
|
|
|
400
|
36
|
|
|
36
|
1
|
222
|
sub tag_re {qr/<[^>]+>/s} |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head2 html_whitespace |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Returns a regex for all whitespace characters and |
405
|
|
|
|
|
|
|
HTML whitespace entities. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=cut |
408
|
|
|
|
|
|
|
|
409
|
26
|
|
|
26
|
1
|
384
|
sub html_whitespace {$HTML_WHITESPACE} |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=head2 char2ent_map |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Returns a hash reference to the class data mapping chr() values to their |
414
|
|
|
|
|
|
|
numerical entity equivalents. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=cut |
417
|
|
|
|
|
|
|
|
418
|
26
|
|
|
26
|
1
|
66
|
sub char2ent_map { \%char2entity } |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 looks_like_html( I ) |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Returns true if I appears to have HTML-like markup in it. |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
Aliases for this method include: |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=over |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item looks_like_xml |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item looks_like_markup |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=back |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=cut |
435
|
|
|
|
|
|
|
|
436
|
31
|
|
|
31
|
1
|
1972
|
sub looks_like_html { return $_[1] =~ m/[<>]|&[\#\w]+;/o } |
437
|
|
|
|
|
|
|
*looks_like_xml = \&looks_like_html; |
438
|
|
|
|
|
|
|
*looks_like_markup = \&looks_like_html; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 start_tag( I [, I<\%attr> ] ) |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head2 end_tag( I ) |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Returns I as a tag, either start or end. I will be escaped for any non-valid |
445
|
|
|
|
|
|
|
chars using tag_safe(). |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
If I<\%attr> is passed, XML-safe attributes are generated using attr_safe(). |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head2 singleton( I [, I<\%attr> ] ) |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
Like start_tag() but includes the closing slash. |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=cut |
454
|
|
|
|
|
|
|
|
455
|
54
|
|
|
54
|
1
|
685
|
sub start_tag { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . ">" } |
456
|
53
|
|
|
53
|
1
|
70
|
sub end_tag { "" . tag_safe( $_[1] ) . ">" } |
457
|
0
|
|
|
0
|
1
|
0
|
sub singleton { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . "/>" } |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=pod |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=head2 tag_safe( I ) |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Create a valid XML tag name, escaping/omitting invalid characters. |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
Example: |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
my $tag = Search::Tools::XML->tag_safe( '1 * ! tag foo' ); |
468
|
|
|
|
|
|
|
# $tag == '______tag_foo' |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=cut |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
sub tag_safe { |
473
|
111
|
|
|
111
|
1
|
118
|
my $t = pop; |
474
|
|
|
|
|
|
|
|
475
|
111
|
50
|
|
|
|
148
|
return '_' unless length $t; |
476
|
|
|
|
|
|
|
|
477
|
111
|
|
|
|
|
145
|
$t =~ s/::/_/g; # single colons ok, but doubles are not |
478
|
111
|
|
|
|
|
132
|
$t =~ s/[^-\.\w:]/_/g; |
479
|
111
|
|
|
|
|
142
|
$t =~ s/^(\d)/_$1/; |
480
|
|
|
|
|
|
|
|
481
|
111
|
|
|
|
|
270
|
return $t; |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=head2 attr_safe( I<\%attr> ) |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Returns stringified I<\%attr> as XML attributes. |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=cut |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub attr_safe { |
491
|
54
|
|
|
54
|
1
|
65
|
my $self = shift; |
492
|
54
|
|
|
|
|
66
|
my $attr = shift; |
493
|
54
|
100
|
|
|
|
122
|
return '' unless defined $attr; |
494
|
11
|
50
|
|
|
|
22
|
if ( ref $attr ne "HASH" ) { |
495
|
0
|
|
|
|
|
0
|
croak "attributes must be a hash ref"; |
496
|
|
|
|
|
|
|
} |
497
|
11
|
|
|
|
|
19
|
my @xml = (''); # force space at start in return |
498
|
11
|
|
|
|
|
26
|
for my $name ( sort keys %$attr ) { |
499
|
|
|
|
|
|
|
my $val = _escape_xml( $attr->{$name}, |
500
|
4
|
|
|
|
|
15
|
is_flagged_utf8( $attr->{$name} ) ); |
501
|
4
|
|
|
|
|
10
|
push @xml, tag_safe($name) . qq{="$val"}; |
502
|
|
|
|
|
|
|
} |
503
|
11
|
|
|
|
|
36
|
return join( ' ', @xml ); |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=pod |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 utf8_safe( I ) |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Return I with special XML chars and all |
511
|
|
|
|
|
|
|
non-ASCII chars converted to numeric entities. |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
This is escape() on steroids. B |
514
|
|
|
|
|
|
|
unless you know what you're doing. See the SYNOPSIS for an example. |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head2 escape_utf8 |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Alias for utf8_safe(). |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
*escape_utf8 = \&utf8_safe; |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
sub utf8_safe { |
525
|
27
|
|
|
27
|
1
|
561
|
my $t = pop; |
526
|
27
|
50
|
|
|
|
39
|
$t = '' unless defined $t; |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# converts all low chars except \t \n and \r |
529
|
|
|
|
|
|
|
# to space because XML spec disallows <32 |
530
|
27
|
|
|
|
|
49
|
$t =~ s,[\x00-\x08\x0b-\x0c\x0e-\x1f], ,g; |
531
|
|
|
|
|
|
|
|
532
|
27
|
|
|
|
|
48
|
$t =~ s{([^\x09\x0a\x0d\x20\x21\x23-\x25\x28-\x3b\x3d\x3F-\x5B\x5D-\x7E])} |
|
2
|
|
|
|
|
8
|
|
533
|
|
|
|
|
|
|
{''.(ord($1)).';'}eg; |
534
|
27
|
|
|
|
|
49
|
|
535
|
|
|
|
|
|
|
return $t; |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=head2 no_html( I [, I] ) |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
no_html() is a brute-force method for removing all tags and entities |
541
|
|
|
|
|
|
|
from I. A simple regular expression is used, so things like |
542
|
|
|
|
|
|
|
nested comments and the like will probably break. If you really |
543
|
|
|
|
|
|
|
need to reliably filter out the tags and entities from a HTML text, use |
544
|
|
|
|
|
|
|
HTML::Parser or similar. |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
I is returned with no markup in it. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
If I is true (defaults to false) then |
549
|
|
|
|
|
|
|
all whitespace is normalized away to ASCII space (U+0020). |
550
|
|
|
|
|
|
|
This can be helpful if you have Unicode entities representing |
551
|
|
|
|
|
|
|
line breaks or other layout instructions. |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=cut |
554
|
|
|
|
|
|
|
|
555
|
10
|
|
|
10
|
1
|
535
|
sub no_html { |
556
|
10
|
|
|
|
|
18
|
my $class = shift; |
557
|
10
|
|
100
|
|
|
56
|
my $text = shift; |
558
|
10
|
50
|
|
|
|
103
|
my $normalize_whitespace = shift || 0; |
559
|
0
|
|
|
|
|
0
|
if ( !defined $text ) { |
560
|
|
|
|
|
|
|
croak "text required"; |
561
|
10
|
|
|
|
|
34
|
} |
562
|
10
|
|
|
|
|
2283
|
my $re = $class->tag_re; |
563
|
10
|
|
|
|
|
37
|
$text =~ s,$re,,g; |
564
|
10
|
100
|
|
|
|
28
|
$text = $class->unescape($text); |
565
|
1
|
|
|
|
|
8
|
if ($normalize_whitespace) { |
566
|
|
|
|
|
|
|
$text =~ s/\s+/ /g; |
567
|
10
|
|
|
|
|
57
|
} |
568
|
|
|
|
|
|
|
return $text; |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=head2 strip_html |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
An alias for no_html(). |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head2 strip_markup |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
An alias for no_html(). |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=cut |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
*strip_html = \&no_html; |
582
|
|
|
|
|
|
|
*strip_markup = \&no_html; |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
=head2 escape( I ) |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
Similar to escape() functions in more famous CPAN modules, but without the |
587
|
|
|
|
|
|
|
added dependency. escape() will convert the special XML chars (><'"&) to their |
588
|
|
|
|
|
|
|
named entity equivalents. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
The escaped I is returned. |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
B The API for this method has changed as of version 0.16. I |
593
|
|
|
|
|
|
|
is no longer modified in-place. |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
As of version 0.27 escape() is written in C/XS for speed. |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
=cut |
598
|
|
|
|
|
|
|
|
599
|
2
|
|
|
2
|
1
|
959
|
sub escape { |
600
|
2
|
50
|
|
|
|
8
|
my $text = pop; |
601
|
2
|
|
|
|
|
6
|
return unless defined $text; |
602
|
|
|
|
|
|
|
return _escape_xml( $text, is_flagged_utf8($text) ); |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head2 unescape( I ) |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Similar to unescape() functions in more famous CPAN modules, but without the added |
608
|
|
|
|
|
|
|
dependency. unescape() will convert all entities to their chr() equivalents. |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
B unescape() does more than reverse the effects of escape(). It attempts |
611
|
|
|
|
|
|
|
to resolve B entities, not just the special XML entities (><'"&). |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
B The API for this method has changed as of version 0.16. |
614
|
|
|
|
|
|
|
I is no longer modified in-place. |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=cut |
617
|
|
|
|
|
|
|
|
618
|
11
|
|
|
11
|
1
|
635
|
sub unescape { |
619
|
11
|
|
|
|
|
35
|
my $text = pop; |
620
|
11
|
|
|
|
|
34
|
$text = unescape_named($text); |
621
|
11
|
|
|
|
|
25
|
$text = unescape_decimal($text); |
622
|
|
|
|
|
|
|
return $text; |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
=head2 unescape_named( I ) |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
Replace all named HTML entities with their chr() equivalents. |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Returns modified copy of I. |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=cut |
632
|
|
|
|
|
|
|
|
633
|
13
|
|
|
13
|
1
|
27
|
sub unescape_named { |
634
|
13
|
50
|
|
|
|
39
|
my $t = pop; |
635
|
|
|
|
|
|
|
if ( defined($t) ) { |
636
|
|
|
|
|
|
|
|
637
|
13
|
100
|
|
|
|
63
|
# named entities - check first to see if it is worth looping |
638
|
7
|
|
|
|
|
274
|
if ( $t =~ m/&[a-zA-Z0-9]+;/ ) { |
639
|
1771
|
|
|
|
|
2427
|
for my $e ( keys %HTML_ents ) { |
640
|
1771
|
100
|
|
|
|
10203
|
my $dec = $HTML_ents{$e}; |
|
163
|
|
|
|
|
354
|
|
641
|
|
|
|
|
|
|
if ( my $n = $t =~ s/&$e;/chr($dec)/eg ) { |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
#warn "replaced $e ($dec) -> $HTML_ents{$e} $n times in text"; |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
} |
647
|
13
|
|
|
|
|
92
|
} |
648
|
|
|
|
|
|
|
return $t; |
649
|
|
|
|
|
|
|
} |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=head2 unescape_decimal( I ) |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
Replace all decimal entities with their chr() equivalents. |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
Returns modified copy of I. |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=cut |
658
|
|
|
|
|
|
|
|
659
|
11
|
|
|
11
|
1
|
23
|
sub unescape_decimal { |
660
|
|
|
|
|
|
|
my $t = pop; |
661
|
|
|
|
|
|
|
|
662
|
11
|
50
|
|
|
|
55
|
# resolve numeric entities as best we can |
|
5
|
|
|
|
|
20
|
|
663
|
11
|
|
|
|
|
26
|
$t =~ s/(\d+);/chr($1)/ego if defined($t); |
664
|
|
|
|
|
|
|
return $t; |
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=head2 perl_to_xml( I[ [, I] ) ] |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
Similar to the XML::Simple XMLout() feature, perl_to_xml() |
670
|
|
|
|
|
|
|
will take a Perl data structure I[ and convert it to XML. ] |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
I should be a hashref with the following supported key/value pairs: |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=over |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item root I |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
The root element. If I is a string, it is used as the tag name. If |
679
|
|
|
|
|
|
|
I is a hashref, two keys are required: |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=over |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=item tag |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
String indicating the element name. |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item attrs |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
Hash ref of attribute key/value pairs (see start_tag()). |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=back |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=item wrap_array I<1|0> |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
If B is true (the default), arrayref items are wrapped |
696
|
|
|
|
|
|
|
in an additional XML tag, keeping the array items enclosed in a logical set. |
697
|
|
|
|
|
|
|
If B is false, each item in the array is treated individually. |
698
|
|
|
|
|
|
|
See B below for the naming convention for arrayref items. |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=item strip_plural I<1|0> |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
The B option interacts with the B option. |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
If B is a true value and not a CODE ref, |
705
|
|
|
|
|
|
|
any trailing C character will be stripped from the enclosing tag name |
706
|
|
|
|
|
|
|
whenever an array of hashrefs is found. Example: |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
my $data = { |
709
|
|
|
|
|
|
|
values => [ |
710
|
|
|
|
|
|
|
{ two => 2, |
711
|
|
|
|
|
|
|
three => 3, |
712
|
|
|
|
|
|
|
}, |
713
|
|
|
|
|
|
|
{ four => 4, |
714
|
|
|
|
|
|
|
five => 5, |
715
|
|
|
|
|
|
|
}, |
716
|
|
|
|
|
|
|
], |
717
|
|
|
|
|
|
|
}; |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
my $xml = $utils->perl_to_xml($data, { |
720
|
|
|
|
|
|
|
root => 'data', |
721
|
|
|
|
|
|
|
wrap_array => 1, |
722
|
|
|
|
|
|
|
strip_plural => 1, |
723
|
|
|
|
|
|
|
}); |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
# $xml DOM will look like: |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
3 |
731
|
|
|
|
|
|
|
2 |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
5 |
735
|
|
|
|
|
|
|
4 |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
Obviously stripping the final C will not always render sensical tag names. |
741
|
|
|
|
|
|
|
Pass a CODE ref instead, expecting one value (the tag name) and returning the |
742
|
|
|
|
|
|
|
tag name to use: |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
my $xml = $utils->perl_to_xml($data, { |
745
|
|
|
|
|
|
|
root => 'data', |
746
|
|
|
|
|
|
|
wrap_array => 1, |
747
|
|
|
|
|
|
|
strip_plural => sub { |
748
|
|
|
|
|
|
|
my $tag = shift; |
749
|
|
|
|
|
|
|
$tag =~ s/foo/BAR/; |
750
|
|
|
|
|
|
|
return $tag; |
751
|
|
|
|
|
|
|
}, |
752
|
|
|
|
|
|
|
}); |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=item escape I<1|0> |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
If B is false, strings within the B[ value will not be passed ] |
757
|
|
|
|
|
|
|
through escape(). Default is true. |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=back |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=cut |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=head2 perl_to_xml( I[, I [, I ][, I] ) ] |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
This second usage is deprecated and here for backwards compatability only. |
766
|
|
|
|
|
|
|
Use the named key/value I instead. Readers of your code (including you!) will |
767
|
|
|
|
|
|
|
thank you. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=cut |
770
|
|
|
|
|
|
|
|
771
|
4
|
|
|
4
|
|
5
|
sub _make_singular { |
772
|
4
|
|
|
|
|
9
|
my ($t) = @_; |
773
|
4
|
50
|
|
|
|
9
|
$t =~ s/ies$/y/i; |
774
|
4
|
50
|
|
|
|
11
|
return $t if ( $t =~ s/ses$/s/i ); |
775
|
4
|
|
|
|
|
12
|
return $t if ( $t =~ /[aeiouy]ss$/i ); |
776
|
4
|
50
|
|
|
|
10
|
$t =~ s/s$//i; |
777
|
|
|
|
|
|
|
return length $t ? $t : $_[0]; |
778
|
|
|
|
|
|
|
} |
779
|
|
|
|
|
|
|
|
780
|
4
|
|
|
4
|
1
|
97
|
sub perl_to_xml { |
781
|
4
|
|
|
|
|
7
|
my $self = shift; |
782
|
|
|
|
|
|
|
my $perl = shift; |
783
|
4
|
|
|
|
|
5
|
|
784
|
4
|
100
|
66
|
|
|
19
|
my ( $root, $wrap_array, $strip_plural, $escape ); |
785
|
2
|
|
|
|
|
3
|
if ( ref $_[0] eq 'HASH' and !exists $_[0]->{tag} ) { |
|
2
|
|
|
|
|
7
|
|
786
|
2
|
|
50
|
|
|
8
|
my %opts = %{ $_[0] }; |
787
|
2
|
|
100
|
|
|
6
|
$root = delete $opts{root} || '_root'; |
788
|
2
|
|
|
|
|
4
|
$strip_plural = delete $opts{strip_plural} || 0; |
789
|
2
|
100
|
|
|
|
4
|
$wrap_array = delete $opts{wrap_array}; |
790
|
2
|
|
|
|
|
4
|
$wrap_array = 1 unless defined $wrap_array; |
791
|
2
|
100
|
|
|
|
4
|
$escape = delete $opts{escape}; |
792
|
|
|
|
|
|
|
$escape = 1 unless defined $escape; |
793
|
|
|
|
|
|
|
} |
794
|
2
|
|
50
|
|
|
6
|
else { |
795
|
2
|
|
100
|
|
|
6
|
$root = shift || '_root'; |
796
|
2
|
|
|
|
|
4
|
$strip_plural = shift || 0; |
797
|
|
|
|
|
|
|
$escape = shift; |
798
|
|
|
|
|
|
|
|
799
|
2
|
50
|
33
|
|
|
13
|
# backcompat means we need to reverse logic |
|
|
50
|
33
|
|
|
|
|
|
|
50
|
|
|
|
|
|
800
|
0
|
|
|
|
|
0
|
if ( defined $escape and $escape == 1 ) { |
801
|
|
|
|
|
|
|
$escape = 0; |
802
|
|
|
|
|
|
|
} |
803
|
0
|
|
|
|
|
0
|
elsif ( defined $escape and $escape == 0 ) { |
804
|
|
|
|
|
|
|
$escape = 1; |
805
|
|
|
|
|
|
|
} |
806
|
2
|
|
|
|
|
3
|
elsif ( !defined $escape ) { |
807
|
|
|
|
|
|
|
$escape = 1; |
808
|
|
|
|
|
|
|
} |
809
|
2
|
|
|
|
|
3
|
|
810
|
|
|
|
|
|
|
$wrap_array = 1; # old behavior |
811
|
4
|
50
|
|
|
|
7
|
} |
812
|
0
|
|
|
|
|
0
|
unless ( defined $perl ) { |
813
|
|
|
|
|
|
|
croak "perl data struct required"; |
814
|
|
|
|
|
|
|
} |
815
|
4
|
100
|
66
|
|
|
13
|
|
816
|
2
|
|
|
|
|
4
|
if ( $strip_plural and ref($strip_plural) ne 'CODE' ) { |
817
|
|
|
|
|
|
|
$strip_plural = \&_make_singular; |
818
|
|
|
|
|
|
|
} |
819
|
4
|
|
|
|
|
5
|
|
820
|
4
|
100
|
|
|
|
8
|
my ( $root_tag, $attrs ); |
821
|
1
|
50
|
|
|
|
3
|
if ( ref $root ) { |
822
|
1
|
50
|
|
|
|
3
|
$root_tag = delete $root->{tag} or croak 'tag key required in root'; |
823
|
|
|
|
|
|
|
$attrs = delete $root->{attrs} or croak 'attrs key required in root'; |
824
|
|
|
|
|
|
|
} |
825
|
3
|
|
|
|
|
4
|
else { |
826
|
3
|
|
|
|
|
5
|
$root_tag = $root; |
827
|
|
|
|
|
|
|
$attrs = {}; |
828
|
|
|
|
|
|
|
} |
829
|
4
|
50
|
|
|
|
8
|
|
830
|
|
|
|
|
|
|
if ( !ref $perl ) { |
831
|
0
|
0
|
|
|
|
0
|
return |
832
|
|
|
|
|
|
|
$self->start_tag( $root_tag, $attrs ) |
833
|
|
|
|
|
|
|
. ( $escape ? $self->utf8_safe($perl) : $perl ) |
834
|
|
|
|
|
|
|
. $self->end_tag($root_tag); |
835
|
|
|
|
|
|
|
} |
836
|
4
|
|
|
|
|
11
|
|
837
|
4
|
|
|
|
|
12
|
my $xml = $self->start_tag( $root_tag, $attrs ); |
838
|
|
|
|
|
|
|
$self->_ref_to_xml( $perl, '', \$xml, $strip_plural, $escape, |
839
|
4
|
|
|
|
|
8
|
$wrap_array ); |
840
|
4
|
|
|
|
|
18
|
$xml .= $self->end_tag($root_tag); |
841
|
|
|
|
|
|
|
return $xml; |
842
|
|
|
|
|
|
|
} |
843
|
|
|
|
|
|
|
|
844
|
58
|
|
|
58
|
|
118
|
sub _ref_to_xml { |
845
|
|
|
|
|
|
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
846
|
58
|
|
|
|
|
74
|
= @_; |
847
|
58
|
100
|
|
|
|
98
|
my $type = ref $perl; |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
848
|
31
|
50
|
|
|
|
60
|
if ( !$type ) { |
849
|
|
|
|
|
|
|
( $$xml_ref .= $self->start_tag($root) ) |
850
|
31
|
100
|
|
|
|
70
|
if length($root); |
851
|
31
|
50
|
|
|
|
67
|
$$xml_ref .= ( $escape ? $self->utf8_safe($perl) : $perl ); |
852
|
|
|
|
|
|
|
( $$xml_ref .= $self->end_tag($root) ) |
853
|
|
|
|
|
|
|
if length($root); |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
856
|
|
|
|
|
|
|
} |
857
|
0
|
|
|
|
|
0
|
elsif ( $type eq 'SCALAR' ) { |
858
|
|
|
|
|
|
|
$self->_scalar_to_xml( $perl, $root, $xml_ref, $strip_plural, |
859
|
|
|
|
|
|
|
$escape, $wrap_array ); |
860
|
|
|
|
|
|
|
} |
861
|
7
|
|
|
|
|
17
|
elsif ( $type eq 'ARRAY' ) { |
862
|
|
|
|
|
|
|
$self->_array_to_xml( $perl, $root, $xml_ref, $strip_plural, |
863
|
|
|
|
|
|
|
$escape, $wrap_array ); |
864
|
|
|
|
|
|
|
} |
865
|
17
|
|
|
|
|
34
|
elsif ( $type eq 'HASH' ) { |
866
|
|
|
|
|
|
|
$self->_hash_to_xml( $perl, $root, $xml_ref, $strip_plural, $escape, |
867
|
|
|
|
|
|
|
$wrap_array ); |
868
|
|
|
|
|
|
|
} |
869
|
|
|
|
|
|
|
else { |
870
|
3
|
|
|
|
|
26
|
# assume blessed object, force it to stringify as a scalar |
871
|
|
|
|
|
|
|
$self->_scalar_to_xml( "$perl", $root, $xml_ref, $strip_plural, |
872
|
|
|
|
|
|
|
$escape, $wrap_array ); |
873
|
|
|
|
|
|
|
} |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
} |
876
|
|
|
|
|
|
|
|
877
|
7
|
|
|
7
|
|
10
|
sub _array_to_xml { |
878
|
|
|
|
|
|
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
879
|
7
|
|
|
|
|
13
|
= @_; |
880
|
25
|
100
|
66
|
|
|
89
|
for my $thing (@$perl) { |
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
881
|
|
|
|
|
|
|
if ( ref $thing |
882
|
|
|
|
|
|
|
and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' ) |
883
|
|
|
|
|
|
|
and length($root) |
884
|
|
|
|
|
|
|
and $wrap_array ) |
885
|
|
|
|
|
|
|
{ |
886
|
8
|
|
|
|
|
15
|
#warn "<$root> ref $thing == " . ref($thing); |
887
|
|
|
|
|
|
|
$$xml_ref .= $self->start_tag($root); |
888
|
25
|
|
|
|
|
49
|
} |
889
|
|
|
|
|
|
|
$self->_ref_to_xml( $thing, $root, $xml_ref, $strip_plural, $escape, |
890
|
25
|
100
|
66
|
|
|
95
|
$wrap_array ); |
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
891
|
|
|
|
|
|
|
if ( ref $thing |
892
|
|
|
|
|
|
|
and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' ) |
893
|
|
|
|
|
|
|
and length($root) |
894
|
|
|
|
|
|
|
and $wrap_array ) |
895
|
|
|
|
|
|
|
{ |
896
|
8
|
|
|
|
|
14
|
#warn "$root> ref $thing == " . ref($thing); |
897
|
|
|
|
|
|
|
$$xml_ref .= $self->end_tag($root); |
898
|
|
|
|
|
|
|
} |
899
|
|
|
|
|
|
|
} |
900
|
|
|
|
|
|
|
} |
901
|
|
|
|
|
|
|
|
902
|
17
|
|
|
17
|
|
23
|
sub _hash_to_xml { |
903
|
|
|
|
|
|
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
904
|
17
|
|
|
|
|
35
|
= @_; |
905
|
29
|
|
|
|
|
35
|
for my $key ( keys %$perl ) { |
906
|
29
|
100
|
|
|
|
40
|
my $thing = $perl->{$key}; |
907
|
8
|
|
|
|
|
11
|
if ( ref $thing ) { |
908
|
8
|
|
|
|
|
8
|
my $key_to_pass = $key; |
909
|
8
|
100
|
100
|
|
|
25
|
my %attr; |
910
|
4
|
|
|
|
|
10
|
if ( ref $thing eq 'ARRAY' && $strip_plural ) { |
911
|
4
|
|
|
|
|
7
|
$key_to_pass = $strip_plural->($key_to_pass); |
912
|
|
|
|
|
|
|
$attr{count} = scalar @$thing; |
913
|
8
|
100
|
100
|
|
|
27
|
} |
914
|
6
|
|
|
|
|
14
|
if ( ref $thing ne 'ARRAY' or $wrap_array ) { |
915
|
|
|
|
|
|
|
$$xml_ref .= $self->start_tag( $key, \%attr ); |
916
|
|
|
|
|
|
|
} |
917
|
8
|
|
|
|
|
19
|
$self->_ref_to_xml( |
918
|
|
|
|
|
|
|
$thing, $key_to_pass, $xml_ref, |
919
|
|
|
|
|
|
|
$strip_plural, $escape, $wrap_array |
920
|
8
|
100
|
100
|
|
|
27
|
); |
921
|
6
|
|
|
|
|
11
|
if ( ref $thing ne 'ARRAY' or $wrap_array ) { |
922
|
|
|
|
|
|
|
$$xml_ref .= $self->end_tag($key); |
923
|
|
|
|
|
|
|
} |
924
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
926
|
|
|
|
|
|
|
} |
927
|
21
|
|
|
|
|
41
|
else { |
928
|
|
|
|
|
|
|
$self->_ref_to_xml( $thing, $key, $xml_ref, $strip_plural, |
929
|
|
|
|
|
|
|
$escape, $wrap_array ); |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
} |
932
|
|
|
|
|
|
|
} |
933
|
|
|
|
|
|
|
|
934
|
3
|
|
|
3
|
|
14
|
sub _scalar_to_xml { |
935
|
|
|
|
|
|
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
936
|
3
|
100
|
|
|
|
7
|
= @_; |
937
|
|
|
|
|
|
|
$$xml_ref |
938
|
|
|
|
|
|
|
.= $self->start_tag($root) |
939
|
|
|
|
|
|
|
. ( $escape ? $self->utf8_safe($perl) : $perl ) |
940
|
|
|
|
|
|
|
. $self->end_tag($root); |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
=head2 tidy( I ) |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
Attempts to indent I correctly to make |
948
|
|
|
|
|
|
|
it more legible. |
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
Returns the I tidied up. |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
B This is an experimental feature. It might be |
953
|
|
|
|
|
|
|
really slow or eat your XML. You have been warned. |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=cut |
956
|
|
|
|
|
|
|
|
957
|
0
|
|
|
0
|
1
|
|
sub tidy { |
958
|
0
|
|
|
|
|
|
my $xml = pop; |
959
|
0
|
|
|
|
|
|
my $level = 2; |
960
|
0
|
|
|
|
|
|
my $indent = 0; |
961
|
|
|
|
|
|
|
my @tidy = (); |
962
|
|
|
|
|
|
|
|
963
|
0
|
|
|
|
|
|
# normalize tag breaks |
964
|
|
|
|
|
|
|
$xml =~ s,>\s*<,>\n<,gs; |
965
|
0
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
my @xmlarr = split( m/\n/, $xml ); |
967
|
|
|
|
|
|
|
|
968
|
0
|
0
|
0
|
|
|
|
# shift off declaration |
969
|
0
|
|
|
|
|
|
if ( scalar(@xmlarr) and $xmlarr[0] =~ m/^<\?\s*xml/ ) { |
970
|
|
|
|
|
|
|
push @tidy, shift(@xmlarr); |
971
|
|
|
|
|
|
|
} |
972
|
0
|
|
|
|
|
|
|
973
|
0
|
|
|
|
|
|
my $count = 0; |
974
|
|
|
|
|
|
|
for my $el (@xmlarr) { |
975
|
0
|
0
|
|
|
|
|
|
976
|
0
|
|
|
|
|
|
if ( $count == 1 ) { |
977
|
|
|
|
|
|
|
$indent = 2; |
978
|
0
|
0
|
|
|
|
|
} |
979
|
0
|
|
|
|
|
|
if ( $count == scalar(@xmlarr) - 1 ) { |
980
|
|
|
|
|
|
|
$indent = 0; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
#warn "el: $el\n"; |
984
|
|
|
|
|
|
|
|
985
|
0
|
0
|
|
|
|
|
# singletons get special treatment |
|
|
0
|
|
|
|
|
|
986
|
|
|
|
|
|
|
if ( $el =~ m/^<([\w])+[^>]*\/>$/ ) { |
987
|
0
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# match opening tag |
992
|
|
|
|
|
|
|
elsif ( $el =~ m/^<([\w])+[^>]*>$/ ) { |
993
|
|
|
|
|
|
|
|
994
|
0
|
|
|
|
|
|
#warn "open $indent\n"; |
995
|
0
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
996
|
|
|
|
|
|
|
$indent += $level; |
997
|
|
|
|
|
|
|
} |
998
|
0
|
0
|
|
|
|
|
else { |
999
|
|
|
|
|
|
|
if ( $el =~ m/^<\// ) { |
1000
|
|
|
|
|
|
|
|
1001
|
0
|
|
|
|
|
|
#warn "close $indent\n"; |
1002
|
|
|
|
|
|
|
$indent -= $level; # closing tag |
1003
|
0
|
0
|
|
|
|
|
} |
1004
|
0
|
|
|
|
|
|
if ( $indent < 0 ) { |
1005
|
|
|
|
|
|
|
$indent += $level; |
1006
|
0
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
1008
|
|
|
|
|
|
|
} |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
#warn "indent = $indent\n"; |
1011
|
|
|
|
|
|
|
|
1012
|
0
|
|
|
|
|
|
#Data::Dump::dump \@tidy; |
1013
|
|
|
|
|
|
|
$count++; |
1014
|
|
|
|
|
|
|
} |
1015
|
0
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
return join( "\n", @tidy ); |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
} |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
1; |
1021
|
|
|
|
|
|
|
__END__ |