| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Search::Tools::XML; |
|
2
|
31
|
|
|
31
|
|
153768
|
use Moo; |
|
|
31
|
|
|
|
|
87794
|
|
|
|
31
|
|
|
|
|
145
|
|
|
3
|
31
|
|
|
31
|
|
12920
|
use Carp; |
|
|
31
|
|
|
|
|
39
|
|
|
|
31
|
|
|
|
|
1375
|
|
|
4
|
31
|
|
|
31
|
|
4182
|
use Search::Tools; # XS required |
|
|
31
|
|
|
|
|
36
|
|
|
|
31
|
|
|
|
|
597
|
|
|
5
|
31
|
|
|
31
|
|
5498
|
use Search::Tools::UTF8; |
|
|
31
|
|
|
|
|
50
|
|
|
|
31
|
|
|
|
|
2510
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
31
|
|
|
31
|
|
6321
|
use namespace::autoclean; |
|
|
31
|
|
|
|
|
145543
|
|
|
|
31
|
|
|
|
|
135
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION = '1.004'; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=pod |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Search::Tools::XML - methods for playing nice with XML and HTML |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
use Search::Tools::XML; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my $class = 'Search::Tools::XML'; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $text = 'the "quick brown" fox'; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my $xml = $class->start_tag('foo'); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
$xml .= $class->utf8_safe( $text ); |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$xml .= $class->end_tag('foo'); |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# $xml: the "quick brown" fox |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
$xml = $class->escape( $xml ); |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# $xml: <foo>the "quick brown" fox</foo> |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
$xml = $class->unescape( $xml ); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# $xml: the "quick brown" fox |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $plain = $class->no_html( $xml ); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# $plain eq $text |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
B The API for escape() and unescape() has changed as of version 0.16. |
|
49
|
|
|
|
|
|
|
The text is no longer modified in place, as this was less intuitive. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Search::Tools::XML provides utility methods for dealing with XML and HTML. |
|
52
|
|
|
|
|
|
|
There isn't really anything new here that CPAN doesn't provide via HTML::Entities |
|
53
|
|
|
|
|
|
|
or similar modules. The difference is convenience: the most common methods you |
|
54
|
|
|
|
|
|
|
need for search apps are in one place with no extra dependencies. |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
B To get full UTF-8 character set from chr() you must be using Perl >= 5.8. |
|
57
|
|
|
|
|
|
|
This affects things like the unescape* methods. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head1 VARIABLES |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head2 %HTML_ents |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
Complete map of all named HTML entities to their decimal values. |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=cut |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# regexp for what constitutes whitespace in an HTML doc |
|
68
|
|
|
|
|
|
|
# it's not as simple as \s| so we define it separately |
|
69
|
|
|
|
|
|
|
my @white_hex_pts = qw( |
|
70
|
|
|
|
|
|
|
0009 |
|
71
|
|
|
|
|
|
|
000a |
|
72
|
|
|
|
|
|
|
000b |
|
73
|
|
|
|
|
|
|
000c |
|
74
|
|
|
|
|
|
|
000d |
|
75
|
|
|
|
|
|
|
0020 |
|
76
|
|
|
|
|
|
|
00a0 |
|
77
|
|
|
|
|
|
|
2000 |
|
78
|
|
|
|
|
|
|
2001 |
|
79
|
|
|
|
|
|
|
2002 |
|
80
|
|
|
|
|
|
|
2003 |
|
81
|
|
|
|
|
|
|
2004 |
|
82
|
|
|
|
|
|
|
2005 |
|
83
|
|
|
|
|
|
|
2006 |
|
84
|
|
|
|
|
|
|
2007 |
|
85
|
|
|
|
|
|
|
2008 |
|
86
|
|
|
|
|
|
|
2009 |
|
87
|
|
|
|
|
|
|
200a |
|
88
|
|
|
|
|
|
|
200b |
|
89
|
|
|
|
|
|
|
2028 |
|
90
|
|
|
|
|
|
|
2029 |
|
91
|
|
|
|
|
|
|
202f |
|
92
|
|
|
|
|
|
|
205f |
|
93
|
|
|
|
|
|
|
2060 |
|
94
|
|
|
|
|
|
|
3000 |
|
95
|
|
|
|
|
|
|
); |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
my @whitesp = ( '\s', ' ' ); |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# NOTE that the pound sign # needs escaping because we use |
|
100
|
|
|
|
|
|
|
# the 'x' flag in our regexp. |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
for my $w (@white_hex_pts) { |
|
103
|
|
|
|
|
|
|
push @whitesp, sprintf( "&\\#x%s;", $w ); # hex entity |
|
104
|
|
|
|
|
|
|
push @whitesp, sprintf( "&\\#%s;", hex($w) ); # dec entity |
|
105
|
|
|
|
|
|
|
push @whitesp, sprintf( "\\%s", chr( hex($w) ) ); # byte value |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
my $HTML_WHITESPACE = join( '|', @whitesp ); |
|
109
|
|
|
|
|
|
|
my $WHITESPACE = join( '|', map { chr( hex($_) ) } @white_hex_pts ); |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# HTML entity table |
|
112
|
|
|
|
|
|
|
# this just removes a dependency on another module... |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
our %HTML_ents = ( |
|
115
|
|
|
|
|
|
|
quot => 34, |
|
116
|
|
|
|
|
|
|
amp => 38, |
|
117
|
|
|
|
|
|
|
apos => 39, |
|
118
|
|
|
|
|
|
|
'lt' => 60, |
|
119
|
|
|
|
|
|
|
'gt' => 62, |
|
120
|
|
|
|
|
|
|
nbsp => 160, |
|
121
|
|
|
|
|
|
|
iexcl => 161, |
|
122
|
|
|
|
|
|
|
cent => 162, |
|
123
|
|
|
|
|
|
|
pound => 163, |
|
124
|
|
|
|
|
|
|
curren => 164, |
|
125
|
|
|
|
|
|
|
yen => 165, |
|
126
|
|
|
|
|
|
|
brvbar => 166, |
|
127
|
|
|
|
|
|
|
sect => 167, |
|
128
|
|
|
|
|
|
|
uml => 168, |
|
129
|
|
|
|
|
|
|
copy => 169, |
|
130
|
|
|
|
|
|
|
ordf => 170, |
|
131
|
|
|
|
|
|
|
laquo => 171, |
|
132
|
|
|
|
|
|
|
not => 172, |
|
133
|
|
|
|
|
|
|
shy => 173, |
|
134
|
|
|
|
|
|
|
reg => 174, |
|
135
|
|
|
|
|
|
|
macr => 175, |
|
136
|
|
|
|
|
|
|
deg => 176, |
|
137
|
|
|
|
|
|
|
plusmn => 177, |
|
138
|
|
|
|
|
|
|
sup2 => 178, |
|
139
|
|
|
|
|
|
|
sup3 => 179, |
|
140
|
|
|
|
|
|
|
acute => 180, |
|
141
|
|
|
|
|
|
|
micro => 181, |
|
142
|
|
|
|
|
|
|
para => 182, |
|
143
|
|
|
|
|
|
|
middot => 183, |
|
144
|
|
|
|
|
|
|
cedil => 184, |
|
145
|
|
|
|
|
|
|
sup1 => 185, |
|
146
|
|
|
|
|
|
|
ordm => 186, |
|
147
|
|
|
|
|
|
|
raquo => 187, |
|
148
|
|
|
|
|
|
|
frac14 => 188, |
|
149
|
|
|
|
|
|
|
frac12 => 189, |
|
150
|
|
|
|
|
|
|
frac34 => 190, |
|
151
|
|
|
|
|
|
|
iquest => 191, |
|
152
|
|
|
|
|
|
|
Agrave => 192, |
|
153
|
|
|
|
|
|
|
Aacute => 193, |
|
154
|
|
|
|
|
|
|
Acirc => 194, |
|
155
|
|
|
|
|
|
|
Atilde => 195, |
|
156
|
|
|
|
|
|
|
Auml => 196, |
|
157
|
|
|
|
|
|
|
Aring => 197, |
|
158
|
|
|
|
|
|
|
AElig => 198, |
|
159
|
|
|
|
|
|
|
Ccedil => 199, |
|
160
|
|
|
|
|
|
|
Egrave => 200, |
|
161
|
|
|
|
|
|
|
Eacute => 201, |
|
162
|
|
|
|
|
|
|
Ecirc => 202, |
|
163
|
|
|
|
|
|
|
Euml => 203, |
|
164
|
|
|
|
|
|
|
Igrave => 204, |
|
165
|
|
|
|
|
|
|
Iacute => 205, |
|
166
|
|
|
|
|
|
|
Icirc => 206, |
|
167
|
|
|
|
|
|
|
Iuml => 207, |
|
168
|
|
|
|
|
|
|
ETH => 208, |
|
169
|
|
|
|
|
|
|
Ntilde => 209, |
|
170
|
|
|
|
|
|
|
Ograve => 210, |
|
171
|
|
|
|
|
|
|
Oacute => 211, |
|
172
|
|
|
|
|
|
|
Ocirc => 212, |
|
173
|
|
|
|
|
|
|
Otilde => 213, |
|
174
|
|
|
|
|
|
|
Ouml => 214, |
|
175
|
|
|
|
|
|
|
'times' => 215, |
|
176
|
|
|
|
|
|
|
Oslash => 216, |
|
177
|
|
|
|
|
|
|
Ugrave => 217, |
|
178
|
|
|
|
|
|
|
Uacute => 218, |
|
179
|
|
|
|
|
|
|
Ucirc => 219, |
|
180
|
|
|
|
|
|
|
Uuml => 220, |
|
181
|
|
|
|
|
|
|
Yacute => 221, |
|
182
|
|
|
|
|
|
|
THORN => 222, |
|
183
|
|
|
|
|
|
|
szlig => 223, |
|
184
|
|
|
|
|
|
|
agrave => 224, |
|
185
|
|
|
|
|
|
|
aacute => 225, |
|
186
|
|
|
|
|
|
|
acirc => 226, |
|
187
|
|
|
|
|
|
|
atilde => 227, |
|
188
|
|
|
|
|
|
|
auml => 228, |
|
189
|
|
|
|
|
|
|
aring => 229, |
|
190
|
|
|
|
|
|
|
aelig => 230, |
|
191
|
|
|
|
|
|
|
ccedil => 231, |
|
192
|
|
|
|
|
|
|
egrave => 232, |
|
193
|
|
|
|
|
|
|
eacute => 233, |
|
194
|
|
|
|
|
|
|
ecirc => 234, |
|
195
|
|
|
|
|
|
|
euml => 235, |
|
196
|
|
|
|
|
|
|
igrave => 236, |
|
197
|
|
|
|
|
|
|
iacute => 237, |
|
198
|
|
|
|
|
|
|
icirc => 238, |
|
199
|
|
|
|
|
|
|
iuml => 239, |
|
200
|
|
|
|
|
|
|
eth => 240, |
|
201
|
|
|
|
|
|
|
ntilde => 241, |
|
202
|
|
|
|
|
|
|
ograve => 242, |
|
203
|
|
|
|
|
|
|
oacute => 243, |
|
204
|
|
|
|
|
|
|
ocirc => 244, |
|
205
|
|
|
|
|
|
|
otilde => 245, |
|
206
|
|
|
|
|
|
|
ouml => 246, |
|
207
|
|
|
|
|
|
|
divide => 247, |
|
208
|
|
|
|
|
|
|
oslash => 248, |
|
209
|
|
|
|
|
|
|
ugrave => 249, |
|
210
|
|
|
|
|
|
|
uacute => 250, |
|
211
|
|
|
|
|
|
|
ucirc => 251, |
|
212
|
|
|
|
|
|
|
uuml => 252, |
|
213
|
|
|
|
|
|
|
yacute => 253, |
|
214
|
|
|
|
|
|
|
thorn => 254, |
|
215
|
|
|
|
|
|
|
yuml => 255, |
|
216
|
|
|
|
|
|
|
OElig => 338, |
|
217
|
|
|
|
|
|
|
oelig => 339, |
|
218
|
|
|
|
|
|
|
Scaron => 352, |
|
219
|
|
|
|
|
|
|
scaron => 353, |
|
220
|
|
|
|
|
|
|
Yuml => 376, |
|
221
|
|
|
|
|
|
|
fnof => 402, |
|
222
|
|
|
|
|
|
|
circ => 710, |
|
223
|
|
|
|
|
|
|
tilde => 732, |
|
224
|
|
|
|
|
|
|
Alpha => 913, |
|
225
|
|
|
|
|
|
|
Beta => 914, |
|
226
|
|
|
|
|
|
|
Gamma => 915, |
|
227
|
|
|
|
|
|
|
Delta => 916, |
|
228
|
|
|
|
|
|
|
Epsilon => 917, |
|
229
|
|
|
|
|
|
|
Zeta => 918, |
|
230
|
|
|
|
|
|
|
Eta => 919, |
|
231
|
|
|
|
|
|
|
Theta => 920, |
|
232
|
|
|
|
|
|
|
Iota => 921, |
|
233
|
|
|
|
|
|
|
Kappa => 922, |
|
234
|
|
|
|
|
|
|
Lambda => 923, |
|
235
|
|
|
|
|
|
|
Mu => 924, |
|
236
|
|
|
|
|
|
|
Nu => 925, |
|
237
|
|
|
|
|
|
|
Xi => 926, |
|
238
|
|
|
|
|
|
|
Omicron => 927, |
|
239
|
|
|
|
|
|
|
Pi => 928, |
|
240
|
|
|
|
|
|
|
Rho => 929, |
|
241
|
|
|
|
|
|
|
Sigma => 931, |
|
242
|
|
|
|
|
|
|
Tau => 932, |
|
243
|
|
|
|
|
|
|
Upsilon => 933, |
|
244
|
|
|
|
|
|
|
Phi => 934, |
|
245
|
|
|
|
|
|
|
Chi => 935, |
|
246
|
|
|
|
|
|
|
Psi => 936, |
|
247
|
|
|
|
|
|
|
Omega => 937, |
|
248
|
|
|
|
|
|
|
alpha => 945, |
|
249
|
|
|
|
|
|
|
beta => 946, |
|
250
|
|
|
|
|
|
|
gamma => 947, |
|
251
|
|
|
|
|
|
|
delta => 948, |
|
252
|
|
|
|
|
|
|
epsilon => 949, |
|
253
|
|
|
|
|
|
|
zeta => 950, |
|
254
|
|
|
|
|
|
|
eta => 951, |
|
255
|
|
|
|
|
|
|
theta => 952, |
|
256
|
|
|
|
|
|
|
iota => 953, |
|
257
|
|
|
|
|
|
|
kappa => 954, |
|
258
|
|
|
|
|
|
|
lambda => 955, |
|
259
|
|
|
|
|
|
|
mu => 956, |
|
260
|
|
|
|
|
|
|
nu => 957, |
|
261
|
|
|
|
|
|
|
xi => 958, |
|
262
|
|
|
|
|
|
|
omicron => 959, |
|
263
|
|
|
|
|
|
|
pi => 960, |
|
264
|
|
|
|
|
|
|
rho => 961, |
|
265
|
|
|
|
|
|
|
sigmaf => 962, |
|
266
|
|
|
|
|
|
|
sigma => 963, |
|
267
|
|
|
|
|
|
|
tau => 964, |
|
268
|
|
|
|
|
|
|
upsilon => 965, |
|
269
|
|
|
|
|
|
|
phi => 966, |
|
270
|
|
|
|
|
|
|
chi => 967, |
|
271
|
|
|
|
|
|
|
psi => 968, |
|
272
|
|
|
|
|
|
|
omega => 969, |
|
273
|
|
|
|
|
|
|
thetasym => 977, |
|
274
|
|
|
|
|
|
|
upsih => 978, |
|
275
|
|
|
|
|
|
|
piv => 982, |
|
276
|
|
|
|
|
|
|
ensp => 8194, |
|
277
|
|
|
|
|
|
|
emsp => 8195, |
|
278
|
|
|
|
|
|
|
thinsp => 8201, |
|
279
|
|
|
|
|
|
|
zwnj => 8204, |
|
280
|
|
|
|
|
|
|
zwj => 8205, |
|
281
|
|
|
|
|
|
|
lrm => 8206, |
|
282
|
|
|
|
|
|
|
rlm => 8207, |
|
283
|
|
|
|
|
|
|
ndash => 8211, |
|
284
|
|
|
|
|
|
|
mdash => 8212, |
|
285
|
|
|
|
|
|
|
lsquo => 8216, |
|
286
|
|
|
|
|
|
|
rsquo => 8217, |
|
287
|
|
|
|
|
|
|
sbquo => 8218, |
|
288
|
|
|
|
|
|
|
ldquo => 8220, |
|
289
|
|
|
|
|
|
|
rdquo => 8221, |
|
290
|
|
|
|
|
|
|
bdquo => 8222, |
|
291
|
|
|
|
|
|
|
dagger => 8224, |
|
292
|
|
|
|
|
|
|
Dagger => 8225, |
|
293
|
|
|
|
|
|
|
bull => 8226, |
|
294
|
|
|
|
|
|
|
hellip => 8230, |
|
295
|
|
|
|
|
|
|
permil => 8240, |
|
296
|
|
|
|
|
|
|
prime => 8242, |
|
297
|
|
|
|
|
|
|
Prime => 8243, |
|
298
|
|
|
|
|
|
|
lsaquo => 8249, |
|
299
|
|
|
|
|
|
|
rsaquo => 8250, |
|
300
|
|
|
|
|
|
|
oline => 8254, |
|
301
|
|
|
|
|
|
|
frasl => 8260, |
|
302
|
|
|
|
|
|
|
euro => 8364, |
|
303
|
|
|
|
|
|
|
image => 8465, |
|
304
|
|
|
|
|
|
|
weierp => 8472, |
|
305
|
|
|
|
|
|
|
real => 8476, |
|
306
|
|
|
|
|
|
|
trade => 8482, |
|
307
|
|
|
|
|
|
|
alefsym => 8501, |
|
308
|
|
|
|
|
|
|
larr => 8592, |
|
309
|
|
|
|
|
|
|
uarr => 8593, |
|
310
|
|
|
|
|
|
|
rarr => 8594, |
|
311
|
|
|
|
|
|
|
darr => 8595, |
|
312
|
|
|
|
|
|
|
harr => 8596, |
|
313
|
|
|
|
|
|
|
crarr => 8629, |
|
314
|
|
|
|
|
|
|
lArr => 8656, |
|
315
|
|
|
|
|
|
|
uArr => 8657, |
|
316
|
|
|
|
|
|
|
rArr => 8658, |
|
317
|
|
|
|
|
|
|
dArr => 8659, |
|
318
|
|
|
|
|
|
|
hArr => 8660, |
|
319
|
|
|
|
|
|
|
forall => 8704, |
|
320
|
|
|
|
|
|
|
part => 8706, |
|
321
|
|
|
|
|
|
|
exist => 8707, |
|
322
|
|
|
|
|
|
|
empty => 8709, |
|
323
|
|
|
|
|
|
|
nabla => 8711, |
|
324
|
|
|
|
|
|
|
isin => 8712, |
|
325
|
|
|
|
|
|
|
notin => 8713, |
|
326
|
|
|
|
|
|
|
ni => 8715, |
|
327
|
|
|
|
|
|
|
prod => 8719, |
|
328
|
|
|
|
|
|
|
'sum' => 8721, |
|
329
|
|
|
|
|
|
|
'minus' => 8722, |
|
330
|
|
|
|
|
|
|
lowast => 8727, |
|
331
|
|
|
|
|
|
|
radic => 8730, |
|
332
|
|
|
|
|
|
|
prop => 8733, |
|
333
|
|
|
|
|
|
|
infin => 8734, |
|
334
|
|
|
|
|
|
|
ang => 8736, |
|
335
|
|
|
|
|
|
|
'and' => 8743, |
|
336
|
|
|
|
|
|
|
'or' => 8744, |
|
337
|
|
|
|
|
|
|
cap => 8745, |
|
338
|
|
|
|
|
|
|
cup => 8746, |
|
339
|
|
|
|
|
|
|
int => 8747, |
|
340
|
|
|
|
|
|
|
there4 => 8756, |
|
341
|
|
|
|
|
|
|
sim => 8764, |
|
342
|
|
|
|
|
|
|
cong => 8773, |
|
343
|
|
|
|
|
|
|
asymp => 8776, |
|
344
|
|
|
|
|
|
|
ne => 8800, |
|
345
|
|
|
|
|
|
|
equiv => 8801, |
|
346
|
|
|
|
|
|
|
le => 8804, |
|
347
|
|
|
|
|
|
|
ge => 8805, |
|
348
|
|
|
|
|
|
|
sub => 8834, |
|
349
|
|
|
|
|
|
|
sup => 8835, |
|
350
|
|
|
|
|
|
|
nsub => 8836, |
|
351
|
|
|
|
|
|
|
sube => 8838, |
|
352
|
|
|
|
|
|
|
supe => 8839, |
|
353
|
|
|
|
|
|
|
oplus => 8853, |
|
354
|
|
|
|
|
|
|
otimes => 8855, |
|
355
|
|
|
|
|
|
|
perp => 8869, |
|
356
|
|
|
|
|
|
|
sdot => 8901, |
|
357
|
|
|
|
|
|
|
lceil => 8968, |
|
358
|
|
|
|
|
|
|
rceil => 8969, |
|
359
|
|
|
|
|
|
|
lfloor => 8970, |
|
360
|
|
|
|
|
|
|
rfloor => 8971, |
|
361
|
|
|
|
|
|
|
lang => 9001, |
|
362
|
|
|
|
|
|
|
rang => 9002, |
|
363
|
|
|
|
|
|
|
loz => 9674, |
|
364
|
|
|
|
|
|
|
spades => 9824, |
|
365
|
|
|
|
|
|
|
clubs => 9827, |
|
366
|
|
|
|
|
|
|
hearts => 9829, |
|
367
|
|
|
|
|
|
|
diams => 9830, |
|
368
|
|
|
|
|
|
|
); |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
my %char2entity = (); |
|
371
|
|
|
|
|
|
|
while ( my ( $e, $n ) = each(%HTML_ents) ) { |
|
372
|
|
|
|
|
|
|
my $char = chr($n); |
|
373
|
|
|
|
|
|
|
$char2entity{$char} = "&$e;"; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
delete $char2entity{q/'/}; # only one-way decoding |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# Fill in missing entities |
|
378
|
|
|
|
|
|
|
# TODO does this only work under latin1 locale? |
|
379
|
|
|
|
|
|
|
for ( 0 .. 255 ) { |
|
380
|
|
|
|
|
|
|
next if exists $char2entity{ chr($_) }; |
|
381
|
|
|
|
|
|
|
$char2entity{ chr($_) } = "$_;"; |
|
382
|
|
|
|
|
|
|
} |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head1 METHODS |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
The following methods may be accessed either as object or class methods. |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head2 new |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Create a Search::Tools::XML object. |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=cut |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head2 tag_re |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
Returns a qr// regex for matching a SGML (XML, HTML, etc) tag. |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=cut |
|
399
|
|
|
|
|
|
|
|
|
400
|
36
|
|
|
36
|
1
|
223
|
sub tag_re {qr/<[^>]+>/s} |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head2 html_whitespace |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Returns a regex for all whitespace characters and |
|
405
|
|
|
|
|
|
|
HTML whitespace entities. |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=cut |
|
408
|
|
|
|
|
|
|
|
|
409
|
26
|
|
|
26
|
1
|
317
|
sub html_whitespace {$HTML_WHITESPACE} |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=head2 char2ent_map |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Returns a hash reference to the class data mapping chr() values to their |
|
414
|
|
|
|
|
|
|
numerical entity equivalents. |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=cut |
|
417
|
|
|
|
|
|
|
|
|
418
|
26
|
|
|
26
|
1
|
59
|
sub char2ent_map { \%char2entity } |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 looks_like_html( I ) |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Returns true if I appears to have HTML-like markup in it. |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
Aliases for this method include: |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=over |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item looks_like_xml |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item looks_like_markup |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=back |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=cut |
|
435
|
|
|
|
|
|
|
|
|
436
|
31
|
|
|
31
|
1
|
2008
|
sub looks_like_html { return $_[1] =~ m/[<>]|&[\#\w]+;/o } |
|
437
|
|
|
|
|
|
|
*looks_like_xml = \&looks_like_html; |
|
438
|
|
|
|
|
|
|
*looks_like_markup = \&looks_like_html; |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 start_tag( I [, I<\%attr> ] ) |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head2 end_tag( I ) |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
Returns I as a tag, either start or end. I will be escaped for any non-valid |
|
445
|
|
|
|
|
|
|
chars using tag_safe(). |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
If I<\%attr> is passed, XML-safe attributes are generated using attr_safe(). |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head2 singleton( I [, I<\%attr> ] ) |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
Like start_tag() but includes the closing slash. |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=cut |
|
454
|
|
|
|
|
|
|
|
|
455
|
54
|
|
|
54
|
1
|
306
|
sub start_tag { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . ">" } |
|
456
|
53
|
|
|
53
|
1
|
59
|
sub end_tag { "" . tag_safe( $_[1] ) . ">" } |
|
457
|
0
|
|
|
0
|
1
|
0
|
sub singleton { "<" . tag_safe( $_[1] ) . $_[0]->attr_safe( $_[2] ) . "/>" } |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=pod |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=head2 tag_safe( I ) |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Create a valid XML tag name, escaping/omitting invalid characters. |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
Example: |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
my $tag = Search::Tools::XML->tag_safe( '1 * ! tag foo' ); |
|
468
|
|
|
|
|
|
|
# $tag == '______tag_foo' |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=cut |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
sub tag_safe { |
|
473
|
111
|
|
|
111
|
1
|
84
|
my $t = pop; |
|
474
|
|
|
|
|
|
|
|
|
475
|
111
|
50
|
|
|
|
124
|
return '_' unless length $t; |
|
476
|
|
|
|
|
|
|
|
|
477
|
111
|
|
|
|
|
82
|
$t =~ s/::/_/g; # single colons ok, but doubles are not |
|
478
|
111
|
|
|
|
|
86
|
$t =~ s/[^-\.\w:]/_/g; |
|
479
|
111
|
|
|
|
|
132
|
$t =~ s/^(\d)/_$1/; |
|
480
|
|
|
|
|
|
|
|
|
481
|
111
|
|
|
|
|
222
|
return $t; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=head2 attr_safe( I<\%attr> ) |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Returns stringified I<\%attr> as XML attributes. |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=cut |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub attr_safe { |
|
491
|
54
|
|
|
54
|
1
|
38
|
my $self = shift; |
|
492
|
54
|
|
|
|
|
45
|
my $attr = shift; |
|
493
|
54
|
100
|
|
|
|
117
|
return '' unless defined $attr; |
|
494
|
11
|
50
|
|
|
|
19
|
if ( ref $attr ne "HASH" ) { |
|
495
|
0
|
|
|
|
|
0
|
croak "attributes must be a hash ref"; |
|
496
|
|
|
|
|
|
|
} |
|
497
|
11
|
|
|
|
|
16
|
my @xml = (''); # force space at start in return |
|
498
|
11
|
|
|
|
|
26
|
for my $name ( sort keys %$attr ) { |
|
499
|
|
|
|
|
|
|
my $val = _escape_xml( $attr->{$name}, |
|
500
|
4
|
|
|
|
|
13
|
is_flagged_utf8( $attr->{$name} ) ); |
|
501
|
4
|
|
|
|
|
7
|
push @xml, tag_safe($name) . qq{="$val"}; |
|
502
|
|
|
|
|
|
|
} |
|
503
|
11
|
|
|
|
|
30
|
return join( ' ', @xml ); |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=pod |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 utf8_safe( I ) |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Return I with special XML chars and all |
|
511
|
|
|
|
|
|
|
non-ASCII chars converted to numeric entities. |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
This is escape() on steroids. B |
|
514
|
|
|
|
|
|
|
unless you know what you're doing. See the SYNOPSIS for an example. |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head2 escape_utf8 |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Alias for utf8_safe(). |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
*escape_utf8 = \&utf8_safe; |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
sub utf8_safe { |
|
525
|
27
|
|
|
27
|
1
|
263
|
my $t = pop; |
|
526
|
27
|
50
|
|
|
|
31
|
$t = '' unless defined $t; |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# converts all low chars except \t \n and \r |
|
529
|
|
|
|
|
|
|
# to space because XML spec disallows <32 |
|
530
|
27
|
|
|
|
|
39
|
$t =~ s,[\x00-\x08\x0b-\x0c\x0e-\x1f], ,g; |
|
531
|
|
|
|
|
|
|
|
|
532
|
27
|
|
|
|
|
26
|
$t =~ s{([^\x09\x0a\x0d\x20\x21\x23-\x25\x28-\x3b\x3d\x3F-\x5B\x5D-\x7E])} |
|
533
|
2
|
|
|
|
|
7
|
{''.(ord($1)).';'}eg; |
|
534
|
|
|
|
|
|
|
|
|
535
|
27
|
|
|
|
|
28
|
return $t; |
|
536
|
|
|
|
|
|
|
} |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=head2 no_html( I [, I] ) |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
no_html() is a brute-force method for removing all tags and entities |
|
541
|
|
|
|
|
|
|
from I. A simple regular expression is used, so things like |
|
542
|
|
|
|
|
|
|
nested comments and the like will probably break. If you really |
|
543
|
|
|
|
|
|
|
need to reliably filter out the tags and entities from a HTML text, use |
|
544
|
|
|
|
|
|
|
HTML::Parser or similar. |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
I is returned with no markup in it. |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
If I is true (defaults to false) then |
|
549
|
|
|
|
|
|
|
all whitespace is normalized away to ASCII space (U+0020). |
|
550
|
|
|
|
|
|
|
This can be helpful if you have Unicode entities representing |
|
551
|
|
|
|
|
|
|
line breaks or other layout instructions. |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=cut |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
sub no_html { |
|
556
|
10
|
|
|
10
|
1
|
279
|
my $class = shift; |
|
557
|
10
|
|
|
|
|
16
|
my $text = shift; |
|
558
|
10
|
|
100
|
|
|
57
|
my $normalize_whitespace = shift || 0; |
|
559
|
10
|
50
|
|
|
|
34
|
if ( !defined $text ) { |
|
560
|
0
|
|
|
|
|
0
|
croak "text required"; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
10
|
|
|
|
|
33
|
my $re = $class->tag_re; |
|
563
|
10
|
|
|
|
|
2378
|
$text =~ s,$re,,g; |
|
564
|
10
|
|
|
|
|
37
|
$text = $class->unescape($text); |
|
565
|
10
|
100
|
|
|
|
28
|
if ($normalize_whitespace) { |
|
566
|
1
|
|
|
|
|
8
|
$text =~ s/\s+/ /g; |
|
567
|
|
|
|
|
|
|
} |
|
568
|
10
|
|
|
|
|
45
|
return $text; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=head2 strip_html |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
An alias for no_html(). |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head2 strip_markup |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
An alias for no_html(). |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=cut |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
*strip_html = \&no_html; |
|
582
|
|
|
|
|
|
|
*strip_markup = \&no_html; |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
=head2 escape( I ) |
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
Similar to escape() functions in more famous CPAN modules, but without the |
|
587
|
|
|
|
|
|
|
added dependency. escape() will convert the special XML chars (><'"&) to their |
|
588
|
|
|
|
|
|
|
named entity equivalents. |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
The escaped I is returned. |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
B The API for this method has changed as of version 0.16. I |
|
593
|
|
|
|
|
|
|
is no longer modified in-place. |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
As of version 0.27 escape() is written in C/XS for speed. |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
=cut |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
sub escape { |
|
600
|
2
|
|
|
2
|
1
|
361
|
my $text = pop; |
|
601
|
2
|
50
|
|
|
|
4
|
return unless defined $text; |
|
602
|
2
|
|
|
|
|
8
|
return _escape_xml( $text, is_flagged_utf8($text) ); |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head2 unescape( I ) |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Similar to unescape() functions in more famous CPAN modules, but without the added |
|
608
|
|
|
|
|
|
|
dependency. unescape() will convert all entities to their chr() equivalents. |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
B unescape() does more than reverse the effects of escape(). It attempts |
|
611
|
|
|
|
|
|
|
to resolve B entities, not just the special XML entities (><'"&). |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
B The API for this method has changed as of version 0.16. |
|
614
|
|
|
|
|
|
|
I is no longer modified in-place. |
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=cut |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
sub unescape { |
|
619
|
11
|
|
|
11
|
1
|
250
|
my $text = pop; |
|
620
|
11
|
|
|
|
|
30
|
$text = unescape_named($text); |
|
621
|
11
|
|
|
|
|
31
|
$text = unescape_decimal($text); |
|
622
|
11
|
|
|
|
|
19
|
return $text; |
|
623
|
|
|
|
|
|
|
} |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
=head2 unescape_named( I ) |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
Replace all named HTML entities with their chr() equivalents. |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
Returns modified copy of I. |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=cut |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
sub unescape_named { |
|
634
|
13
|
|
|
13
|
1
|
18
|
my $t = pop; |
|
635
|
13
|
50
|
|
|
|
39
|
if ( defined($t) ) { |
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
# named entities - check first to see if it is worth looping |
|
638
|
13
|
100
|
|
|
|
54
|
if ( $t =~ m/&[a-zA-Z0-9]+;/ ) { |
|
639
|
7
|
|
|
|
|
294
|
for my $e ( keys %HTML_ents ) { |
|
640
|
1771
|
|
|
|
|
1257
|
my $dec = $HTML_ents{$e}; |
|
641
|
1771
|
100
|
|
|
|
16070
|
if ( my $n = $t =~ s/&$e;/chr($dec)/eg ) { |
|
|
163
|
|
|
|
|
303
|
|
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
#warn "replaced $e ($dec) -> $HTML_ents{$e} $n times in text"; |
|
644
|
|
|
|
|
|
|
} |
|
645
|
|
|
|
|
|
|
} |
|
646
|
|
|
|
|
|
|
} |
|
647
|
|
|
|
|
|
|
} |
|
648
|
13
|
|
|
|
|
80
|
return $t; |
|
649
|
|
|
|
|
|
|
} |
|
650
|
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=head2 unescape_decimal( I ) |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
Replace all decimal entities with their chr() equivalents. |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
Returns modified copy of I. |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=cut |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
sub unescape_decimal { |
|
660
|
11
|
|
|
11
|
1
|
16
|
my $t = pop; |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
# resolve numeric entities as best we can |
|
663
|
11
|
50
|
|
|
|
75
|
$t =~ s/(\d+);/chr($1)/ego if defined($t); |
|
|
5
|
|
|
|
|
18
|
|
|
664
|
11
|
|
|
|
|
20
|
return $t; |
|
665
|
|
|
|
|
|
|
} |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=head2 perl_to_xml( I[ [, I] ) ] |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
Similar to the XML::Simple XMLout() feature, perl_to_xml() |
|
670
|
|
|
|
|
|
|
will take a Perl data structure I[ and convert it to XML. ] |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
I should be a hashref with the following supported key/value pairs: |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=over |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item root I |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
The root element. If I is a string, it is used as the tag name. If |
|
679
|
|
|
|
|
|
|
I is a hashref, two keys are required: |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=over |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=item tag |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
String indicating the element name. |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item attrs |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
Hash ref of attribute key/value pairs (see start_tag()). |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=back |
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=item wrap_array I<1|0> |
|
694
|
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
If B is true (the default), arrayref items are wrapped |
|
696
|
|
|
|
|
|
|
in an additional XML tag, keeping the array items enclosed in a logical set. |
|
697
|
|
|
|
|
|
|
If B is false, each item in the array is treated individually. |
|
698
|
|
|
|
|
|
|
See B below for the naming convention for arrayref items. |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=item strip_plural I<1|0> |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
The B option interacts with the B option. |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
If B is a true value and not a CODE ref, |
|
705
|
|
|
|
|
|
|
any trailing C character will be stripped from the enclosing tag name |
|
706
|
|
|
|
|
|
|
whenever an array of hashrefs is found. Example: |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
my $data = { |
|
709
|
|
|
|
|
|
|
values => [ |
|
710
|
|
|
|
|
|
|
{ two => 2, |
|
711
|
|
|
|
|
|
|
three => 3, |
|
712
|
|
|
|
|
|
|
}, |
|
713
|
|
|
|
|
|
|
{ four => 4, |
|
714
|
|
|
|
|
|
|
five => 5, |
|
715
|
|
|
|
|
|
|
}, |
|
716
|
|
|
|
|
|
|
], |
|
717
|
|
|
|
|
|
|
}; |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
my $xml = $utils->perl_to_xml($data, { |
|
720
|
|
|
|
|
|
|
root => 'data', |
|
721
|
|
|
|
|
|
|
wrap_array => 1, |
|
722
|
|
|
|
|
|
|
strip_plural => 1, |
|
723
|
|
|
|
|
|
|
}); |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
# $xml DOM will look like: |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
3 |
|
731
|
|
|
|
|
|
|
2 |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
5 |
|
735
|
|
|
|
|
|
|
4 |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
Obviously stripping the final C will not always render sensical tag names. |
|
741
|
|
|
|
|
|
|
Pass a CODE ref instead, expecting one value (the tag name) and returning the |
|
742
|
|
|
|
|
|
|
tag name to use: |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
my $xml = $utils->perl_to_xml($data, { |
|
745
|
|
|
|
|
|
|
root => 'data', |
|
746
|
|
|
|
|
|
|
wrap_array => 1, |
|
747
|
|
|
|
|
|
|
strip_plural => sub { |
|
748
|
|
|
|
|
|
|
my $tag = shift; |
|
749
|
|
|
|
|
|
|
$tag =~ s/foo/BAR/; |
|
750
|
|
|
|
|
|
|
return $tag; |
|
751
|
|
|
|
|
|
|
}, |
|
752
|
|
|
|
|
|
|
}); |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=item escape I<1|0> |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
If B is false, strings within the B[ value will not be passed ] |
|
757
|
|
|
|
|
|
|
through escape(). Default is true. |
|
758
|
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=back |
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=cut |
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=head2 perl_to_xml( I[, I [, I ][, I] ) ] |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
This second usage is deprecated and here for backwards compatability only. |
|
766
|
|
|
|
|
|
|
Use the named key/value I instead. Readers of your code (including you!) will |
|
767
|
|
|
|
|
|
|
thank you. |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=cut |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
sub _make_singular { |
|
772
|
4
|
|
|
4
|
|
4
|
my ($t) = @_; |
|
773
|
4
|
|
|
|
|
8
|
$t =~ s/ies$/y/i; |
|
774
|
4
|
50
|
|
|
|
7
|
return $t if ( $t =~ s/ses$/s/i ); |
|
775
|
4
|
50
|
|
|
|
11
|
return $t if ( $t =~ /[aeiouy]ss$/i ); |
|
776
|
4
|
|
|
|
|
9
|
$t =~ s/s$//i; |
|
777
|
4
|
50
|
|
|
|
8
|
return length $t ? $t : $_[0]; |
|
778
|
|
|
|
|
|
|
} |
|
779
|
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
sub perl_to_xml { |
|
781
|
4
|
|
|
4
|
1
|
29
|
my $self = shift; |
|
782
|
4
|
|
|
|
|
4
|
my $perl = shift; |
|
783
|
|
|
|
|
|
|
|
|
784
|
4
|
|
|
|
|
5
|
my ( $root, $wrap_array, $strip_plural, $escape ); |
|
785
|
4
|
100
|
66
|
|
|
20
|
if ( ref $_[0] eq 'HASH' and !exists $_[0]->{tag} ) { |
|
786
|
2
|
|
|
|
|
3
|
my %opts = %{ $_[0] }; |
|
|
2
|
|
|
|
|
10
|
|
|
787
|
2
|
|
50
|
|
|
6
|
$root = delete $opts{root} || '_root'; |
|
788
|
2
|
|
100
|
|
|
7
|
$strip_plural = delete $opts{strip_plural} || 0; |
|
789
|
2
|
|
|
|
|
3
|
$wrap_array = delete $opts{wrap_array}; |
|
790
|
2
|
100
|
|
|
|
4
|
$wrap_array = 1 unless defined $wrap_array; |
|
791
|
2
|
|
|
|
|
3
|
$escape = delete $opts{escape}; |
|
792
|
2
|
100
|
|
|
|
5
|
$escape = 1 unless defined $escape; |
|
793
|
|
|
|
|
|
|
} |
|
794
|
|
|
|
|
|
|
else { |
|
795
|
2
|
|
50
|
|
|
4
|
$root = shift || '_root'; |
|
796
|
2
|
|
100
|
|
|
23
|
$strip_plural = shift || 0; |
|
797
|
2
|
|
|
|
|
2
|
$escape = shift; |
|
798
|
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
# backcompat means we need to reverse logic |
|
800
|
2
|
50
|
33
|
|
|
13
|
if ( defined $escape and $escape == 1 ) { |
|
|
|
50
|
33
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
801
|
0
|
|
|
|
|
0
|
$escape = 0; |
|
802
|
|
|
|
|
|
|
} |
|
803
|
|
|
|
|
|
|
elsif ( defined $escape and $escape == 0 ) { |
|
804
|
0
|
|
|
|
|
0
|
$escape = 1; |
|
805
|
|
|
|
|
|
|
} |
|
806
|
|
|
|
|
|
|
elsif ( !defined $escape ) { |
|
807
|
2
|
|
|
|
|
3
|
$escape = 1; |
|
808
|
|
|
|
|
|
|
} |
|
809
|
|
|
|
|
|
|
|
|
810
|
2
|
|
|
|
|
2
|
$wrap_array = 1; # old behavior |
|
811
|
|
|
|
|
|
|
} |
|
812
|
4
|
50
|
|
|
|
8
|
unless ( defined $perl ) { |
|
813
|
0
|
|
|
|
|
0
|
croak "perl data struct required"; |
|
814
|
|
|
|
|
|
|
} |
|
815
|
|
|
|
|
|
|
|
|
816
|
4
|
100
|
66
|
|
|
13
|
if ( $strip_plural and ref($strip_plural) ne 'CODE' ) { |
|
817
|
2
|
|
|
|
|
4
|
$strip_plural = \&_make_singular; |
|
818
|
|
|
|
|
|
|
} |
|
819
|
|
|
|
|
|
|
|
|
820
|
4
|
|
|
|
|
3
|
my ( $root_tag, $attrs ); |
|
821
|
4
|
100
|
|
|
|
7
|
if ( ref $root ) { |
|
822
|
1
|
50
|
|
|
|
3
|
$root_tag = delete $root->{tag} or croak 'tag key required in root'; |
|
823
|
1
|
50
|
|
|
|
3
|
$attrs = delete $root->{attrs} or croak 'attrs key required in root'; |
|
824
|
|
|
|
|
|
|
} |
|
825
|
|
|
|
|
|
|
else { |
|
826
|
3
|
|
|
|
|
2
|
$root_tag = $root; |
|
827
|
3
|
|
|
|
|
2
|
$attrs = {}; |
|
828
|
|
|
|
|
|
|
} |
|
829
|
|
|
|
|
|
|
|
|
830
|
4
|
50
|
|
|
|
9
|
if ( !ref $perl ) { |
|
831
|
|
|
|
|
|
|
return |
|
832
|
0
|
0
|
|
|
|
0
|
$self->start_tag( $root_tag, $attrs ) |
|
833
|
|
|
|
|
|
|
. ( $escape ? $self->utf8_safe($perl) : $perl ) |
|
834
|
|
|
|
|
|
|
. $self->end_tag($root_tag); |
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
|
|
837
|
4
|
|
|
|
|
8
|
my $xml = $self->start_tag( $root_tag, $attrs ); |
|
838
|
4
|
|
|
|
|
10
|
$self->_ref_to_xml( $perl, '', \$xml, $strip_plural, $escape, |
|
839
|
|
|
|
|
|
|
$wrap_array ); |
|
840
|
4
|
|
|
|
|
7
|
$xml .= $self->end_tag($root_tag); |
|
841
|
4
|
|
|
|
|
21
|
return $xml; |
|
842
|
|
|
|
|
|
|
} |
|
843
|
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
sub _ref_to_xml { |
|
845
|
58
|
|
|
58
|
|
81
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
|
846
|
|
|
|
|
|
|
= @_; |
|
847
|
58
|
|
|
|
|
55
|
my $type = ref $perl; |
|
848
|
58
|
100
|
|
|
|
89
|
if ( !$type ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
849
|
31
|
50
|
|
|
|
57
|
( $$xml_ref .= $self->start_tag($root) ) |
|
850
|
|
|
|
|
|
|
if length($root); |
|
851
|
31
|
100
|
|
|
|
55
|
$$xml_ref .= ( $escape ? $self->utf8_safe($perl) : $perl ); |
|
852
|
31
|
50
|
|
|
|
57
|
( $$xml_ref .= $self->end_tag($root) ) |
|
853
|
|
|
|
|
|
|
if length($root); |
|
854
|
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
|
856
|
|
|
|
|
|
|
} |
|
857
|
|
|
|
|
|
|
elsif ( $type eq 'SCALAR' ) { |
|
858
|
0
|
|
|
|
|
0
|
$self->_scalar_to_xml( $perl, $root, $xml_ref, $strip_plural, |
|
859
|
|
|
|
|
|
|
$escape, $wrap_array ); |
|
860
|
|
|
|
|
|
|
} |
|
861
|
|
|
|
|
|
|
elsif ( $type eq 'ARRAY' ) { |
|
862
|
7
|
|
|
|
|
13
|
$self->_array_to_xml( $perl, $root, $xml_ref, $strip_plural, |
|
863
|
|
|
|
|
|
|
$escape, $wrap_array ); |
|
864
|
|
|
|
|
|
|
} |
|
865
|
|
|
|
|
|
|
elsif ( $type eq 'HASH' ) { |
|
866
|
17
|
|
|
|
|
27
|
$self->_hash_to_xml( $perl, $root, $xml_ref, $strip_plural, $escape, |
|
867
|
|
|
|
|
|
|
$wrap_array ); |
|
868
|
|
|
|
|
|
|
} |
|
869
|
|
|
|
|
|
|
else { |
|
870
|
|
|
|
|
|
|
# assume blessed object, force it to stringify as a scalar |
|
871
|
3
|
|
|
|
|
28
|
$self->_scalar_to_xml( "$perl", $root, $xml_ref, $strip_plural, |
|
872
|
|
|
|
|
|
|
$escape, $wrap_array ); |
|
873
|
|
|
|
|
|
|
} |
|
874
|
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
} |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
sub _array_to_xml { |
|
878
|
7
|
|
|
7
|
|
6
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
|
879
|
|
|
|
|
|
|
= @_; |
|
880
|
7
|
|
|
|
|
10
|
for my $thing (@$perl) { |
|
881
|
25
|
100
|
66
|
|
|
110
|
if ( ref $thing |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
882
|
|
|
|
|
|
|
and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' ) |
|
883
|
|
|
|
|
|
|
and length($root) |
|
884
|
|
|
|
|
|
|
and $wrap_array ) |
|
885
|
|
|
|
|
|
|
{ |
|
886
|
|
|
|
|
|
|
#warn "<$root> ref $thing == " . ref($thing); |
|
887
|
8
|
|
|
|
|
11
|
$$xml_ref .= $self->start_tag($root); |
|
888
|
|
|
|
|
|
|
} |
|
889
|
25
|
|
|
|
|
36
|
$self->_ref_to_xml( $thing, $root, $xml_ref, $strip_plural, $escape, |
|
890
|
|
|
|
|
|
|
$wrap_array ); |
|
891
|
25
|
100
|
66
|
|
|
119
|
if ( ref $thing |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
892
|
|
|
|
|
|
|
and ( ref $thing eq 'ARRAY' or ref $thing eq 'HASH' ) |
|
893
|
|
|
|
|
|
|
and length($root) |
|
894
|
|
|
|
|
|
|
and $wrap_array ) |
|
895
|
|
|
|
|
|
|
{ |
|
896
|
|
|
|
|
|
|
#warn "$root> ref $thing == " . ref($thing); |
|
897
|
8
|
|
|
|
|
9
|
$$xml_ref .= $self->end_tag($root); |
|
898
|
|
|
|
|
|
|
} |
|
899
|
|
|
|
|
|
|
} |
|
900
|
|
|
|
|
|
|
} |
|
901
|
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
sub _hash_to_xml { |
|
903
|
17
|
|
|
17
|
|
15
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
|
904
|
|
|
|
|
|
|
= @_; |
|
905
|
17
|
|
|
|
|
30
|
for my $key ( keys %$perl ) { |
|
906
|
29
|
|
|
|
|
28
|
my $thing = $perl->{$key}; |
|
907
|
29
|
100
|
|
|
|
30
|
if ( ref $thing ) { |
|
908
|
8
|
|
|
|
|
7
|
my $key_to_pass = $key; |
|
909
|
8
|
|
|
|
|
5
|
my %attr; |
|
910
|
8
|
100
|
100
|
|
|
27
|
if ( ref $thing eq 'ARRAY' && $strip_plural ) { |
|
911
|
4
|
|
|
|
|
6
|
$key_to_pass = $strip_plural->($key_to_pass); |
|
912
|
4
|
|
|
|
|
6
|
$attr{count} = scalar @$thing; |
|
913
|
|
|
|
|
|
|
} |
|
914
|
8
|
100
|
100
|
|
|
26
|
if ( ref $thing ne 'ARRAY' or $wrap_array ) { |
|
915
|
6
|
|
|
|
|
9
|
$$xml_ref .= $self->start_tag( $key, \%attr ); |
|
916
|
|
|
|
|
|
|
} |
|
917
|
|
|
|
|
|
|
$self->_ref_to_xml( |
|
918
|
8
|
|
|
|
|
13
|
$thing, $key_to_pass, $xml_ref, |
|
919
|
|
|
|
|
|
|
$strip_plural, $escape, $wrap_array |
|
920
|
|
|
|
|
|
|
); |
|
921
|
8
|
100
|
100
|
|
|
32
|
if ( ref $thing ne 'ARRAY' or $wrap_array ) { |
|
922
|
6
|
|
|
|
|
7
|
$$xml_ref .= $self->end_tag($key); |
|
923
|
|
|
|
|
|
|
} |
|
924
|
|
|
|
|
|
|
|
|
925
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
|
926
|
|
|
|
|
|
|
} |
|
927
|
|
|
|
|
|
|
else { |
|
928
|
21
|
|
|
|
|
29
|
$self->_ref_to_xml( $thing, $key, $xml_ref, $strip_plural, |
|
929
|
|
|
|
|
|
|
$escape, $wrap_array ); |
|
930
|
|
|
|
|
|
|
} |
|
931
|
|
|
|
|
|
|
} |
|
932
|
|
|
|
|
|
|
} |
|
933
|
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
sub _scalar_to_xml { |
|
935
|
3
|
|
|
3
|
|
12
|
my ( $self, $perl, $root, $xml_ref, $strip_plural, $escape, $wrap_array ) |
|
936
|
|
|
|
|
|
|
= @_; |
|
937
|
3
|
100
|
|
|
|
10
|
$$xml_ref |
|
938
|
|
|
|
|
|
|
.= $self->start_tag($root) |
|
939
|
|
|
|
|
|
|
. ( $escape ? $self->utf8_safe($perl) : $perl ) |
|
940
|
|
|
|
|
|
|
. $self->end_tag($root); |
|
941
|
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
#$$xml_ref .= "\n"; # just for debugging |
|
943
|
|
|
|
|
|
|
} |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
=head2 tidy( I ) |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
Attempts to indent I correctly to make |
|
948
|
|
|
|
|
|
|
it more legible. |
|
949
|
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
Returns the I tidied up. |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
B This is an experimental feature. It might be |
|
953
|
|
|
|
|
|
|
really slow or eat your XML. You have been warned. |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=cut |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
sub tidy { |
|
958
|
0
|
|
|
0
|
1
|
|
my $xml = pop; |
|
959
|
0
|
|
|
|
|
|
my $level = 2; |
|
960
|
0
|
|
|
|
|
|
my $indent = 0; |
|
961
|
0
|
|
|
|
|
|
my @tidy = (); |
|
962
|
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
# normalize tag breaks |
|
964
|
0
|
|
|
|
|
|
$xml =~ s,>\s*<,>\n<,gs; |
|
965
|
|
|
|
|
|
|
|
|
966
|
0
|
|
|
|
|
|
my @xmlarr = split( m/\n/, $xml ); |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
# shift off declaration |
|
969
|
0
|
0
|
0
|
|
|
|
if ( scalar(@xmlarr) and $xmlarr[0] =~ m/^<\?\s*xml/ ) { |
|
970
|
0
|
|
|
|
|
|
push @tidy, shift(@xmlarr); |
|
971
|
|
|
|
|
|
|
} |
|
972
|
|
|
|
|
|
|
|
|
973
|
0
|
|
|
|
|
|
my $count = 0; |
|
974
|
0
|
|
|
|
|
|
for my $el (@xmlarr) { |
|
975
|
|
|
|
|
|
|
|
|
976
|
0
|
0
|
|
|
|
|
if ( $count == 1 ) { |
|
977
|
0
|
|
|
|
|
|
$indent = 2; |
|
978
|
|
|
|
|
|
|
} |
|
979
|
0
|
0
|
|
|
|
|
if ( $count == scalar(@xmlarr) - 1 ) { |
|
980
|
0
|
|
|
|
|
|
$indent = 0; |
|
981
|
|
|
|
|
|
|
} |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
#warn "el: $el\n"; |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
# singletons get special treatment |
|
986
|
0
|
0
|
|
|
|
|
if ( $el =~ m/^<([\w])+[^>]*\/>$/ ) { |
|
|
|
0
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
|
|
988
|
0
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
|
989
|
|
|
|
|
|
|
} |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# match opening tag |
|
992
|
|
|
|
|
|
|
elsif ( $el =~ m/^<([\w])+[^>]*>$/ ) { |
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
#warn "open $indent\n"; |
|
995
|
0
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
|
996
|
0
|
|
|
|
|
|
$indent += $level; |
|
997
|
|
|
|
|
|
|
} |
|
998
|
|
|
|
|
|
|
else { |
|
999
|
0
|
0
|
|
|
|
|
if ( $el =~ m/^<\// ) { |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
#warn "close $indent\n"; |
|
1002
|
0
|
|
|
|
|
|
$indent -= $level; # closing tag |
|
1003
|
|
|
|
|
|
|
} |
|
1004
|
0
|
0
|
|
|
|
|
if ( $indent < 0 ) { |
|
1005
|
0
|
|
|
|
|
|
$indent += $level; |
|
1006
|
|
|
|
|
|
|
} |
|
1007
|
0
|
|
|
|
|
|
push @tidy, ( ' ' x $indent ) . $el; |
|
1008
|
|
|
|
|
|
|
} |
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
#warn "indent = $indent\n"; |
|
1011
|
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
#Data::Dump::dump \@tidy; |
|
1013
|
0
|
|
|
|
|
|
$count++; |
|
1014
|
|
|
|
|
|
|
} |
|
1015
|
|
|
|
|
|
|
|
|
1016
|
0
|
|
|
|
|
|
return join( "\n", @tidy ); |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
} |
|
1019
|
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
1; |
|
1021
|
|
|
|
|
|
|
__END__ |