line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: Clean.pm,v 1.6 2003/09/21 14:04:37 petr Exp $ |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
XML::Clean - Ensure, that I<(HTML)> text pass throught an XML parser. |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use XML::Clean; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
print XML::Clean::clean ("barfoo"); |
12
|
|
|
|
|
|
|
# barfoo |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
print XML::Clean::clean ("barfoo",1.5); |
15
|
|
|
|
|
|
|
# |
16
|
|
|
|
|
|
|
# barfoo |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
print XML::Clean::clean ("bar bar",1.6,){root=>"XML_ROOT",encoding=>"ISO-8859-2"} ); |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
#
|
21
|
|
|
|
|
|
|
# bar bar |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The ultimate quest of this module is to produce from non-XML text |
26
|
|
|
|
|
|
|
text, that will will most probably pass throught any XML parser one |
27
|
|
|
|
|
|
|
could find. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Basic cleaning is just XML tag matching (for every opening tag there |
30
|
|
|
|
|
|
|
will be closing tag as well, and they will form a tree structure). |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
When you add some extra parameters, you will receive complete XML |
33
|
|
|
|
|
|
|
text, including XML head and root element (if none were defined in |
34
|
|
|
|
|
|
|
text, then some will be added). |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 FUNCTIONS AND METHODS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=over 4 |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item XML::Clean::clean($text, [$version, [%options] ]) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Return (almost) XML text, made from input parameter C<$text>. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
When C<$version> is false, only match tags, and escapes any unmatched |
46
|
|
|
|
|
|
|
tags. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
When you pass C<$version> parameter, then text is checked for standard |
49
|
|
|
|
|
|
|
XML head (), and depending on options (force_root), some is |
50
|
|
|
|
|
|
|
added / existing is modified. Also depending on options, text is checked for |
51
|
|
|
|
|
|
|
root element. VERSION XML head parameter in output text is set to parameter |
52
|
|
|
|
|
|
|
value you pass. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Options are: |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
encoding - String to be added as XML encoding attribute in XML header. Defaults |
57
|
|
|
|
|
|
|
to I. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
force_root - If true, output text will have XML root. Defaults to I. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
root - Output text will have that tag as root element. Defaults to |
62
|
|
|
|
|
|
|
I. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item clean_file $filename [$version [%options] ] |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Open file called C<$filename>, reads all text from it, pass it to clean |
67
|
|
|
|
|
|
|
with C<$version> and C<%options>, write output text to file called |
68
|
|
|
|
|
|
|
C<$filename>. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Die on I/O error. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=back |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 BUGS |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
This module is still under development. Not all XML errors are |
77
|
|
|
|
|
|
|
corrected with it. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Its otherwise too ineficient and slow:). |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head1 AUTHOR |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=for html |
84
|
|
|
|
|
|
|
petr@kubanek.net. Send there any complains, comments and so on. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head1 DISTRIBUTION |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=for html |
89
|
|
|
|
|
|
|
http://www.kubanek.net/xmlclean |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
BEGIN { |
94
|
1
|
|
|
1
|
|
1037
|
$VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
25
|
|
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
30
|
|
98
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
42
|
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
package XML::Clean; |
101
|
1
|
|
|
1
|
|
4
|
use vars qw(@ISA @EXPORT); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1383
|
|
102
|
|
|
|
|
|
|
require Exporter; |
103
|
|
|
|
|
|
|
@ISA =qw(Exporter); |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
my @stack; |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
my %escapes = ( "<" => "<", ">" => ">", "&" => "&" ) ; |
108
|
|
|
|
|
|
|
my $escapes_keys = "(" . join ( "|", keys %escapes ) . ")"; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# help routine to ensure, that xml attributes for tags are correct. |
111
|
|
|
|
|
|
|
# It means, they match variable="value" style |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub clean_attr { |
114
|
4
|
|
|
4
|
0
|
4
|
my $attr = shift; |
115
|
4
|
100
|
|
|
|
12
|
return "" unless $attr; |
116
|
3
|
|
|
|
|
4
|
my $ret; |
117
|
3
|
|
|
|
|
13
|
$ret = ""; |
118
|
|
|
|
|
|
|
# put to result only well-formed or almost-well formed values |
119
|
3
|
|
|
|
|
18
|
while ($attr =~ m/((?:\w|_|-)+)\s*=\s*((?:\w|\d|_|-)+|".*?")/g) { |
120
|
1
|
|
|
|
|
3
|
my $name=$1; |
121
|
1
|
|
|
|
|
3
|
my $val=$2; |
122
|
1
|
|
|
|
|
3
|
$val =~ s#(^["']|["']$)##g; |
123
|
1
|
|
|
|
|
6
|
$ret .= ' '.$name.'="'.$val.'"'; |
124
|
|
|
|
|
|
|
} |
125
|
3
|
50
|
|
|
|
10
|
$ret = $ret."/" if ($attr =~ m#/$#); |
126
|
3
|
|
|
|
|
5
|
return $ret; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# help routine to handle start tags. Check, if they aren't legal XML |
130
|
|
|
|
|
|
|
# tag (not ending with /), then push them to @stack. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub handle_start { |
133
|
4
|
|
|
4
|
0
|
6
|
my $element = shift; |
134
|
4
|
|
|
|
|
6
|
my $attr = shift; |
135
|
|
|
|
|
|
|
|
136
|
4
|
50
|
|
|
|
15
|
push @stack, $element unless ($attr =~ m#/$#); |
137
|
|
|
|
|
|
|
|
138
|
4
|
|
|
|
|
9
|
$attr = clean_attr $attr; |
139
|
|
|
|
|
|
|
|
140
|
4
|
|
|
|
|
15
|
return "<$element$attr>"; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# help routine to handel end tags. pop from @stack while it doesn't |
144
|
|
|
|
|
|
|
# find matching same end tag, write end tag to output, returns |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub handle_end { |
147
|
|
|
|
|
|
|
# exit, if empty |
148
|
0
|
0
|
|
0
|
0
|
0
|
return "" unless @stack; |
149
|
0
|
|
|
|
|
0
|
my $element = shift; |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
0
|
my $end_tags = ""; |
152
|
0
|
|
|
|
|
0
|
my @tmp_stack = @stack; |
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
0
|
my $end; |
155
|
|
|
|
|
|
|
|
156
|
0
|
|
0
|
|
|
0
|
do { |
157
|
0
|
|
|
|
|
0
|
$end = pop @tmp_stack; |
158
|
0
|
|
|
|
|
0
|
$end_tags .= "$end>"; |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
} until ($end eq $element) or ($#tmp_stack == -1); |
161
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
0
|
if (not(@tmp_stack) and (($#stack !=0) and ($stack[0] ne $element))) { |
|
|
|
0
|
|
|
|
|
163
|
0
|
|
|
|
|
0
|
return 1; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
@stack = @tmp_stack; |
167
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
0
|
return $end_tags; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub handle_text { |
172
|
7
|
|
|
7
|
0
|
12
|
my $element = shift; |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# escape our elements |
175
|
7
|
50
|
|
|
|
52
|
$element =~ s#$escapes_keys#$escapes{$1}#exg if defined $element; |
|
0
|
|
|
|
|
0
|
|
176
|
|
|
|
|
|
|
|
177
|
7
|
|
|
|
|
20
|
return $element; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub clean { |
181
|
|
|
|
|
|
|
|
182
|
3
|
|
|
3
|
1
|
115
|
my $text = shift; |
183
|
3
|
|
|
|
|
4
|
my $version = shift; |
184
|
3
|
|
|
|
|
4
|
my $options = shift; |
185
|
|
|
|
|
|
|
|
186
|
3
|
|
|
|
|
7
|
my $root = $$options{root}; |
187
|
3
|
|
|
|
|
4
|
my $encoding = $$options{encoding}; |
188
|
|
|
|
|
|
|
|
189
|
3
|
|
|
|
|
4
|
my $output = ""; |
190
|
|
|
|
|
|
|
|
191
|
3
|
100
|
|
|
|
10
|
$encoding = "ISO-8859-1" unless $encoding; |
192
|
|
|
|
|
|
|
|
193
|
3
|
100
|
|
|
|
8
|
if ($version) { |
194
|
|
|
|
|
|
|
# first, check for tag |
195
|
2
|
50
|
|
|
|
7
|
if ($text !~ m/^<\?xml[^<>]*\?>\s*(]*>)?\s*<\w+[^<>]*>/s ) { |
196
|
2
|
|
|
|
|
18
|
$output = "\n"; |
197
|
2
|
100
|
|
|
|
9
|
$text = "<$root>\n". $text if ($root); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# if there is something in $output, it must be
|
202
|
|
|
|
|
|
|
# version..> string |
203
|
|
|
|
|
|
|
|
204
|
3
|
50
|
|
|
|
10
|
$text =~ s/^<\?xml[^<>]*\?>\s*(]*>)?\s*//s if defined $text; |
205
|
3
|
100
|
|
|
|
9
|
$output = $& unless $output; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# if we are asked to produce full-correct text with root as root |
208
|
|
|
|
|
|
|
# element, then do it |
209
|
|
|
|
|
|
|
|
210
|
3
|
50
|
66
|
|
|
24
|
if (($version) and ($$options{force_root}) and |
|
|
|
33
|
|
|
|
|
211
|
|
|
|
|
|
|
($text !~ m/<$root[^<>]*>/s)) { |
212
|
0
|
|
|
|
|
0
|
$text = "<$root>\n". $text; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
3
|
|
|
|
|
7
|
undef @stack; |
216
|
|
|
|
|
|
|
|
217
|
3
|
50
|
|
|
|
8
|
if (defined $text) { |
218
|
3
|
|
|
|
|
22
|
while ($text =~ m#^(.*?)<(/?\w+.*?)>(.*)#s) { |
219
|
|
|
|
|
|
|
|
220
|
4
|
|
|
|
|
17
|
my ($bg, $cont, $en) = ($1, $2, $3); |
221
|
|
|
|
|
|
|
|
222
|
4
|
|
|
|
|
10
|
$output .= handle_text ($bg); |
223
|
|
|
|
|
|
|
|
224
|
4
|
50
|
|
|
|
15
|
if ($cont =~ /^\w+/s) { |
|
|
0
|
|
|
|
|
|
225
|
4
|
|
|
|
|
5
|
my ($tag, $attr); |
226
|
4
|
100
|
|
|
|
21
|
if ($cont =~ /(\w*?)\s(.*)/s) { |
227
|
3
|
|
|
|
|
10
|
($tag, $attr) = ($1, " ".$2); |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
else { |
230
|
1
|
|
|
|
|
4
|
($tag, $attr) = ($cont, ""); |
231
|
|
|
|
|
|
|
} |
232
|
4
|
|
|
|
|
12
|
$output .= handle_start ($tag, $attr); |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
elsif ($cont =~ m#^/\w+#s) { |
236
|
0
|
|
|
|
|
0
|
my ($tag, $attr); |
237
|
0
|
0
|
|
|
|
0
|
if ($cont =~ /^\/(\w*?)\s(.*)/s) { |
238
|
0
|
|
|
|
|
0
|
($tag, $attr) = ($1, " ".$2); |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
else { |
241
|
0
|
|
|
|
|
0
|
($tag, $attr) = ($cont, ""); |
242
|
0
|
|
|
|
|
0
|
$tag =~ s/^\///; |
243
|
|
|
|
|
|
|
} |
244
|
0
|
|
|
|
|
0
|
$output .= handle_end ($tag); |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
else { |
248
|
0
|
|
|
|
|
0
|
$output .= handle_text ("<$cont>"); |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
4
|
|
|
|
|
17
|
$text = $en; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
3
|
50
|
|
|
|
10
|
$output .= handle_text ($text) if defined $text; |
256
|
|
|
|
|
|
|
|
257
|
3
|
|
|
|
|
6
|
my $x; |
258
|
3
|
|
|
|
|
4
|
foreach $x (reverse @stack) { |
259
|
4
|
|
|
|
|
10
|
$output .= "$x>"; |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
3
|
|
|
|
|
12
|
return $output; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub clean_file { |
266
|
0
|
|
|
0
|
1
|
|
my $filename = shift; |
267
|
0
|
|
|
|
|
|
my $version = shift; |
268
|
0
|
|
|
|
|
|
my $options = shift; |
269
|
|
|
|
|
|
|
|
270
|
0
|
0
|
|
|
|
|
$version = "1.0" unless $version; |
271
|
|
|
|
|
|
|
|
272
|
0
|
0
|
|
|
|
|
open FILE, "<$filename" or die "Cannot open $filename for reading: $!"; |
273
|
|
|
|
|
|
|
|
274
|
0
|
|
|
|
|
|
undef $/; |
275
|
|
|
|
|
|
|
|
276
|
0
|
|
|
|
|
|
my $text = ; |
277
|
|
|
|
|
|
|
|
278
|
0
|
0
|
|
|
|
|
close FILE or print "Cannot close $filename after reading from it: $!"; |
279
|
|
|
|
|
|
|
|
280
|
0
|
|
|
|
|
|
$text = clean $text, $version, $options; |
281
|
|
|
|
|
|
|
|
282
|
0
|
0
|
|
|
|
|
open FILE, ">$filename" or die "Cannot open $filename for writing: $!"; |
283
|
|
|
|
|
|
|
|
284
|
0
|
|
|
|
|
|
print FILE $text; |
285
|
|
|
|
|
|
|
|
286
|
0
|
0
|
|
|
|
|
close FILE or die "Cannot close $filename after writing to it: $!"; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
1; |
290
|
|
|
|
|
|
|
|