| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# $Id: Clean.pm,v 1.6 2003/09/21 14:04:37 petr Exp $ |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
XML::Clean - Ensure that I<(HTML)> text passes through an XML parser. |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use XML::Clean; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
print XML::Clean::clean ("barfoo"); |
|
12
|
|
|
|
|
|
|
# barfoo |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
print XML::Clean::clean ("barfoo",1.5); |
|
15
|
|
|
|
|
|
|
# |
|
16
|
|
|
|
|
|
|
# barfoo |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
print XML::Clean::clean ("bar bar",1.6,{root=>"XML_ROOT",encoding=>"ISO-8859-2"} ); |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
#
|
|
21
|
|
|
|
|
|
|
# bar bar |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The ultimate quest of this module is to produce from non-XML text |
|
26
|
|
|
|
|
|
|
text that will most probably pass through any XML parser one |
|
27
|
|
|
|
|
|
|
could find. |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Basic cleaning is just XML tag matching (for every opening tag there |
|
30
|
|
|
|
|
|
|
will be closing tag as well, and they will form a tree structure). |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
When you add some extra parameters, you will receive complete XML |
|
33
|
|
|
|
|
|
|
text, including XML head and root element (if none were defined in |
|
34
|
|
|
|
|
|
|
text, then some will be added). |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 FUNCTIONS AND METHODS |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=over 4 |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item XML::Clean::clean($text, [$version, [%options] ]) |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Return (almost) XML text, made from input parameter C<$text>. |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
When C<$version> is false, it only matches tags and escapes any unmatched |
|
46
|
|
|
|
|
|
|
tags. |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
When you pass C<$version> parameter, then text is checked for standard |
|
49
|
|
|
|
|
|
|
XML head (C<< <?xml ... ?> >>), and depending on options (force_root), some is |
|
50
|
|
|
|
|
|
|
added / existing is modified. Also depending on options, text is checked for |
|
51
|
|
|
|
|
|
|
root element. VERSION XML head parameter in output text is set to parameter |
|
52
|
|
|
|
|
|
|
value you pass. |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Options are: |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
encoding - String to be added as XML encoding attribute in XML header. Defaults |
|
57
|
|
|
|
|
|
|
to I<ISO-8859-1>. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
force_root - If true, output text will have XML root. Defaults to I<false>. |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
root - Output text will have that tag as root element. Defaults to |
|
62
|
|
|
|
|
|
|
I. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item clean_file $filename [$version [%options] ] |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Opens the file called C<$filename>, reads all text from it, passes it to clean |
|
67
|
|
|
|
|
|
|
with C<$version> and C<%options>, and writes the output text back to the file called |
|
68
|
|
|
|
|
|
|
C<$filename>. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Die on I/O error. |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=back |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 BUGS |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
This module is still under development. Not all XML errors are |
|
77
|
|
|
|
|
|
|
corrected with it. |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
It's otherwise too inefficient and slow :). |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head1 AUTHOR |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=for html |
|
84
|
|
|
|
|
|
|
petr@kubanek.net. Send any complaints, comments and so on there. |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head1 DISTRIBUTION |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=for html |
|
89
|
|
|
|
|
|
|
http://www.kubanek.net/xmlclean |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
package XML::Clean;

use strict;
use warnings;

use vars qw($VERSION @ISA @EXPORT);

# The package statement has to come before the version assignment: otherwise
# the assignment lands in $main::VERSION and XML::Clean->VERSION stays
# undefined.
BEGIN {
    # Derive "major.minor" (minor zero-padded to two digits) from the RCS
    # Revision keyword.
    $VERSION = do {
        my @r = (q$Revision: 1.6 $ =~ /\d+/g);
        sprintf "%d." . "%02d" x $#r, @r;
    };
}

require Exporter;
@ISA = qw(Exporter);
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Names of the elements that are currently open, outermost first.
my @stack;

# Characters that must not appear literally in XML character data, mapped to
# the entity reference that replaces them.
my %escapes = ( "<" => "&lt;", ">" => "&gt;", "&" => "&amp;" );

# Capturing alternation of the characters above; the matched character is
# available as $1 and is used as the lookup key into %escapes.
my $escapes_keys = "(" . join( "|", keys %escapes ) . ")";
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Helper that normalises the attribute part of a start tag so that every
# attribute comes out in name="value" form.
#
# Takes the raw text that followed the element name inside the tag.
# Returns a string that is either empty or consists of ' name="value"' pairs
# (each with a leading space), followed by "/" when the raw text ended in a
# slash (self-closing tag).  Anything that does not look like name=value is
# dropped.
sub clean_attr {
    my $attr = shift;
    return "" unless defined $attr and length $attr;

    my $ret = "";

    # Accept bare words, "double quoted" and 'single quoted' values; /s lets
    # a quoted value span several lines.
    while ($attr =~ m/([\w-]+)\s*=\s*([\w-]+|".*?"|'.*?')/gs) {
        my ($name, $val) = ($1, $2);

        # Remove the one pair of surrounding quotes, if any.
        $val =~ s/\A(["'])(.*)\1\z/$2/s;

        # A double quote can only survive inside a single-quoted value; it
        # has to be escaped because the output is always double quoted.
        $val =~ s/"/&quot;/g;

        $ret .= ' ' . $name . '="' . $val . '"';
    }

    # Keep the self-closing marker.
    $ret .= "/" if $attr =~ m#/$#;

    return $ret;
}
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Helper for start tags.
#
# Takes the element name and the raw attribute text.  Unless the tag is
# self-closing (attribute text ends in "/"), the element name is pushed on
# @stack so that a matching end tag can be produced later.
# Returns the start tag with its attributes normalised by clean_attr.
sub handle_start {
    my ($element, $attr) = @_;
    $attr = "" unless defined $attr;

    # A self-closing tag written without whitespace ("<br/>") arrives with
    # the slash glued to the element name.  Move it to the attribute text so
    # the element is neither pushed on the stack nor closed as "</br/>".
    if ($attr eq "" and $element =~ s{/\z}{}) {
        $attr = "/";
    }

    push @stack, $element unless $attr =~ m#/$#;

    return "<" . $element . clean_attr($attr) . ">";
}
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Helper for end tags.
#
# Takes the element name of an end tag.  Closes every open element from the
# top of @stack down to and including the innermost open element with that
# name, removes them from @stack, and returns the generated end tags as one
# string.
#
# Returns the empty string (the end tag is dropped) when nothing is open, or
# when the name matches no open element and more than one element is open.
# When exactly one element is open, a non-matching end tag closes it.
sub handle_end {
    my $element = shift;

    # Nothing is open, so there is nothing to close.
    return "" unless @stack;

    # Find the innermost open element with the requested name.
    my $match = -1;
    for my $i (reverse 0 .. $#stack) {
        if ($stack[$i] eq $element) {
            $match = $i;
            last;
        }
    }

    if ($match < 0) {
        # Stray end tag: emit nothing.  (Returning a true flag here would be
        # appended verbatim to the output by the caller.)
        return "" if @stack > 1;

        # A single open element is closed by any end tag.
        $match = 0;
    }

    # Close from the top of the stack down to the matched element.
    my @closing = reverse splice @stack, $match;

    return join "", map { "</$_>" } @closing;
}
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Helper for character data.
#
# Takes a piece of text and returns it with every character listed in
# %escapes replaced by its mapped value.  An undefined argument is returned
# unchanged.
sub handle_text {
    my ($chunk) = @_;

    return $chunk unless defined $chunk;

    # $escapes_keys captures the offending character in $1.
    $chunk =~ s#$escapes_keys#$escapes{$1}#exg;

    return $chunk;
}
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# Turn $text into (almost) well-formed XML.
#
# Parameters:
#   $text    - input text; undef is treated as the empty string.
#   $version - when true, the result is given an XML header carrying this
#              version unless the text already starts with a header that is
#              followed by a start tag.
#   options  - hash reference, or a flat key/value list, with the keys
#              root, encoding and force_root.
#
# Returns the cleaned text: tags are balanced through @stack, text between
# tags is escaped through handle_text, and every element still open at the
# end is closed.
sub clean {
    my $text    = shift;
    my $version = shift;

    # Accept both clean($text, $version, \%opts) and the list form
    # clean($text, $version, %opts).
    my $options;
    if (@_ and ref $_[0] eq 'HASH') {
        $options = shift;
    }
    elsif (@_ and @_ % 2 == 0) {
        $options = { @_ };
    }
    else {
        $options = {};
    }

    my $root     = $options->{root};
    my $encoding = $options->{encoding} || "ISO-8859-1";

    $text = "" unless defined $text;

    # XML declaration, optionally followed by a document type declaration.
    # NOTE(review): the DOCTYPE alternative is a reconstruction - confirm
    # against the upstream source.
    my $prolog_re = qr/<\?xml[^<>]*\?>\s*(?:<!DOCTYPE[^<>]*>)?\s*/;

    # Does the text already start with a prolog followed by a start tag?
    my $complete = ($text =~ m/\A$prolog_re<\w+[^<>]*>/) ? 1 : 0;

    # Detach an existing prolog before anything is prepended to the text, and
    # take it from the capture of this very match (never from $&, which may
    # still hold an unrelated earlier match).
    my $output = "";
    if ($text =~ s/\A($prolog_re)//) {
        $output = $1;
    }

    if ($version and not $complete) {
        $output = qq{<?xml version="$version" encoding="$encoding"?>\n};
        $text   = "<$root>\n" . $text if $root;
    }

    # Force a root element when asked to.  Without a root name there is
    # nothing sensible to wrap the text in, so the request is ignored.
    if ($version and $options->{force_root} and $root
        and $text !~ m/<\Q$root\E[^<>]*>/s) {
        $text = "<$root>\n" . $text;
    }

    @stack = ();

    # Walk the text tag by tag.  \G with /gc continues where the previous
    # match ended, so the remainder is not copied on every iteration.
    while ($text =~ m{\G(.*?)<(/?\w+.*?)>}gcs) {
        my ($before, $cont) = ($1, $2);

        $output .= handle_text($before);

        if ($cont =~ m{\A/(\S+)}) {
            # End tag: anything after the name is ignored.
            my $name = $1;
            $output .= handle_end($name);
        }
        else {
            # Start tag: the name runs up to the first whitespace, the rest
            # is the raw attribute text.
            my ($tag, $attr) = $cont =~ m{\A(\S+)(.*)\z}s;

            # "<br/>": move the slash from the name to the attribute text.
            if ($attr eq "" and $tag =~ s{/\z}{}) {
                $attr = "/";
            }

            $output .= handle_start($tag, $attr);
        }
    }

    # Whatever follows the last tag is plain text.
    $output .= handle_text(substr $text, pos($text) || 0);

    # Close everything that is still open, innermost element first.
    $output .= join "", map { "</$_>" } reverse @stack;

    return $output;
}
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Clean a file in place.
#
# Reads the whole of $filename, passes the text to clean together with
# $version (default "1.0") and the options, and writes the result back to
# the same file.
#
# Dies when the file cannot be opened, written, or closed after writing; a
# failing close after reading only produces a warning.  Returns 1.
sub clean_file {
    my ($filename, $version, @options) = @_;

    $version = "1.0" unless $version;

    open my $in, '<', $filename
        or die "Cannot open $filename for reading: $!";

    # Slurp the file; the record separator is changed for this read only so
    # that the caller's $/ is left untouched.
    my $text = do { local $/; <$in> };

    close $in
        or warn "Cannot close $filename after reading from it: $!";

    $text = clean($text, $version, @options);

    open my $out, '>', $filename
        or die "Cannot open $filename for writing: $!";

    print {$out} $text
        or die "Cannot write to $filename: $!";

    close $out
        or die "Cannot close $filename after writing to it: $!";

    return 1;
}
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
1; |
|
290
|
|
|
|
|
|
|
|