| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HTML::HTML5::Sanity; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
67786
|
use 5.010; |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
87
|
|
|
4
|
2
|
|
|
2
|
|
13
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
78
|
|
|
5
|
2
|
|
|
2
|
|
12
|
use warnings; |
|
|
2
|
|
|
|
|
8
|
|
|
|
2
|
|
|
|
|
115
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
BEGIN { |
|
8
|
2
|
|
|
2
|
|
6
|
$HTML::HTML5::Sanity::AUTHORITY = 'cpan:TOBYINK'; |
|
9
|
2
|
|
|
|
|
399
|
$HTML::HTML5::Sanity::VERSION = '0.104'; |
|
10
|
|
|
|
|
|
|
} |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
require Exporter; |
|
13
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
14
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
|
15
|
|
|
|
|
|
|
'all' => [ qw(fix_document) ], |
|
16
|
|
|
|
|
|
|
'standard' => [ qw(fix_document) ], |
|
17
|
|
|
|
|
|
|
); |
|
18
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|
19
|
|
|
|
|
|
|
our @EXPORT = ( @{ $EXPORT_TAGS{'standard'} } ); |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $FIX_LANG_ATTRIBUTES = 1; |
|
22
|
|
|
|
|
|
|
|
|
23
|
2
|
|
|
2
|
|
2189
|
use Locale::Country qw(country_code2code LOCALE_CODE_ALPHA_2 LOCALE_CODE_NUMERIC); |
|
|
2
|
|
|
|
|
96342
|
|
|
|
2
|
|
|
|
|
159
|
|
|
24
|
2
|
|
|
2
|
|
1038
|
use XML::LibXML qw(:ns :libxml); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our $lang_3to2 = { |
|
27
|
|
|
|
|
|
|
'aar' => 'aa' , |
|
28
|
|
|
|
|
|
|
'abk' => 'ab' , |
|
29
|
|
|
|
|
|
|
'ave' => 'ae' , |
|
30
|
|
|
|
|
|
|
'afr' => 'af' , |
|
31
|
|
|
|
|
|
|
'aka' => 'ak' , |
|
32
|
|
|
|
|
|
|
'amh' => 'am' , |
|
33
|
|
|
|
|
|
|
'arg' => 'an' , |
|
34
|
|
|
|
|
|
|
'ara' => 'ar' , |
|
35
|
|
|
|
|
|
|
'asm' => 'as' , |
|
36
|
|
|
|
|
|
|
'ava' => 'av' , |
|
37
|
|
|
|
|
|
|
'aym' => 'ay' , |
|
38
|
|
|
|
|
|
|
'aze' => 'az' , |
|
39
|
|
|
|
|
|
|
'bak' => 'ba' , |
|
40
|
|
|
|
|
|
|
'bel' => 'be' , |
|
41
|
|
|
|
|
|
|
'bul' => 'bg' , |
|
42
|
|
|
|
|
|
|
'bih' => 'bh' , |
|
43
|
|
|
|
|
|
|
'bis' => 'bi' , |
|
44
|
|
|
|
|
|
|
'bam' => 'bm' , |
|
45
|
|
|
|
|
|
|
'ben' => 'bn' , |
|
46
|
|
|
|
|
|
|
'tib' => 'bo' , |
|
47
|
|
|
|
|
|
|
'bod' => 'bo' , |
|
48
|
|
|
|
|
|
|
'bre' => 'br' , |
|
49
|
|
|
|
|
|
|
'bos' => 'bs' , |
|
50
|
|
|
|
|
|
|
'cat' => 'ca' , |
|
51
|
|
|
|
|
|
|
'che' => 'ce' , |
|
52
|
|
|
|
|
|
|
'cha' => 'ch' , |
|
53
|
|
|
|
|
|
|
'cos' => 'co' , |
|
54
|
|
|
|
|
|
|
'cre' => 'cr' , |
|
55
|
|
|
|
|
|
|
'cze' => 'cs' , |
|
56
|
|
|
|
|
|
|
'ces' => 'cs' , |
|
57
|
|
|
|
|
|
|
'chu' => 'cu' , |
|
58
|
|
|
|
|
|
|
'chv' => 'cv' , |
|
59
|
|
|
|
|
|
|
'wel' => 'cy' , |
|
60
|
|
|
|
|
|
|
'cym' => 'cy' , |
|
61
|
|
|
|
|
|
|
'dan' => 'da' , |
|
62
|
|
|
|
|
|
|
'ger' => 'de' , |
|
63
|
|
|
|
|
|
|
'deu' => 'de' , |
|
64
|
|
|
|
|
|
|
'div' => 'dv' , |
|
65
|
|
|
|
|
|
|
'dzo' => 'dz' , |
|
66
|
|
|
|
|
|
|
'ewe' => 'ee' , |
|
67
|
|
|
|
|
|
|
'gre' => 'el' , |
|
68
|
|
|
|
|
|
|
'ell' => 'el' , |
|
69
|
|
|
|
|
|
|
'eng' => 'en' , |
|
70
|
|
|
|
|
|
|
'epo' => 'eo' , |
|
71
|
|
|
|
|
|
|
'spa' => 'es' , |
|
72
|
|
|
|
|
|
|
'est' => 'et' , |
|
73
|
|
|
|
|
|
|
'baq' => 'eu' , |
|
74
|
|
|
|
|
|
|
'eus' => 'eu' , |
|
75
|
|
|
|
|
|
|
'per' => 'fa' , |
|
76
|
|
|
|
|
|
|
'fas' => 'fa' , |
|
77
|
|
|
|
|
|
|
'ful' => 'ff' , |
|
78
|
|
|
|
|
|
|
'fin' => 'fi' , |
|
79
|
|
|
|
|
|
|
'fij' => 'fj' , |
|
80
|
|
|
|
|
|
|
'fao' => 'fo' , |
|
81
|
|
|
|
|
|
|
'fre' => 'fr' , |
|
82
|
|
|
|
|
|
|
'fra' => 'fr' , |
|
83
|
|
|
|
|
|
|
'fry' => 'fy' , |
|
84
|
|
|
|
|
|
|
'gle' => 'ga' , |
|
85
|
|
|
|
|
|
|
'gla' => 'gd' , |
|
86
|
|
|
|
|
|
|
'glg' => 'gl' , |
|
87
|
|
|
|
|
|
|
'grn' => 'gn' , |
|
88
|
|
|
|
|
|
|
'guj' => 'gu' , |
|
89
|
|
|
|
|
|
|
'glv' => 'gv' , |
|
90
|
|
|
|
|
|
|
'hau' => 'ha' , |
|
91
|
|
|
|
|
|
|
'heb' => 'he' , |
|
92
|
|
|
|
|
|
|
'hin' => 'hi' , |
|
93
|
|
|
|
|
|
|
'hmo' => 'ho' , |
|
94
|
|
|
|
|
|
|
'hrv' => 'hr' , |
|
95
|
|
|
|
|
|
|
'hat' => 'ht' , |
|
96
|
|
|
|
|
|
|
'hat' => 'ht' , |
|
97
|
|
|
|
|
|
|
'hun' => 'hu' , |
|
98
|
|
|
|
|
|
|
'arm' => 'hy' , |
|
99
|
|
|
|
|
|
|
'hye' => 'hy' , |
|
100
|
|
|
|
|
|
|
'her' => 'hz' , |
|
101
|
|
|
|
|
|
|
'ina' => 'ia' , |
|
102
|
|
|
|
|
|
|
'ind' => 'id' , |
|
103
|
|
|
|
|
|
|
'ile' => 'ie' , |
|
104
|
|
|
|
|
|
|
'ibo' => 'ig' , |
|
105
|
|
|
|
|
|
|
'iii' => 'ii' , |
|
106
|
|
|
|
|
|
|
'ipk' => 'ik' , |
|
107
|
|
|
|
|
|
|
'ido' => 'io' , |
|
108
|
|
|
|
|
|
|
'ice' => 'is' , |
|
109
|
|
|
|
|
|
|
'isl' => 'is' , |
|
110
|
|
|
|
|
|
|
'ita' => 'it' , |
|
111
|
|
|
|
|
|
|
'iku' => 'iu' , |
|
112
|
|
|
|
|
|
|
'jpn' => 'ja' , |
|
113
|
|
|
|
|
|
|
'jav' => 'jv' , |
|
114
|
|
|
|
|
|
|
'geo' => 'ka' , |
|
115
|
|
|
|
|
|
|
'kat' => 'ka' , |
|
116
|
|
|
|
|
|
|
'kon' => 'kg' , |
|
117
|
|
|
|
|
|
|
'kik' => 'ki' , |
|
118
|
|
|
|
|
|
|
'kik' => 'ki' , |
|
119
|
|
|
|
|
|
|
'kua' => 'kj' , |
|
120
|
|
|
|
|
|
|
'kaz' => 'kk' , |
|
121
|
|
|
|
|
|
|
'kal' => 'kl' , |
|
122
|
|
|
|
|
|
|
'khm' => 'km' , |
|
123
|
|
|
|
|
|
|
'kan' => 'kn' , |
|
124
|
|
|
|
|
|
|
'kor' => 'ko' , |
|
125
|
|
|
|
|
|
|
'kau' => 'kr' , |
|
126
|
|
|
|
|
|
|
'kas' => 'ks' , |
|
127
|
|
|
|
|
|
|
'kur' => 'ku' , |
|
128
|
|
|
|
|
|
|
'kom' => 'kv' , |
|
129
|
|
|
|
|
|
|
'cor' => 'kw' , |
|
130
|
|
|
|
|
|
|
'kir' => 'ky' , |
|
131
|
|
|
|
|
|
|
'lat' => 'la' , |
|
132
|
|
|
|
|
|
|
'ltz' => 'lb' , |
|
133
|
|
|
|
|
|
|
'ltz' => 'lb' , |
|
134
|
|
|
|
|
|
|
'lug' => 'lg' , |
|
135
|
|
|
|
|
|
|
'lim' => 'li' , |
|
136
|
|
|
|
|
|
|
'lin' => 'ln' , |
|
137
|
|
|
|
|
|
|
'lao' => 'lo' , |
|
138
|
|
|
|
|
|
|
'lit' => 'lt' , |
|
139
|
|
|
|
|
|
|
'lub' => 'lu' , |
|
140
|
|
|
|
|
|
|
'lav' => 'lv' , |
|
141
|
|
|
|
|
|
|
'mlg' => 'mg' , |
|
142
|
|
|
|
|
|
|
'mah' => 'mh' , |
|
143
|
|
|
|
|
|
|
'mao' => 'mi' , |
|
144
|
|
|
|
|
|
|
'mri' => 'mi' , |
|
145
|
|
|
|
|
|
|
'mac' => 'mk' , |
|
146
|
|
|
|
|
|
|
'mkd' => 'mk' , |
|
147
|
|
|
|
|
|
|
'mal' => 'ml' , |
|
148
|
|
|
|
|
|
|
'mon' => 'mn' , |
|
149
|
|
|
|
|
|
|
'mar' => 'mr' , |
|
150
|
|
|
|
|
|
|
'may' => 'ms' , |
|
151
|
|
|
|
|
|
|
'msa' => 'ms' , |
|
152
|
|
|
|
|
|
|
'mlt' => 'mt' , |
|
153
|
|
|
|
|
|
|
'bur' => 'my' , |
|
154
|
|
|
|
|
|
|
'mya' => 'my' , |
|
155
|
|
|
|
|
|
|
'nau' => 'na' , |
|
156
|
|
|
|
|
|
|
'nob' => 'nb' , |
|
157
|
|
|
|
|
|
|
'nde' => 'nd' , |
|
158
|
|
|
|
|
|
|
'nep' => 'ne' , |
|
159
|
|
|
|
|
|
|
'ndo' => 'ng' , |
|
160
|
|
|
|
|
|
|
'dut' => 'nl' , |
|
161
|
|
|
|
|
|
|
'nld' => 'nl' , |
|
162
|
|
|
|
|
|
|
'nno' => 'nn' , |
|
163
|
|
|
|
|
|
|
'nor' => 'no' , |
|
164
|
|
|
|
|
|
|
'nbl' => 'nr' , |
|
165
|
|
|
|
|
|
|
'nav' => 'nv' , |
|
166
|
|
|
|
|
|
|
'nya' => 'ny' , |
|
167
|
|
|
|
|
|
|
'oci' => 'oc' , |
|
168
|
|
|
|
|
|
|
'oji' => 'oj' , |
|
169
|
|
|
|
|
|
|
'orm' => 'om' , |
|
170
|
|
|
|
|
|
|
'ori' => 'or' , |
|
171
|
|
|
|
|
|
|
'oss' => 'os' , |
|
172
|
|
|
|
|
|
|
'pan' => 'pa' , |
|
173
|
|
|
|
|
|
|
'pli' => 'pi' , |
|
174
|
|
|
|
|
|
|
'pol' => 'pl' , |
|
175
|
|
|
|
|
|
|
'pus' => 'ps' , |
|
176
|
|
|
|
|
|
|
'por' => 'pt' , |
|
177
|
|
|
|
|
|
|
'que' => 'qu' , |
|
178
|
|
|
|
|
|
|
'roh' => 'rm' , |
|
179
|
|
|
|
|
|
|
'run' => 'rn' , |
|
180
|
|
|
|
|
|
|
'rum' => 'ro' , |
|
181
|
|
|
|
|
|
|
'ron' => 'ro' , |
|
182
|
|
|
|
|
|
|
'rus' => 'ru' , |
|
183
|
|
|
|
|
|
|
'kin' => 'rw' , |
|
184
|
|
|
|
|
|
|
'san' => 'sa' , |
|
185
|
|
|
|
|
|
|
'srd' => 'sc' , |
|
186
|
|
|
|
|
|
|
'snd' => 'sd' , |
|
187
|
|
|
|
|
|
|
'sme' => 'se' , |
|
188
|
|
|
|
|
|
|
'sag' => 'sg' , |
|
189
|
|
|
|
|
|
|
'sin' => 'si' , |
|
190
|
|
|
|
|
|
|
'slo' => 'sk' , |
|
191
|
|
|
|
|
|
|
'slk' => 'sk' , |
|
192
|
|
|
|
|
|
|
'slv' => 'sl' , |
|
193
|
|
|
|
|
|
|
'smo' => 'sm' , |
|
194
|
|
|
|
|
|
|
'sna' => 'sn' , |
|
195
|
|
|
|
|
|
|
'som' => 'so' , |
|
196
|
|
|
|
|
|
|
'alb' => 'sq' , |
|
197
|
|
|
|
|
|
|
'sqi' => 'sq' , |
|
198
|
|
|
|
|
|
|
'srp' => 'sr' , |
|
199
|
|
|
|
|
|
|
'ssw' => 'ss' , |
|
200
|
|
|
|
|
|
|
'sot' => 'st' , |
|
201
|
|
|
|
|
|
|
'sun' => 'su' , |
|
202
|
|
|
|
|
|
|
'swe' => 'sv' , |
|
203
|
|
|
|
|
|
|
'swa' => 'sw' , |
|
204
|
|
|
|
|
|
|
'tam' => 'ta' , |
|
205
|
|
|
|
|
|
|
'tel' => 'te' , |
|
206
|
|
|
|
|
|
|
'tgk' => 'tg' , |
|
207
|
|
|
|
|
|
|
'tha' => 'th' , |
|
208
|
|
|
|
|
|
|
'tir' => 'ti' , |
|
209
|
|
|
|
|
|
|
'tuk' => 'tk' , |
|
210
|
|
|
|
|
|
|
'tgl' => 'tl' , |
|
211
|
|
|
|
|
|
|
'tsn' => 'tn' , |
|
212
|
|
|
|
|
|
|
'ton' => 'to' , |
|
213
|
|
|
|
|
|
|
'tur' => 'tr' , |
|
214
|
|
|
|
|
|
|
'tso' => 'ts' , |
|
215
|
|
|
|
|
|
|
'tat' => 'tt' , |
|
216
|
|
|
|
|
|
|
'twi' => 'tw' , |
|
217
|
|
|
|
|
|
|
'tah' => 'ty' , |
|
218
|
|
|
|
|
|
|
'uig' => 'ug' , |
|
219
|
|
|
|
|
|
|
'ukr' => 'uk' , |
|
220
|
|
|
|
|
|
|
'urd' => 'ur' , |
|
221
|
|
|
|
|
|
|
'uzb' => 'uz' , |
|
222
|
|
|
|
|
|
|
'ven' => 've' , |
|
223
|
|
|
|
|
|
|
'vie' => 'vi' , |
|
224
|
|
|
|
|
|
|
'vol' => 'vo' , |
|
225
|
|
|
|
|
|
|
'wln' => 'wa' , |
|
226
|
|
|
|
|
|
|
'wol' => 'wo' , |
|
227
|
|
|
|
|
|
|
'xho' => 'xh' , |
|
228
|
|
|
|
|
|
|
'yid' => 'yi' , |
|
229
|
|
|
|
|
|
|
'yor' => 'yo' , |
|
230
|
|
|
|
|
|
|
'zha' => 'za' , |
|
231
|
|
|
|
|
|
|
'chi' => 'zh' , |
|
232
|
|
|
|
|
|
|
'zho' => 'zh' , |
|
233
|
|
|
|
|
|
|
'zul' => 'zu' , |
|
234
|
|
|
|
|
|
|
}; |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
our $lang_grandfather = { |
|
237
|
|
|
|
|
|
|
'art-lojban' => 'jbo', |
|
238
|
|
|
|
|
|
|
'i-ami' => 'ami', |
|
239
|
|
|
|
|
|
|
'i-bnn' => 'bnn', |
|
240
|
|
|
|
|
|
|
'i-hak' => 'hak', |
|
241
|
|
|
|
|
|
|
'i-klingon' => 'tlh', |
|
242
|
|
|
|
|
|
|
'i-lux' => 'lb', |
|
243
|
|
|
|
|
|
|
'i-navajo' => 'nv', |
|
244
|
|
|
|
|
|
|
'i-pwn' => 'pwn', #haha |
|
245
|
|
|
|
|
|
|
'i-tao' => 'tao', |
|
246
|
|
|
|
|
|
|
'i-tay' => 'tay', |
|
247
|
|
|
|
|
|
|
'i-tsu' => 'tsu', |
|
248
|
|
|
|
|
|
|
'no-bok' => 'nb', |
|
249
|
|
|
|
|
|
|
'no-nyn' => 'nn', |
|
250
|
|
|
|
|
|
|
'sgn-be-fr' => 'sfb', |
|
251
|
|
|
|
|
|
|
'sgn-be-nl' => 'vgt', |
|
252
|
|
|
|
|
|
|
'sgn-ch-de' => 'sgg', |
|
253
|
|
|
|
|
|
|
'zh-guoyu' => 'cmn', |
|
254
|
|
|
|
|
|
|
'zh-hakka' => 'hak', |
|
255
|
|
|
|
|
|
|
'zh-min-nan' => 'nan', |
|
256
|
|
|
|
|
|
|
'zh-xiang' => 'hsn', |
|
257
|
|
|
|
|
|
|
}; |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
our $obsolete_iso3166 = { |
|
260
|
|
|
|
|
|
|
'UK' => 'GB', # Exceptionally reserved |
|
261
|
|
|
|
|
|
|
'FX' => 'FR', # Exceptionally reserved |
|
262
|
|
|
|
|
|
|
'ZR' => 'CD', # Zaire => Congo |
|
263
|
|
|
|
|
|
|
'HV' => 'BF', # Upper Volta => Burkina Faso |
|
264
|
|
|
|
|
|
|
'DY' => 'BJ', # Dahomey => Benin |
|
265
|
|
|
|
|
|
|
'BU' => 'MM', # Burma => Myanmar |
|
266
|
|
|
|
|
|
|
'TP' => 'TL', # East Timor => Timor-Leste |
|
267
|
|
|
|
|
|
|
'NH' => 'VU', # New Hebrides => Vanuatu |
|
268
|
|
|
|
|
|
|
'RH' => 'ZW', # Rhodesia => Zimbabwe |
|
269
|
|
|
|
|
|
|
}; |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
our $canon_lang = {}; |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
sub fix_document |
|
274
|
|
|
|
|
|
|
{ |
|
275
|
|
|
|
|
|
|
my $old_document = shift; |
|
276
|
|
|
|
|
|
|
my $attribute_behaviour = shift || 0; |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
my $new_document = XML::LibXML::Document->new; |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
my $new_root = fix_element( |
|
281
|
|
|
|
|
|
|
$old_document->documentElement, |
|
282
|
|
|
|
|
|
|
$new_document, |
|
283
|
|
|
|
|
|
|
{ ':' => 'http://www.w3.org/1999/xhtml', 'xml' => XML_XML_NS } |
|
284
|
|
|
|
|
|
|
); |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$new_document->setDocumentElement($new_root); |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
return $new_document; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub fix_element |
|
292
|
|
|
|
|
|
|
{ |
|
293
|
|
|
|
|
|
|
my $old_element = shift; |
|
294
|
|
|
|
|
|
|
my $new_document = shift; |
|
295
|
|
|
|
|
|
|
my $parent_declarations = shift; |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
my $declared_namespaces = {}; |
|
298
|
|
|
|
|
|
|
foreach my $k (keys %{$parent_declarations}) |
|
299
|
|
|
|
|
|
|
{ |
|
300
|
|
|
|
|
|
|
$declared_namespaces->{$k} = $parent_declarations->{$k}; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Process namespace declarations on this element. |
|
304
|
|
|
|
|
|
|
foreach my $attr ($old_element->attributes) |
|
305
|
|
|
|
|
|
|
{ |
|
306
|
|
|
|
|
|
|
next if $attr->nodeType == XML_NAMESPACE_DECL; |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
if ($attr->nodeName =~ /^xmlns:(.*)$/) |
|
309
|
|
|
|
|
|
|
{ |
|
310
|
|
|
|
|
|
|
my $prefix = $1; |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
if ($prefix eq 'xml' && $attr->getData eq XML_XML_NS) |
|
313
|
|
|
|
|
|
|
{ |
|
314
|
|
|
|
|
|
|
# that's OK. |
|
315
|
|
|
|
|
|
|
} |
|
316
|
|
|
|
|
|
|
elsif ($prefix eq 'xml' || $attr->getData eq XML_XML_NS) |
|
317
|
|
|
|
|
|
|
{ |
|
318
|
|
|
|
|
|
|
next; |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
elsif ($prefix eq 'xmlns' || $attr->getData eq XML_XMLNS_NS) |
|
321
|
|
|
|
|
|
|
{ |
|
322
|
|
|
|
|
|
|
next; |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
$declared_namespaces->{ $prefix } = $attr->getData; |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
} |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Process any default XML Namespace |
|
330
|
|
|
|
|
|
|
my $hasExplicit = 0; |
|
331
|
|
|
|
|
|
|
if ($old_element->hasAttributeNS(undef, 'xmlns')) |
|
332
|
|
|
|
|
|
|
{ |
|
333
|
|
|
|
|
|
|
$hasExplicit = 1; |
|
334
|
|
|
|
|
|
|
$declared_namespaces->{ ':' } = $old_element->getAttributeNS(undef, 'xmlns'); |
|
335
|
|
|
|
|
|
|
} |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# Create a new element. |
|
338
|
|
|
|
|
|
|
my $new_element; |
|
339
|
|
|
|
|
|
|
if ($hasExplicit) |
|
340
|
|
|
|
|
|
|
{ |
|
341
|
|
|
|
|
|
|
$new_element = $new_document->createElementNS( |
|
342
|
|
|
|
|
|
|
$declared_namespaces->{ ':' }, |
|
343
|
|
|
|
|
|
|
$old_element->nodeName, |
|
344
|
|
|
|
|
|
|
); |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
else |
|
347
|
|
|
|
|
|
|
{ |
|
348
|
|
|
|
|
|
|
my $tag = $old_element->nodeName; |
|
349
|
|
|
|
|
|
|
if ($tag =~ /^([^:]+)\:([^:]+)$/) |
|
350
|
|
|
|
|
|
|
{ |
|
351
|
|
|
|
|
|
|
my $ns_prefix = $1; |
|
352
|
|
|
|
|
|
|
my $localname = $2; |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
if (defined $declared_namespaces->{$ns_prefix}) |
|
355
|
|
|
|
|
|
|
{ |
|
356
|
|
|
|
|
|
|
$new_element = $new_document->createElementNS( |
|
357
|
|
|
|
|
|
|
$declared_namespaces->{$ns_prefix}, $tag); |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
unless ($new_element) |
|
361
|
|
|
|
|
|
|
{ |
|
362
|
|
|
|
|
|
|
$new_element = $new_document->createElementNS( |
|
363
|
|
|
|
|
|
|
$declared_namespaces->{ ':' }, $tag); |
|
364
|
|
|
|
|
|
|
} |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# Add attributes to new element. |
|
368
|
|
|
|
|
|
|
foreach my $old_attr ($old_element->attributes) |
|
369
|
|
|
|
|
|
|
{ |
|
370
|
|
|
|
|
|
|
next if $old_attr->nodeType == XML_NAMESPACE_DECL; |
|
371
|
|
|
|
|
|
|
# next if $old_attr->nodeName =~ /^xmlns(:.*)?$/; |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
fix_attribute($old_attr, $new_element, $declared_namespaces); |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# Process child nodes. |
|
377
|
|
|
|
|
|
|
foreach my $old_kid ($old_element->childNodes) |
|
378
|
|
|
|
|
|
|
{ |
|
379
|
|
|
|
|
|
|
if ($old_kid->nodeType == XML_TEXT_NODE |
|
380
|
|
|
|
|
|
|
|| $old_kid->nodeType == XML_CDATA_SECTION_NODE) |
|
381
|
|
|
|
|
|
|
{ |
|
382
|
|
|
|
|
|
|
$new_element->appendTextNode($old_kid->nodeValue); |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
elsif ($old_kid->nodeType == XML_COMMENT_NODE) |
|
385
|
|
|
|
|
|
|
{ |
|
386
|
|
|
|
|
|
|
$new_element->appendChild( |
|
387
|
|
|
|
|
|
|
$new_document->createComment($old_kid->nodeValue) |
|
388
|
|
|
|
|
|
|
); |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
elsif ($old_kid->nodeType == XML_ELEMENT_NODE) |
|
391
|
|
|
|
|
|
|
{ |
|
392
|
|
|
|
|
|
|
$new_element->appendChild( |
|
393
|
|
|
|
|
|
|
fix_element($old_kid, $new_document, $declared_namespaces) |
|
394
|
|
|
|
|
|
|
); |
|
395
|
|
|
|
|
|
|
} |
|
396
|
|
|
|
|
|
|
} |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
return $new_element; |
|
399
|
|
|
|
|
|
|
} |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
sub fix_attribute |
|
402
|
|
|
|
|
|
|
{ |
|
403
|
|
|
|
|
|
|
my $old_attribute = shift; |
|
404
|
|
|
|
|
|
|
my $new_element = shift; |
|
405
|
|
|
|
|
|
|
my $declared_namespaces = shift; |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
my $name = $old_attribute->nodeName; |
|
408
|
|
|
|
|
|
|
my @new_attribute; |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
if ($name =~ /^([^:]+)\:([^:]+)$/) |
|
411
|
|
|
|
|
|
|
{ |
|
412
|
|
|
|
|
|
|
my $ns_prefix = $1; |
|
413
|
|
|
|
|
|
|
my $localname = $2; |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
if (defined $declared_namespaces->{$ns_prefix}) |
|
416
|
|
|
|
|
|
|
{ |
|
417
|
|
|
|
|
|
|
@new_attribute = ( |
|
418
|
|
|
|
|
|
|
$declared_namespaces->{$ns_prefix}, |
|
419
|
|
|
|
|
|
|
sprintf("%s:%s", $ns_prefix, $localname), |
|
420
|
|
|
|
|
|
|
); |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
my $node_value = $old_attribute->nodeValue; |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
if ($FIX_LANG_ATTRIBUTES && $name =~ /^(xml:)?lang$/i) |
|
427
|
|
|
|
|
|
|
{ |
|
428
|
|
|
|
|
|
|
return undef |
|
429
|
|
|
|
|
|
|
unless _valid_lang($node_value); |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
if ($FIX_LANG_ATTRIBUTES == 2) |
|
432
|
|
|
|
|
|
|
{ |
|
433
|
|
|
|
|
|
|
$node_value = _canon_lang($node_value); |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
} |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
if (@new_attribute) |
|
438
|
|
|
|
|
|
|
{ |
|
439
|
|
|
|
|
|
|
$new_element->setAttributeNS(@new_attribute, $node_value); |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
else |
|
442
|
|
|
|
|
|
|
{ |
|
443
|
|
|
|
|
|
|
$new_element->setAttribute($name, $node_value); |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
return undef; |
|
447
|
|
|
|
|
|
|
} |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
sub _valid_lang |
|
450
|
|
|
|
|
|
|
{ |
|
451
|
|
|
|
|
|
|
my $value_to_test = shift; |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
return 1 if (defined $value_to_test) && ($value_to_test eq ''); |
|
454
|
|
|
|
|
|
|
return 0 unless defined $value_to_test; |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
# Regex for recognizing RFC 4646 well-formed tags |
|
457
|
|
|
|
|
|
|
# http://www.rfc-editor.org/rfc/rfc4646.txt |
|
458
|
|
|
|
|
|
|
# http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21 |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# The structure requires no forward references, so it reverses the order. |
|
461
|
|
|
|
|
|
|
# It uses Java/Perl syntax instead of the old ABNF |
|
462
|
|
|
|
|
|
|
# The uppercase comments are fragments copied from RFC 4646 |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped. |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
my $alpha = '[a-z]'; # ALPHA |
|
467
|
|
|
|
|
|
|
my $digit = '[0-9]'; # DIGIT |
|
468
|
|
|
|
|
|
|
my $alphanum = '[a-z0-9]'; # ALPHA / DIGIT |
|
469
|
|
|
|
|
|
|
my $x = 'x'; # private use singleton |
|
470
|
|
|
|
|
|
|
my $singleton = '[a-wyz]'; # other singleton |
|
471
|
|
|
|
|
|
|
my $s = '[_-]'; # separator -- lenient parsers will use [_-] -- strict will use [-] |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
# Now do the components. The structure is slightly different to allow for capturing the right components. |
|
474
|
|
|
|
|
|
|
# The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing. |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
my $language = '([a-z]{2,8}) | ([a-z]{2,3} [_-] [a-z]{3})'; |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
# ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8} |
|
479
|
|
|
|
|
|
|
# We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan). |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# Note: extlang invalid in Unicode language tags |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
my $script = '[a-z]{4}' ; # 4ALPHA |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
my $region = '(?: [a-z]{2}|[0-9]{3})' ; # 2ALPHA / 3DIGIT |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
my $variant = '(?: [a-z0-9]{5,8} | [0-9] [a-z0-9]{3} )' ; # 5*8alphanum / (DIGIT 3alphanum) |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
my $extension = '(?: [a-wyz] (?: [_-] [a-z0-9]{2,8} )+ )' ; # singleton 1*("-" (2*8alphanum)) |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
my $privateUse = '(?: x (?: [_-] [a-z0-9]{1,8} )+ )' ; # "x" 1*("-" (1*8alphanum)) |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Define certain grandfathered codes, since otherwise the regex is pretty useless. |
|
494
|
|
|
|
|
|
|
# Since these are limited, this is safe even later changes to the registry -- |
|
495
|
|
|
|
|
|
|
# the only oddity is that it might change the type of the tag, and thus |
|
496
|
|
|
|
|
|
|
# the results from the capturing groups. |
|
497
|
|
|
|
|
|
|
# http://www.iana.org/assignments/language-subtag-registry |
|
498
|
|
|
|
|
|
|
# Note that these have to be compared case insensitively, requiring (?i) below. |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
my $grandfathered = '(?: |
|
501
|
|
|
|
|
|
|
(en [_-] GB [_-] oed) |
|
502
|
|
|
|
|
|
|
| (i [_-] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )) |
|
503
|
|
|
|
|
|
|
| (no [_-] (?: bok | nyn )) |
|
504
|
|
|
|
|
|
|
| (sgn [_-] (?: BE [_-] (?: fr | nl) | CH [_-] de )) |
|
505
|
|
|
|
|
|
|
| (zh [_-] min [_-] nan) |
|
506
|
|
|
|
|
|
|
)'; |
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
# old: | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue ); |
|
509
|
|
|
|
|
|
|
# For well-formedness, we don't need the ones that would otherwise pass. |
|
510
|
|
|
|
|
|
|
# For validity, they need to be checked. |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# $grandfatheredWellFormed = (?: |
|
513
|
|
|
|
|
|
|
# art $s lojban |
|
514
|
|
|
|
|
|
|
# | cel $s gaulish |
|
515
|
|
|
|
|
|
|
# | zh $s (?: guoyu | hakka | xiang ) |
|
516
|
|
|
|
|
|
|
# ); |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# Unicode locales: but we are shifting to a compatible form |
|
519
|
|
|
|
|
|
|
# $keyvalue = (?: $alphanum+ \= $alphanum+); |
|
520
|
|
|
|
|
|
|
# $keywords = ($keyvalue (?: \; $keyvalue)*); |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# We separate items that we want to capture as a single group |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
my $variantList = $variant . '(?:' . $s . $variant . ')*' ; # special for multiples |
|
525
|
|
|
|
|
|
|
my $extensionList = $extension . '(?:' . $s . $extension . ')*' ; # special for multiples |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
my $langtag = " |
|
528
|
|
|
|
|
|
|
($language) |
|
529
|
|
|
|
|
|
|
($s ( $script ) )? |
|
530
|
|
|
|
|
|
|
($s ( $region ) )? |
|
531
|
|
|
|
|
|
|
($s ( $variantList ) )? |
|
532
|
|
|
|
|
|
|
($s ( $extensionList ) )? |
|
533
|
|
|
|
|
|
|
($s ( $privateUse ) )? |
|
534
|
|
|
|
|
|
|
"; |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
# Here is the final breakdown, with capturing groups for each of these components |
|
537
|
|
|
|
|
|
|
# The variants, extensions, grandfathered, and private-use may have interior '-' |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
my $r = ($value_to_test =~ |
|
540
|
|
|
|
|
|
|
/^( |
|
541
|
|
|
|
|
|
|
($langtag) |
|
542
|
|
|
|
|
|
|
| ($privateUse) |
|
543
|
|
|
|
|
|
|
| ($grandfathered) |
|
544
|
|
|
|
|
|
|
)$/xi); |
|
545
|
|
|
|
|
|
|
return $r; |
|
546
|
|
|
|
|
|
|
} |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
# If people use a non-canon lang once, they're likely to do it twice, so a little caching. |
|
549
|
|
|
|
|
|
|
sub _canon_lang |
|
550
|
|
|
|
|
|
|
{ |
|
551
|
|
|
|
|
|
|
my $lang = shift; |
|
552
|
|
|
|
|
|
|
unless (defined $canon_lang->{$lang}) |
|
553
|
|
|
|
|
|
|
{ |
|
554
|
|
|
|
|
|
|
$canon_lang->{$lang} = __canon_lang($lang); |
|
555
|
|
|
|
|
|
|
} |
|
556
|
|
|
|
|
|
|
return $canon_lang->{$lang}; |
|
557
|
|
|
|
|
|
|
} |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
sub __canon_lang |
|
560
|
|
|
|
|
|
|
{ |
|
561
|
|
|
|
|
|
|
# Use lower case only |
|
562
|
|
|
|
|
|
|
my $lang = lc shift; |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# If there's a 2 letter code where a three letter code has been used, replace it. |
|
565
|
|
|
|
|
|
|
if ($lang =~ /^([a-z]{3})/) |
|
566
|
|
|
|
|
|
|
{ |
|
567
|
|
|
|
|
|
|
substr($lang, 0, 3) = $lang_3to2->{$1} |
|
568
|
|
|
|
|
|
|
if defined $lang_3to2->{$1}; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
return $lang_grandfather->{$lang} |
|
572
|
|
|
|
|
|
|
if defined $lang_grandfather->{$lang}; |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# Simple case. |
|
575
|
|
|
|
|
|
|
return $lang if length $lang < 4; |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
# Replace '_' with '-'. |
|
578
|
|
|
|
|
|
|
$lang =~ s/_/-/g; |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# upper case the country component of lang-country pairs. |
|
581
|
|
|
|
|
|
|
return sprintf('%s-%s', $1, _canon_country($2)) |
|
582
|
|
|
|
|
|
|
if $lang =~ /^([a-z]{2,3})-([a-z]{2}|\d{3})$/; |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# title case the script component of lang-script pairs. |
|
585
|
|
|
|
|
|
|
return sprintf('%s-%s', $1, _canon_script($2)) |
|
586
|
|
|
|
|
|
|
if $lang =~ /^([a-z]{2,3})-([a-z]{4})$/; |
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# title case the script component, upper case country componet of lang-script-country triplets. |
|
589
|
|
|
|
|
|
|
return sprintf('%s-%s-%s', $1, _canon_script($2), _canon_country($3)) |
|
590
|
|
|
|
|
|
|
if $lang =~ /^([a-z]{2,3})-([a-z]{4})-([a-z]{2}|\d{3})$/; |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# Too complicated - give up and return lower case. |
|
593
|
|
|
|
|
|
|
return $lang; |
|
594
|
|
|
|
|
|
|
} |
|
595
|
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
sub _canon_country |
|
597
|
|
|
|
|
|
|
{ |
|
598
|
|
|
|
|
|
|
my $c = uc shift; |
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
if ($c =~ /^\d\d\d$/) |
|
601
|
|
|
|
|
|
|
{ |
|
602
|
|
|
|
|
|
|
my $c1 = country_code2code($c, LOCALE_CODE_NUMERIC, LOCALE_CODE_ALPHA_2); |
|
603
|
|
|
|
|
|
|
$c = uc $c1 |
|
604
|
|
|
|
|
|
|
if defined $c1 && length $c1; |
|
605
|
|
|
|
|
|
|
} |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
return $obsolete_iso3166->{$c} |
|
608
|
|
|
|
|
|
|
if defined $obsolete_iso3166->{$c}; |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
return $c; |
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
sub _canon_script |
|
614
|
|
|
|
|
|
|
{ |
|
615
|
|
|
|
|
|
|
my $s = ucfirst lc shift; |
|
616
|
|
|
|
|
|
|
return $s; |
|
617
|
|
|
|
|
|
|
} |
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
1; |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
__END__ |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head1 NAME |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
HTML::HTML5::Sanity - make HTML5 DOM trees less insane |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
use HTML::HTML5::Parser; |
|
630
|
|
|
|
|
|
|
use HTML::HTML5::Sanity; |
|
631
|
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
my $parser = HTML::HTML5::Parser->new; |
|
633
|
|
|
|
|
|
|
my $html5_dom = $parser->parse_file('http://example.com/'); |
|
634
|
|
|
|
|
|
|
my $sane_dom = fix_document($html5_dom); |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
The Document Object Model (DOM) generated by HTML::HTML5::Parser meets |
|
639
|
|
|
|
|
|
|
the requirements of the HTML5 spec, but will probably catch a lot of |
|
640
|
|
|
|
|
|
|
people by surprise. |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
The main oddity is that elements and attributes which appear to be |
|
643
|
|
|
|
|
|
|
namespaced are not really. For example, the following element: |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
<div xml:lang="fr">...</div> |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
Looks like it should be parsed so that it has an attribute "lang" in |
|
648
|
|
|
|
|
|
|
the XML namespace. Not so. It will really be parsed as having the |
|
649
|
|
|
|
|
|
|
attribute "xml:lang" in the null namespace. |
|
650
|
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=over 4 |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
=item C<< fix_document($document) >> |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
$sane_dom = fix_document($html5_dom); |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
Returns a modified copy of the DOM and leaving the original DOM |
|
658
|
|
|
|
|
|
|
unmodified. |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=item C<< fix_element($element_node, $new_document_node, \%namespaces) >> |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
Don't use this. Not exported. |
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=item C<< fix_attribute($attribute_node, $new_element_node, \%namespaces) >> |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
Don't use this. Not exported. |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=item C<$HTML::HTML5::Sanity::FIX_LANG_ATTRIBUTES> |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
$HTML::HTML5::Sanity::FIX_LANG_ATTRIBUTES = 2; |
|
671
|
|
|
|
|
|
|
$sane_dom = fix_document($html5_dom); |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
If set to 1 (the default), the package will detect invalid values in |
|
674
|
|
|
|
|
|
|
@lang and @xml:lang, and remove the attribute if it is invalid. If set |
|
675
|
|
|
|
|
|
|
to 2, it will also attempt to canonicalise the value (e.g. 'EN_GB' will |
|
676
|
|
|
|
|
|
|
be converted to to 'en-GB'). If set to 0, then the value of language |
|
677
|
|
|
|
|
|
|
attributes is not checked. |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
=back |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=head1 BUGS |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
Please report any bugs to L<http://rt.cpan.org/>. |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
L<HTML::HTML5::Parser>, L<XML::LibXML>, L<Task::HTML5>. |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=head1 AUTHOR |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
Toby Inkster E<lt>tobyink@cpan.orgE<gt>. |
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
694
|
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
Copyright (C) 2009-2013 by Toby Inkster |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
698
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=head1 DISCLAIMER OF WARRANTIES |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED |
|
703
|
|
|
|
|
|
|
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF |
|
704
|
|
|
|
|
|
|
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=cut |