blib/lib/W3C/XHTML/HTMLCompatChecker.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 7 | 10 | 70.0 |
branch | n/a | ||
condition | n/a | ||
subroutine | 3 | 4 | 75.0 |
pod | n/a | ||
total | 10 | 14 | 71.4 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package W3C::XHTML::HTMLCompatChecker; | ||||||
2 | |||||||
3 | 3 | 3 | 2281 | use strict; | |||
3 | 6 | ||||||
3 | 116 | ||||||
4 | 3 | 3 | 17 | use warnings; | |||
3 | 6 | ||||||
3 | 545 | ||||||
5 | |||||||
6 | require Exporter; | ||||||
7 | our @ISA = qw(Exporter); | ||||||
8 | our %EXPORT_TAGS = ( 'all' => [ qw() ] ); | ||||||
9 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | ||||||
10 | our @EXPORT = qw(); | ||||||
11 | our $VERSION = sprintf "%d.%03d",q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/; | ||||||
12 | 0 | 0 | sub Version { $VERSION; } | ||||
13 | |||||||
14 | 3 | 3 | 5088 | use XML::Parser; | |||
0 | |||||||
0 | |||||||
15 | use URI; | ||||||
16 | use LWP::UserAgent; | ||||||
17 | |||||||
18 | # Define global constants | ||||||
19 | use constant TRUE => 1; | ||||||
20 | use constant FALSE => 0; | ||||||
21 | |||||||
22 | |||||||
23 | use constant APPC_FOUND_XML_DECL => 1; # http://www.w3.org/TR/xhtml1/#C_1 | ||||||
24 | use constant APPC_FOUND_XML_PI => 2; # http://www.w3.org/TR/xhtml1/#C_1 | ||||||
25 | use constant APPC_MISSING_SPACE => 3; # http://www.w3.org/TR/xhtml1/#C_2 | ||||||
26 | use constant APPC_UNMINIMIZED => 4; # http://www.w3.org/TR/xhtml1/#C_2 | ||||||
27 | use constant APPC_MINIMIZED => 5; # http://www.w3.org/TR/xhtml1/#C_3 | ||||||
28 | use constant APPC_MANY_ISINDEX => 6; # http://www.w3.org/TR/xhtml1/#C_6 | ||||||
29 | use constant APPC_ONLY_LANG => 7; # http://www.w3.org/TR/xhtml1/#C_7 | ||||||
30 | use constant APPC_ONLY_XML_LANG => 8; # http://www.w3.org/TR/xhtml1/#C_7 | ||||||
31 | use constant APPC_APOS_IN_ATTR => 9; # http://www.w3.org/TR/xhtml1/#C_16 | ||||||
32 | use constant APPC_APOS_IN_ELEM => 10; # http://www.w3.org/TR/xhtml1/#C_16 | ||||||
33 | |||||||
34 | use constant APPC_ERRO => 0; # @@ | ||||||
35 | use constant APPC_WARN => 1; # @@ | ||||||
36 | use constant APPC_INFO => 2; # @@ | ||||||
37 | use constant APPC_HINT => 3; # @@ | ||||||
38 | |||||||
39 | use constant SEVERITY_NAMES => | ||||||
40 | { | ||||||
41 | APPC_ERRO, "Error", | ||||||
42 | APPC_WARN, "Warning", | ||||||
43 | APPC_INFO, "Info", | ||||||
44 | APPC_HINT, "Hint", | ||||||
45 | }; | ||||||
46 | |||||||
47 | use constant CRITERIA => | ||||||
48 | { | ||||||
49 | APPC_FOUND_XML_DECL, [ 1, APPC_INFO, "XML declarations are problematic" ], | ||||||
50 | APPC_FOUND_XML_PI, [ 1, APPC_INFO, "XML processing instructions are problematic" ], | ||||||
51 | APPC_MISSING_SPACE, [ 2, APPC_ERRO, " |
||||||
52 | APPC_UNMINIMIZED, [ 2, APPC_ERRO, "For empty elements you shall use |
||||||
53 | APPC_MINIMIZED, [ 3, APPC_ERRO, "For non-empty elements, you shall use |
||||||
54 | APPC_ONLY_LANG, [ 7, APPC_ERRO, " |
||||||
55 | APPC_ONLY_XML_LANG, [ 7, APPC_ERRO, " |
||||||
56 | APPC_MANY_ISINDEX, [10, APPC_WARN, "Avoid more than one |
||||||
57 | APPC_APOS_IN_ATTR, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], | ||||||
58 | APPC_APOS_IN_ELEM, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], | ||||||
59 | }; | ||||||
60 | |||||||
61 | use constant GUIDELINE_TITLES => | ||||||
62 | { | ||||||
63 | 1, "Processing Instructions and the XML Declaration", | ||||||
64 | 2, "Empty Elements", | ||||||
65 | 3, "Element Minimization and Empty Element Content", | ||||||
66 | 6, "Isindex", | ||||||
67 | 7, "The lang and xml:lang Attributes", | ||||||
68 | 16, "The Named Character Reference '", | ||||||
69 | }; | ||||||
70 | |||||||
71 | use constant EMPTY_ELEMENTS => { map { $_ => 1 } | ||||||
72 | qw/ | ||||||
73 | base basefont link area hr img | ||||||
74 | meta param input isindex col br | ||||||
75 | / }; | ||||||
76 | |||||||
77 | # global variables... | ||||||
78 | our $ISINDEX = 0; | ||||||
79 | our $IS_RELEVANT_DOC = 1; # whether the checker is relevant to the doctype of the document being processed. | ||||||
80 | our $IS_RELEVANT_CT = 1; # whether the checker is relevant to the media type of the document being processed. | ||||||
81 | our $IS_WF = 1; # whether the document is at least well-formed XML | ||||||
82 | our @MESSAGES; | ||||||
83 | |||||||
84 | |||||||
85 | ########################### | ||||||
86 | # usual package interface # | ||||||
87 | ########################### | ||||||
88 | sub new | ||||||
89 | { | ||||||
90 | my $self = {}; | ||||||
91 | my $proto = shift; | ||||||
92 | my $class = ref($proto) || $proto; | ||||||
93 | bless($self, $class); | ||||||
94 | return $self; | ||||||
95 | } | ||||||
96 | |||||||
97 | |||||||
98 | |||||||
99 | |||||||
100 | ## Helper functions ####################################################### | ||||||
101 | sub is_empty_element { EMPTY_ELEMENTS->{shift()} } | ||||||
102 | sub is_isindex_element { shift eq "isindex" } | ||||||
103 | sub is_inside_head { shift->within_element("head") } | ||||||
104 | |||||||
105 | sub report_problem | ||||||
106 | { | ||||||
107 | my $exp = shift; | ||||||
108 | my $cod = shift; | ||||||
109 | my $loc = shift; | ||||||
110 | |||||||
111 | my $str = $exp->recognized_string; | ||||||
112 | my $lin = $exp->current_line; | ||||||
113 | my $col = $exp->current_column; | ||||||
114 | my $off = $exp->current_byte; | ||||||
115 | |||||||
116 | # determine position after skipping $loc, e.g. if there is | ||||||
117 | # | ||||||
118 | #
| ||||||
119 | # xml:lang = "de" | ||||||
120 | # class = "a b c d e f g" | ||||||
121 | # id = "example"/> | ||||||
122 | # | ||||||
123 | # the error is the / and it would be unhelpful to point at | ||||||
124 | # the < as expat would do in this case. | ||||||
125 | |||||||
126 | my $left = substr $str, 0, $loc; | ||||||
127 | my $lines = $left =~ y/\n//; # @@ does \n always work? | ||||||
128 | $left =~ s/^.*\n//s; # @@ does \n always work? | ||||||
129 | my $chars = length $left; | ||||||
130 | |||||||
131 | # set new positions | ||||||
132 | my $posy = $lin + $lines; # advance pointer | ||||||
133 | my $posx = $lines ? $chars : $col + $chars; # advance or replace | ||||||
134 | my $posxy = $off + $loc; # advance pointer | ||||||
135 | |||||||
136 | my $stext = SEVERITY_NAMES->{CRITERIA->{$cod}->[1]}; | ||||||
137 | my $mtext = CRITERIA->{$cod}->[2]; | ||||||
138 | my $cnum = CRITERIA->{$cod}->[0]; | ||||||
139 | my $gtitle = GUIDELINE_TITLES->{$cnum}; | ||||||
140 | |||||||
141 | push @MESSAGES, {severity => $stext, line => $posy, column => $posx + 1, cnum => $cnum, message_text => $mtext, guideline_title => $gtitle} | ||||||
142 | |||||||
143 | } | ||||||
144 | |||||||
145 | |||||||
146 | ## Pre-Parsing routines ################################################### | ||||||
147 | # make sure we are actually handling XHTML 1.0 documents served as text/html | ||||||
148 | # some code taken from W3C Markup Validator Codebase | ||||||
149 | |||||||
150 | sub parse_content_type { | ||||||
151 | my $Content_Type = shift; | ||||||
152 | my ($ct, @others) = split /\s*;\s*/, lc $Content_Type; | ||||||
153 | #print p($ct); | ||||||
154 | if ($ct ne "text/html") { | ||||||
155 | $IS_RELEVANT_CT = 0; | ||||||
156 | } | ||||||
157 | return $ct; | ||||||
158 | } | ||||||
159 | |||||||
160 | |||||||
161 | ## Handler for XML::Parser ################################################ | ||||||
162 | |||||||
163 | sub _start | ||||||
164 | { | ||||||
165 | my $exp = shift; | ||||||
166 | my $ele = shift; | ||||||
167 | my %att = @_; | ||||||
168 | my $str = $exp->recognized_string; | ||||||
169 | my $lin = $exp->current_line; | ||||||
170 | my $col = $exp->current_column; | ||||||
171 | my $off = $exp->current_byte; | ||||||
172 | my $end = length($str) - 1; | ||||||
173 | |||||||
174 | # check for multiple isindex elements | ||||||
175 | if (is_isindex_element($ele) and | ||||||
176 | is_inside_head($exp) and | ||||||
177 | $ISINDEX++) | ||||||
178 | { | ||||||
179 | report_problem($exp, APPC_MANY_ISINDEX, 0); | ||||||
180 | } | ||||||
181 | |||||||
182 | if ($str =~ m|/>$|) | ||||||
183 | { | ||||||
184 | # check for preceding space in empty element tag | ||||||
185 | if ($str !~ m|[ \x0d\x0a\t]/>$|) | ||||||
186 | { | ||||||
187 | report_problem($exp, APPC_MISSING_SPACE, $end - 1); | ||||||
188 | } | ||||||
189 | |||||||
190 | # check that empty element tags are used only for | ||||||
191 | # elements declared as EMPTY in the DTD | ||||||
192 | if (!is_empty_element($ele)) | ||||||
193 | { | ||||||
194 | report_problem($exp, APPC_MINIMIZED, $end - 1); | ||||||
195 | } | ||||||
196 | } | ||||||
197 | |||||||
198 | # check that elements declared as EMPTY use empty element tags | ||||||
199 | if (is_empty_element($ele)) | ||||||
200 | { | ||||||
201 | if ($str !~ m|/>$|) | ||||||
202 | { | ||||||
203 | report_problem($exp, APPC_UNMINIMIZED, $end); | ||||||
204 | } | ||||||
205 | } | ||||||
206 | |||||||
207 | # check for ' in attribute values | ||||||
208 | if ($str =~ m|'|) | ||||||
209 | { | ||||||
210 | local $_ = $str; | ||||||
211 | my $len = 0; | ||||||
212 | |||||||
213 | while(s/^(.*?)'//) | ||||||
214 | { | ||||||
215 | $len += length $1; | ||||||
216 | report_problem($exp, APPC_APOS_IN_ATTR, $len); | ||||||
217 | |||||||
218 | } | ||||||
219 | } | ||||||
220 | |||||||
221 | # check for ... |
||||||
222 | if (exists $att{'lang'} && not exists $att{'xml:lang'}) | ||||||
223 | { | ||||||
224 | report_problem($exp, APPC_ONLY_LANG, $end); | ||||||
225 | } | ||||||
226 | |||||||
227 | # check for ... |
||||||
228 | if (exists $att{'xml:lang'} && not exists $att{'lang'}) | ||||||
229 | { | ||||||
230 | report_problem($exp, APPC_ONLY_XML_LANG, $end); | ||||||
231 | } | ||||||
232 | } | ||||||
233 | |||||||
234 | sub _char | ||||||
235 | { | ||||||
236 | my $exp = shift; | ||||||
237 | my $txt = shift; | ||||||
238 | my $str = $exp->recognized_string; | ||||||
239 | my $lin = $exp->current_line; | ||||||
240 | my $col = $exp->current_column; | ||||||
241 | my $off = $exp->current_byte; | ||||||
242 | |||||||
243 | # check for ' in parsed character data | ||||||
244 | if ($str =~ /'/) | ||||||
245 | { | ||||||
246 | local $_ = $str; | ||||||
247 | my $len = 0; | ||||||
248 | |||||||
249 | while(s/^(.*?)'//) | ||||||
250 | { | ||||||
251 | $len += length $1; | ||||||
252 | report_problem($exp, APPC_APOS_IN_ELEM, $len); | ||||||
253 | |||||||
254 | } | ||||||
255 | } | ||||||
256 | } | ||||||
257 | |||||||
258 | sub _proc | ||||||
259 | { | ||||||
260 | # check for XML processing instructions | ||||||
261 | report_problem(shift, APPC_FOUND_XML_PI, 0); | ||||||
262 | } | ||||||
263 | |||||||
264 | sub _xmldecl | ||||||
265 | { | ||||||
266 | # check for XML declaration | ||||||
267 | report_problem(shift, APPC_FOUND_XML_DECL, 0); | ||||||
268 | } | ||||||
269 | |||||||
270 | sub _doctype | ||||||
271 | { | ||||||
272 | my $exp = shift; | ||||||
273 | my $doctypename = shift; | ||||||
274 | my $doctypesys = shift; | ||||||
275 | my $doctypepub = shift; | ||||||
276 | my $doctypeint = shift; | ||||||
277 | if (defined $doctypename) { | ||||||
278 | $IS_RELEVANT_DOC = 0 if ($doctypename ne "html"); | ||||||
279 | } | ||||||
280 | if(defined $doctypesys) { | ||||||
281 | $_ = $doctypesys; | ||||||
282 | $IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/.*\/xhtml.*.dtd/); | ||||||
283 | #$IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/.*\/xhtml1\/DTD\/xhtml1-(strict|transitional|frameset).dtd/); | ||||||
284 | } | ||||||
285 | if (defined $doctypepub) { | ||||||
286 | $_ = $doctypepub; | ||||||
287 | $IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML .*\/\/EN/); | ||||||
288 | # we choose to accept checking any XHTML - could be stricter and only check for XHTML 1.0 | ||||||
289 | #$IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML 1.0 (Strict|Transitional|Frameset)\/\/EN/); | ||||||
290 | } | ||||||
291 | if (defined $doctypeint) # there should be no internal subset | ||||||
292 | { | ||||||
293 | $IS_RELEVANT_DOC = 0 if (length $doctypeint); | ||||||
294 | } | ||||||
295 | $IS_RELEVANT_DOC = 0 if ((not defined $doctypesys) and (not defined $doctypepub)); # should not happen with XHTML 1.0 | ||||||
296 | } | ||||||
297 | |||||||
298 | |||||||
299 | |||||||
300 | ## Main ################################################################### | ||||||
301 | |||||||
302 | sub check_uri { | ||||||
303 | my $self = shift; | ||||||
304 | my $uri = shift; | ||||||
305 | my $any_xhtml = 0; # by default, only check XHTML 1.0 docs served as text/html | ||||||
306 | my @local_messages; | ||||||
307 | if (@_) { | ||||||
308 | my @anyxhtmlarry = @_; | ||||||
309 | if (int (@anyxhtmlarry) eq 2) | ||||||
310 | { | ||||||
311 | my $any_xhtml_varname=shift; | ||||||
312 | $any_xhtml = shift; | ||||||
313 | if ($any_xhtml ne "1") {$any_xhtml = 0} | ||||||
314 | } | ||||||
315 | } | ||||||
316 | |||||||
317 | # body... | ||||||
318 | my @messages; | ||||||
319 | |||||||
320 | if (defined $uri and length $uri and URI->new($uri)->scheme eq "http") | ||||||
321 | { | ||||||
322 | my $ua = LWP::UserAgent->new; | ||||||
323 | my $response = $ua->get($uri); | ||||||
324 | my $xml = undef; | ||||||
325 | my $ct = undef; | ||||||
326 | my @content_type_values = undef; | ||||||
327 | if ($response->is_success) { | ||||||
328 | $xml = $response->content; | ||||||
329 | @content_type_values = $response->header('Content-Type'); | ||||||
330 | $ct = $content_type_values[0]; | ||||||
331 | @messages = $self->check_content($xml); | ||||||
332 | } | ||||||
333 | if (defined $ct and length $ct) { | ||||||
334 | $ct = &parse_content_type($ct); | ||||||
335 | } | ||||||
336 | if ($IS_RELEVANT_CT eq 0 and $any_xhtml eq 0) { | ||||||
337 | push @local_messages, {severity => "Abort", message_text => "not text/html"}; | ||||||
338 | return @local_messages; | ||||||
339 | } | ||||||
340 | } | ||||||
341 | else { | ||||||
342 | push @local_messages, {severity => "Abort", message_text => "Bad URI"}; | ||||||
343 | return @local_messages; | ||||||
344 | } | ||||||
345 | return @messages; | ||||||
346 | } | ||||||
347 | |||||||
348 | sub check_content { | ||||||
349 | my $self = shift; | ||||||
350 | my $xml = shift; | ||||||
351 | my $any_xhtml = 0; # by default, only check XHTML 1.0 docs | ||||||
352 | my @local_messages; | ||||||
353 | |||||||
354 | if (@_) { | ||||||
355 | my @anyxhtmlarry = @_; | ||||||
356 | if (int (@anyxhtmlarry) eq 2) | ||||||
357 | { | ||||||
358 | my $any_xhtml_varname=shift; | ||||||
359 | $any_xhtml = shift; | ||||||
360 | if ($any_xhtml ne "1") {$any_xhtml = 0} | ||||||
361 | } | ||||||
362 | } | ||||||
363 | |||||||
364 | if (defined $xml and length $xml) | ||||||
365 | { | ||||||
366 | my $p = XML::Parser->new; | ||||||
367 | $p->setHandlers(Doctype => \&_doctype); | ||||||
368 | |||||||
369 | eval { $p->parsestring($xml); }; | ||||||
370 | #$output->param(is_relevant_ct => $IS_RELEVANT_CT); | ||||||
371 | #$output->param(is_relevant_doctype => $IS_RELEVANT_DOC); | ||||||
372 | |||||||
373 | if ($@) # not well-formed | ||||||
374 | { | ||||||
375 | $IS_WF = 0; | ||||||
376 | my $wf_errors = join '', $@; | ||||||
377 | push @local_messages, {severity => "Abort", message_text => "Content is not well-formed XML"}; | ||||||
378 | return @local_messages; | ||||||
379 | #$output->param(info_count => 1); | ||||||
380 | #$output->param(wf_errors => $wf_errors); | ||||||
381 | } | ||||||
382 | elsif (not $IS_RELEVANT_DOC) | ||||||
383 | { | ||||||
384 | if ($any_xhtml){ | ||||||
385 | push @local_messages, {severity => "Abort", message_text => "Content is not XHTML"}; | ||||||
386 | } | ||||||
387 | else { | ||||||
388 | push @local_messages, {severity => "Abort", message_text => "Content is not XHTML 1.0"}; | ||||||
389 | |||||||
390 | } | ||||||
391 | return @local_messages; | ||||||
392 | |||||||
393 | } | ||||||
394 | else # woot, Well-formed, and relevant. Let's get to work. | ||||||
395 | { | ||||||
396 | my $p = XML::Parser->new; | ||||||
397 | $p->setHandlers(Char => \&_char, | ||||||
398 | Proc => \&_proc, | ||||||
399 | Start => \&_start, | ||||||
400 | XMLDecl => \&_xmldecl); | ||||||
401 | eval { $p->parsestring($xml); }; | ||||||
402 | return @MESSAGES; | ||||||
403 | } | ||||||
404 | } | ||||||
405 | else { | ||||||
406 | return -1; | ||||||
407 | } | ||||||
408 | } | ||||||
409 | |||||||
410 | |||||||
411 | package W3C::XHTML::HTMLCompatChecker; | ||||||
412 | 1; | ||||||
413 | |||||||
414 | |||||||
415 | __END__ |