File Coverage

blib/lib/WebService/Validator/HTML/W3C.pm
Criterion Covered Total %
statement 69 169 40.8
branch 19 72 26.3
condition 10 16 62.5
subroutine 15 22 68.1
pod 9 9 100.0
total 122 288 42.3


line stmt bran cond sub pod time code
1             # $Id$
2              
3             package WebService::Validator::HTML::W3C;
4              
5 15     15   687215 use strict;
  15         45  
  15         844  
6 15     15   92 use base qw( Class::Accessor );
  15         41  
  15         28269  
7 15     15   124579 use LWP::UserAgent;
  15         1391069  
  15         627  
8 15     15   30861 use HTTP::Request::Common 'POST';
  15         86752  
  15         4095  
9 15     15   150 use URI::Escape;
  15         31  
  15         2509  
10 15     15   16807 use WebService::Validator::HTML::W3C::Error;
  15         48  
  15         136  
11 15     15   11209 use WebService::Validator::HTML::W3C::Warning;
  15         42  
  15         106  
12              
13             __PACKAGE__->mk_accessors(
14             qw( http_timeout validator_uri proxy ua _http_method
15             is_valid num_errors num_warnings uri _content _output _response ) );
16              
17 15     15   814 use vars qw( $VERSION $VALIDATOR_URI $HTTP_TIMEOUT );
  15         31  
  15         39097  
18              
19             $VERSION = 0.28;
20             $VALIDATOR_URI = 'http://validator.w3.org/check';
21             $HTTP_TIMEOUT = 30;
22              
23             =head1 NAME
24              
25             WebService::Validator::HTML::W3C - Access the W3Cs online HTML validator
26              
27             =head1 SYNOPSIS
28              
29             use WebService::Validator::HTML::W3C;
30              
31             my $v = WebService::Validator::HTML::W3C->new(
32             detailed => 1
33             );
34              
35             if ( $v->validate("http://www.example.com/") ) {
36             if ( $v->is_valid ) {
37             printf ("%s is valid\n", $v->uri);
38             } else {
39             printf ("%s is not valid\n", $v->uri);
40             foreach my $error ( @{$v->errors} ) {
41             printf("%s at line %d\n", $error->msg,
42             $error->line);
43             }
44             }
45             } else {
46             printf ("Failed to validate the website: %s\n", $v->validator_error);
47             }
48              
49             =head1 DESCRIPTION
50              
51             WebService::Validator::HTML::W3C provides access to the W3C's online
52             Markup validator. As well as reporting on whether a page is valid it
53             also provides access to a detailed list of the errors and where in
54             the validated document they occur.
55              
56             =head1 METHODS
57              
58             =head2 new
59              
60             my $v = WebService::Validator::HTML::W3C->new();
61              
62             Returns a new instance of the WebService::Validator::HTML::W3C object.
63              
64             There are various options that can be set when creating the Validator
65             object like so:
66              
67             my $v = WebService::Validator::HTML::W3C->new( http_timeout => 20 );
68              
69             =over 4
70              
71             =item validator_uri
72              
73             The URI of the validator to use. By default this accesses the W3Cs validator at http://validator.w3.org/check. If you have a local installation of the validator ( recommended if you wish to do a lot of testing ) or wish to use a validator at another location then you can use this option. Please note that you need to use the full path to the validator cgi.
74              
75             =item ua
76              
77             The user agent to use. Should be an LWP::UserAgent object or something that provides the same interface. If this argument is provided, the C and C arguments are ignored.
78              
79             =item http_timeout
80              
81             How long (in seconds) to wait for the HTTP connection to timeout when
82             contacting the validator. By default this is 30 seconds.
83              
84             =item detailed
85              
86             This fetches the XML response from the validator in order to provide information for the errors method. You should set this to true if you intend to use the errors method.
87              
88             =item proxy
89              
90             An HTTP proxy to use when communicating with the validation service.
91              
92             =item output
93              
94             Controls which output format is used. Can be either xml or soap12.
95              
96             The default is soap12 as the XML format is deprecated and is likely to be removed in the future.
97              
98             The default will always work so unless you're using a locally installed Validator you can safely ignore this.
99              
100             =back
101              
102             =cut
103              
104             sub new {
105 14     14 1 143611 my $ref = shift;
106 14   33     131 my $class = ref $ref || $ref;
107 14         38 my $obj = {};
108 14         51 bless $obj, $class;
109 14         76 $obj->_init(@_);
110 14         462 return $obj;
111             }
112              
113             sub _init {
114 14     14   34 my $self = shift;
115 14         64 my %args = @_;
116              
117 14   66     145 $self->http_timeout( $args{http_timeout} || $HTTP_TIMEOUT );
118 14   66     476 $self->validator_uri( $args{validator_uri} || $VALIDATOR_URI );
119 14         238 $self->ua( $args{ua} );
120 14 100       266 $self->_http_method( $args{detailed} ? 'GET' : 'HEAD' );
121 14   100     236 $self->_output( $args{output} || 'soap12' );
122 14   50     228 $self->proxy( $args{proxy} || '' );
123             }
124              
125             =head2 validate
126              
127             $v->validate( 'http:://www.example.com/' );
128              
129             Validate a URI. Returns 0 if the validation fails (e.g if the
130             validator cannot be reached), otherwise 1.
131              
132             =head2 validate_file
133              
134             $v->validate_file( './file.html' );
135              
136             Validate a file by uploading it to the W3C Validator. NB This has only been tested on a Linux box so may not work on non unix machines.
137              
138             =head2 validate_markup
139              
140             $v->validate_markup( $markup );
141              
142             Validate a scalar containing HTML.
143              
144             =head2 Alternate interface
145              
146             You can also pass a hash in to specify what you wish to validate. This is provided to ensure compatibility with the CSS validator module.
147              
148             $v->validate( uri => 'http://example.com/' );
149             $v->validate( string => $markup );
150             $v->validate( file => './file.html' );
151            
152             =cut
153              
154             sub validate_file {
155 0     0 1 0 my $self = shift;
156 0         0 my $file = shift;
157              
158 0 0       0 return $self->validator_error("You need to supply a file to validate")
159             unless $file;
160              
161 0         0 return $self->_validate( { file => $file } );
162             }
163              
164             sub validate_markup {
165 0     0 1 0 my $self = shift;
166 0         0 my $markup = shift;
167              
168 0 0       0 return $self->validator_error("You need to supply markup to validate")
169             unless $markup;
170              
171 0         0 return $self->_validate( { markup => $markup } );
172             }
173              
174             sub validate {
175 3     3 1 2664 my $self = shift;
176              
177 3         6 my ( %opts, $uri );
178 3 100       12 if ( scalar( @_ ) > 1 ) {
179 1         4 %opts = @_;
180            
181 1 50       11 if ( $opts{ 'uri' } ) {
    50          
    50          
182 0         0 $uri = $opts{ 'uri' };
183             } elsif ( $opts{ 'string' } ) {
184 0         0 return $self->validate_markup( $opts{ 'string' } );
185             } elsif( $opts{ 'file' } ) {
186 0         0 return $self->validate_file( $opts{ 'file' } );
187             } else {
188 1         7 return $self->validator_error( "You need to provide a uri, string or file to validate" );
189             }
190             } else {
191 2         5 $uri = shift;
192             }
193              
194 2 100       8 return $self->validator_error("You need to supply a URI to validate")
195             unless $uri;
196              
197 1 50       7 return $self->validator_error("You need to supply a URI scheme (e.g http)")
198             unless $uri =~ m(^.*?://);
199              
200 0         0 return $self->_validate( $uri );
201             }
202              
203             sub _validate {
204 0     0   0 my $self = shift;
205 0         0 my $uri = shift;
206              
207 0         0 my $uri_orig = $uri;
208              
209 0         0 $self->uri($uri_orig);
210              
211 0         0 my $ua = $self->ua;
212 0 0       0 if ( ! $ua ) {
213 0         0 $ua = LWP::UserAgent->new( agent => __PACKAGE__ . "/$VERSION",
214             timeout => $self->http_timeout );
215              
216 0 0       0 if ( $self->proxy ) { $ua->proxy( 'http', $self->proxy ); }
  0         0  
217             }
218              
219 0         0 my $request = $self->_get_request( $uri );
220              
221 0         0 my $response = $ua->request($request);
222              
223 0 0       0 if ( $response->is_success ) # not an error, we could contact the server
224             {
225              
226             # set both valid and error number according to response
227              
228 0         0 $self->_response( $response );
229            
230 0         0 my $res = $self->_parse_validator_response();
231 0 0       0 $self->_content( $response->content() )
232             if $self->_http_method() !~ /HEAD/;
233              
234             # we know the validator has been able to (in)validate if
235             # $self->valid is not NULL
236              
237 0 0       0 if ( $res ) {
238 0         0 return 1;
239             } else {
240 0         0 return 0;
241             }
242             }
243             else {
244 0         0 return $self->validator_error('Could not contact validator');
245             }
246             }
247              
248             =head2 is_valid
249              
250             $v->is_valid;
251              
252             Returns true (1) if the URI validated otherwise 0.
253              
254              
255             =head2 uri
256              
257             $v->uri();
258              
259             Returns the URI of the last page on which validation succeeded.
260              
261              
262             =head2 num_errors
263              
264             $num_errors = $v->num_errors();
265              
266             Returns the number of errors that the validator encountered.
267              
268             =head2 errorcount
269              
270             Synonym for num_errors. There to match CSS Validator interface.
271              
272             =head2 warningcount
273              
274             $num_errors = $v->warningcount();
275              
276             Returns the number of warnings that the validator encountered.
277              
278             =head2 errors
279              
280             $errors = $v->errors();
281            
282             foreach my $err ( @$errors ) {
283             printf("line: %s, col: %s\n\terror: %s\n",
284             $err->line, $err->col, $err->msg);
285             }
286              
287             Returns an array ref of WebService::Validator::HTML::W3C::Error objects.
288             These have line, col and msg methods that return a line number, a column
289             in that line and the error that occurred at that point.
290              
291             Note that you need XML::XPath for this to work and you must have initialised
292             WebService::Validator::HTML::W3C with the detailed option. If you have not
293             set the detailed option a warning will be issued, the detailed option will
294             be set and a second request made to the validator in order to fetch the
295             required information.
296              
297             If there was a problem processing the detailed information then this method
298             will return 0.
299              
300             =head2 warnings
301              
302             $warnings = $v->warnings();
303              
304             Works exactly the same as errors only returns an array ref of
305             WebService::Validator::HTML::W3C::Warning objects. In all other respects it's the same.
306              
307             =cut
308              
309             sub errors {
310 1     1 1 780 my $self = shift;
311              
312 1 50       5 return undef unless $self->num_errors();
313              
314 0 0       0 unless ( $self->_http_method() eq 'GET' ) {
315 0         0 warn "You should set detailed when initalising if you intend to use the errors method";
316 0         0 $self->_http_method( 'GET' );
317 0         0 $self->validate( $self->uri() );
318             }
319              
320 0         0 my @errs;
321              
322 0         0 eval { require XML::XPath; };
  0         0  
323 0 0       0 if ($@) {
324 0         0 warn "XML::XPath must be installed in order to get detailed errors";
325 0         0 return undef;
326             }
327              
328 0         0 my $xp = XML::XPath->new( xml => $self->_content() );
329              
330 0 0       0 if ( $self->_output eq 'xml' ) {
331 0 0       0 if ( ! $xp->findnodes('/result') ) {
332 0         0 return $self->validator_error( 'Result format does not appear to be XML' );
333             }
334 0         0 my @messages = $xp->findnodes('/result/messages/msg');
335              
336 0         0 foreach my $msg (@messages) {
337 0         0 my $err = WebService::Validator::HTML::W3C::Error->new({
338             line => $msg->getAttribute('line'),
339             col => $msg->getAttribute('col'),
340             msg => $msg->getChildNode(1)->getValue(),
341             });
342              
343 0         0 push @errs, $err;
344             }
345             } else { # assume soap...
346 0 0       0 if ( ! $xp->findnodes('/env:Envelope') ) {
347 0         0 return $self->validator_error( 'Result format does not appear to be SOAP' );
348             }
349 0         0 my @messages = $xp->findnodes( '/env:Envelope/env:Body/m:markupvalidationresponse/m:errors/m:errorlist/m:error' );
350              
351 0         0 foreach my $msg ( @messages ) {
352 0         0 my $err = WebService::Validator::HTML::W3C::Error->new({
353             line => $xp->find( './m:line', $msg )->get_node(1)->getChildNode(1)->getValue,
354             col => $xp->find( './m:col', $msg )->get_node(1)->getChildNode(1)->getValue,
355             msg => $xp->find( './m:message', $msg )->get_node(1)->getChildNode(1)->getValue,
356             msgid => $xp->find( './m:messageid', $msg )->get_node(1)->getChildNode(1)->getValue,
357             explanation => $xp->find( './m:explanation', $msg )->get_node(1)->getChildNode(1)->getValue,
358             });
359            
360 0 0       0 if ( $xp->find( './m:source' ) ) {
361 0         0 $err->source( $xp->find( './m:source', $msg )->get_node(1)->getChildNode(1)->getValue );
362             }
363              
364 0         0 push @errs, $err;
365             }
366             }
367              
368 0         0 return \@errs;
369             }
370              
371             sub errorcount {
372 1     1 1 566 shift->num_errors;
373             }
374              
375             sub warningcount {
376 0     0 1 0 shift->num_warnings;
377             }
378              
379             sub warnings {
380 0     0 1 0 my $self = shift;
381              
382 0 0       0 unless ( $self->_http_method() eq 'GET' ) {
383 0         0 warn "You should set detailed when initalising if you intend to use the warnings method";
384 0         0 $self->_http_method( 'GET' );
385 0         0 $self->validate( $self->uri() );
386             }
387              
388              
389 0         0 eval { require XML::XPath; };
  0         0  
390 0 0       0 if ($@) {
391 0         0 warn "XML::XPath must be installed in order to get warnings";
392 0         0 return undef;
393             }
394              
395 0         0 my $xp = XML::XPath->new( xml => $self->_content() );
396              
397 0         0 my @warnings;
398              
399 0 0       0 if ( $self->_output eq 'soap12' ) {
400 0 0       0 if ( ! $xp->findnodes('/env:Envelope') ) {
401 0         0 return $self->validator_error( 'Result format does not appear to be SOAP' );
402             }
403 0         0 my @messages = $xp->findnodes( '/env:Envelope/env:Body/m:markupvalidationresponse/m:warnings/m:warninglist/m:warning' );
404              
405 0         0 foreach my $msg ( @messages ) {
406 0         0 my ($line, $col);
407              
408 0 0       0 if( ($line = $xp->findvalue('./m:line', $msg)) eq "") {
409 0         0 $line = undef;
410             }
411              
412 0 0       0 if( ($col = $xp->findvalue('./m:col', $msg)) eq "") {
413 0         0 $col = undef;
414             }
415              
416 0         0 my $warning = WebService::Validator::HTML::W3C::Warning->new({
417             line => $line,
418             col => $col,
419             msg => $xp->find( './m:message', $msg )->get_node(1)->getChildNode(1)->getValue,
420             });
421              
422             # we may not get a source element if, e.g the only error is a
423             # missing doctype so check first
424 0 0       0 if ( $xp->find( './m:source' ) ) {
425 0         0 $warning->source( $xp->find( './m:source', $msg )->get_node(1)->getChildNode(1)->getValue );
426             }
427              
428 0         0 push @warnings, $warning;
429             }
430 0         0 return \@warnings;
431             } else {
432 0         0 return $self->validator_error( 'Warnings only available with SOAP output format' );
433              
434             }
435             }
436              
437             =head2 validator_error
438              
439             $error = $v->validator_error();
440              
441             Returns a string indicating why validation may not have occurred. This is not
442             the reason that a webpage was invalid. It is the reason that no meaningful
443             information about the attempted validation could be obtained. This is most
444             likely to be an HTTP error
445              
446             Possible values are:
447              
448             =over 4
449              
450             =item You need to supply a URI to validate
451              
452             You didn't pass a URI to the validate method
453              
454             =item You need to supply a URI with a scheme
455              
456             The URI you passed to validate didn't have a scheme on the front. The
457             W3C validator can't handle URIs like www.example.com but instead
458             needs URIs of the form http://www.example.com/.
459              
460             =item Not a W3C Validator or Bad URI
461              
462             The URI did not return the headers that WebService::Validator::HTML::W3C
463             relies on so it is likely that there is not a W3C Validator at that URI.
464             The other possibility is that it didn't like the URI you provided. Sadly
465             the Validator doesn't give very useful feedback on this at the moment.
466              
467             =item Could not contact validator
468              
469             WebService::Validator::HTML::W3C could not establish a connection to the URI.
470              
471             =item Did not get a sensible result from the validator
472              
473             Should never happen and most likely indicates a problem somewhere but
474             on the off chance that WebService::Validator::HTML::W3C is unable to make
475             sense of the response from the validator you'll get this error.
476              
477             =item Result format does not appear to be SOAP|XML
478              
479             If you've asked for detailed results and the reponse from the validator
480             isn't in the expected format then you'll get this error. Most likely to
481             happen if you ask for SOAP output from a validator that doesn't
482             support that format.
483              
484             =item You need to provide a uri, string or file to validate
485              
486             You've passed in a hash ( or in fact more than one argument ) to validate
487             but the hash does not contain one of the three expected keys.
488              
489             =back
490              
491             =cut
492              
493             sub validator_error {
494 6     6 1 10 my $self = shift;
495 6         8 my $validator_error = shift;
496              
497 6 100       21 if ( defined $validator_error ) {
498 3         8 $self->{'validator_error'} = $validator_error;
499 3         17 return 0;
500             }
501              
502 3         17 return $self->{'validator_error'};
503             }
504              
505             =head2 validator_uri
506              
507             $uri = $v->validator_uri();
508             $v->validator_uri('http://validator.w3.org/check');
509              
510             Returns or sets the URI of the validator to use. Please note that you need
511             to use the full path to the validator cgi.
512              
513              
514             =head2 http_timeout
515              
516             $timeout = $v->http_timeout();
517             $v->http_timeout(10);
518              
519             Returns or sets the timeout for the HTTP request.
520              
521             =cut
522              
523             sub _construct_uri {
524 0     0   0 my $self = shift;
525 0         0 my $uri_to_validate = shift;
526              
527             # creating the HTTP query string with all parameters
528 0         0 my $req_uri =
529             join ( '', "?uri=", uri_escape($uri_to_validate), ";output=", $self->_output );
530              
531 0         0 return $self->validator_uri . $req_uri;
532             }
533              
534             sub _parse_validator_response {
535 3     3   5684 my $self = shift;
536 3         12 my $response = $self->_response();
537              
538 3         107 my $valid = $response->header('X-W3C-Validator-Status');
539 3         290 my $valid_err_num = $response->header('X-W3C-Validator-Errors');
540 3         116 $self->num_warnings($response->header('X-W3C-Validator-Warnings'));
541              
542             # remove non digits to fix output bug in some versions of validator
543 3 100       178 $valid_err_num =~ s/\D+//g if $valid_err_num;
544              
545 3 100 66     47 if ( $valid and $valid_err_num ) {
    50          
    50          
546 2         12 $self->is_valid(0);
547 2         29 $self->num_errors($valid_err_num);
548 2         21 return 1;
549             }
550             elsif ( !defined $valid ) {
551 0         0 return $self->validator_error('Not a W3C Validator or Bad URI');
552             }
553             elsif ( $valid =~ /\bvalid\b/i ) {
554 1         4 $self->is_valid(1);
555 1         21 $self->num_errors($valid_err_num);
556 1         11 return 1;
557             }
558              
559 0           return $self->validator_error(
560             'Did not get a sensible result from the Validator');
561             }
562              
563             sub _get_request {
564 0     0     my $self = shift;
565 0           my $uri = shift;
566              
567 0 0         if ( ref $uri ) {
568 0 0         if ( $uri->{ file } ) {
    0          
569             return POST $self->validator_uri,
570             Content_Type => 'form-data',
571             Content => [
572             output => $self->_output,
573 0           uploaded_file => [ $uri->{ file } ],
574             ];
575             } elsif ( $uri->{ markup } ) {
576             return POST $self->validator_uri,
577             Content_Type => 'form-data',
578             Content => [
579             output => $self->_output,
580             fragment => $uri->{ markup },
581 0           ];
582             }
583             } else {
584 0           return new HTTP::Request( $self->_http_method(), $self->_construct_uri( $uri ) );
585             }
586             }
587            
588             1;
589              
590             __END__