File Coverage

blib/lib/W3C/LogValidator/HTMLValidator.pm
Criterion Covered Total %
statement 4 6 66.6
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 6 8 75.0


line stmt bran cond sub pod time code
1             # Copyright (c) 2002-2005 the World Wide Web Consortium :
2             # Keio University,
3             # European Research Consortium for Informatics and Mathematics
4             # Massachusetts Institute of Technology.
5             # written by Olivier Thereaux for W3C
6             #
7             # $Id: HTMLValidator.pm,v 1.29 2008/11/18 16:48:56 ot Exp $
8              
9             package W3C::LogValidator::HTMLValidator;
10 1     1   907 use strict;
  1         2  
  1         34  
11 1     1   2493 use DB_File;
  0            
  0            
12              
13              
14             require Exporter;
15             our @ISA = qw(Exporter);
16             our %EXPORT_TAGS = ( 'all' => [ qw() ] );
17             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
18             our @EXPORT = qw();
19             our $VERSION = sprintf "%d.%03d",q$Revision: 1.29 $ =~ /(\d+)\.(\d+)/;
20              
21              
22              
23             ###########################
24             # usual package interface #
25             ###########################
26             our $verbose = 1;
27             our %config;
28              
29             sub new
30             {
31             my $self = {};
32             my $proto = shift;
33             my $class = ref($proto) || $proto;
34             # mandatory vars for the API
35             $self->{RESULT} = undef;
36             # internal stuff
37             $self->{VALID} = undef;
38             $self->{VALID_ERR_NUM} = undef;
39             $self->{VALID_SUCCESS} = undef;
40             $self->{VALID_HEAD} = undef;
41             # configuration for this module
42             if (@_) {%config = %{(shift)};}
43             $config{ValidatorMethod} = "HEAD" ;
44             $config{ValidatorHost} = "validator.w3.org" if (! exists $config{ValidatorHost});
45             $config{ValidatorPort} = "80" if (!exists $config{ValidatorPort});
46             $config{ValidatorString} = "/check\?uri=" if (!exists $config{ValidatorString});
47             $config{ValidatorPostString} = "\;output=xml" if (!exists $config{ValidatorPostString});
48             if (exists $config{AuthorizedExtensions})
49             {
50             $self->{AUTH_EXT} = $config{AuthorizedExtensions};
51             }
52             else
53             {
54             $self->{AUTH_EXT} = ".html .xhtml .phtml .htm .shtml .php .svg .xml /";
55             }
56             $config{ShowInvalid} = "Yes" if (!exists $config{ShowInvalid});
57             $config{ShowAborted} = "No" if (!exists $config{ShowAborted});
58             $config{ShowValid} = "No" if (!exists $config{ShowValid});
59             $config{CheckExtensionlessURIs} = "No" if (!exists $config{CheckExtensionlessURIs});
60              
61              
62             if (exists $config{verbose}) {$verbose = $config{verbose}}
63             @{$self->{URIs}} = undef;
64             bless($self, $class);
65             return $self;
66             }
67              
68             sub uris
69             {
70             my $self = shift;
71             if (@_) { @{$self->{URIs}} = @_ }
72             return @{$self->{URIs}};
73             }
74              
75              
76             sub valid
77             {
78             my $self = shift;
79             if (@_) { $self->{VALID} = shift }
80             return $self->{VALID};
81             }
82              
83             sub valid_err_num
84             {
85             my $self = shift;
86             if (@_) { $self->{VALID_ERR_NUM} = shift }
87             return $self->{VALID_ERR_NUM};
88             }
89              
90             sub valid_success
91             {
92             my $self = shift;
93             if (@_) { $self->{VALID_SUCCESS} = shift }
94             return $self->{VALID_SUCCESS};
95             }
96              
97             sub valid_head
98             {
99             my $self = shift;
100             if (@_) { $self->{VALID_HEAD} = shift }
101             return $self->{VALID_HEAD};
102             }
103              
104              
105             sub auth_ext
106             {
107             my $self=shift;
108             if (@_) { $self->{AUTH_EXT} = shift}
109             return $self->{AUTH_EXT};
110             }
111              
112             sub new_doc{
113             my $self=shift;
114             $self->{VALID} = undef;
115             $self->{VALID_ERR_NUM} = undef;
116             $self->{VALID_SUCCESS} = undef;
117             $self->{VALID_HEAD} = undef;
118             }
119              
120             sub HEAD_check {
121             ## Checking whether a document with no extension is actually an HTML/XML document
122             ## causes a lot of requests, but internal - should be OK?
123             my $self = shift;
124             my $check_uri;
125             use LWP::UserAgent;
126             if (@_) { $check_uri = shift }
127             my $ua = new LWP::UserAgent;
128             my $method = "HEAD";
129             my $request = new HTTP::Request("$method", "$check_uri");
130             my $response = new HTTP::Response;
131             $response = $ua->simple_request($request);
132             my $is_html = 0;
133             if ($response->is_success) # not an error, we could contact the server
134             {
135             my $type = $response->header('Content-Type');
136             if ($type =~ /text\/html|application\/xhtml+xml|text\/xml/) #should be enough for a start
137             {
138             $is_html = 1;
139             # print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug
140             }
141             }
142             return $is_html;
143             }
144              
145             sub trim_uris
146             {
147             my $self = shift;
148             my @authorized_extensions = split(" ", $self->auth_ext);
149             my @trimmed_uris;
150             my $exclude_regexp = "";
151             my @excluded_areas;
152             $exclude_regexp = $config{ExcludeAreas};
153             if ($exclude_regexp){
154             $exclude_regexp =~ s/\//\\\//g ;
155             @excluded_areas = split(" ", $exclude_regexp);
156             }
157             else { print "nothing to exclude\n" if ($verbose >2);}
158             my $uri;
159             my %HTTPcodes;
160             if (defined ($config{tmpfile_HTTP_codes}))
161             {
162             my $tmp_file_HTTP_codes = $config{tmpfile_HTTP_codes};
163             tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes", O_RDONLY) ||
164             die ("Cannot create or open $tmp_file_HTTP_codes");
165             }
166              
167              
168             while ($uri = shift)
169             {
170             my $uri_ext = "";
171             my $match = 0;
172             if ($uri =~ /(\.[0-9a-zA-Z]+)(\?.*)?$/)
173             {
174             $uri_ext = $1;
175             }
176             elsif ($uri =~ /\/$/) { $uri_ext = "/";}
177             elsif (($uri_ext eq "") and $config{CheckExtensionlessURIs}) {$match = 1; }
178             # we keep URIs without extension, if asked to
179             # otherwise, we check their mime type through the wire
180             elsif ($self->HEAD_check($uri) ) {$match = 1; }
181             if ($match eq 0){
182             foreach my $ext (@authorized_extensions)
183             {
184             if (($ext eq $uri_ext) or ($ext eq "*")) { $match = 1; }
185             }
186             }
187             if ($match)
188             {
189             foreach my $area (@excluded_areas)
190             {
191             if ($uri =~ /$area/)
192             {
193             my $slasharea = $area;
194             $slasharea =~ s/\\\//\//g;
195             $slasharea =~ s/\\././g;
196             print "Ignoring $uri matching $slasharea \n" if ($verbose > 2) ;
197             $match = 0;
198             }
199             }
200             }
201              
202             if (defined $HTTPcodes{$uri})
203             {
204             if (($HTTPcodes{$uri} ne "200") and ($HTTPcodes{$uri} =~ /\d+/))
205             {
206             $match = 0;
207             if ($verbose > 2) {
208             print "$uri returned code $HTTPcodes{$uri}, ignoring \n";
209             }
210             }
211             }
212             push @trimmed_uris,$uri if ($match);
213             }
214             print "trimmed list to ", scalar @trimmed_uris. " URIs\n";
215             return @trimmed_uris;
216             }
217              
218             #########################################
219             # Actual subroutine to check the list of uris #
220             #########################################
221              
222             sub process_list
223             {
224             my $self = shift;
225             print "Now using the HTML Validator module... " if $verbose;
226             print "\n" if ($verbose > 1);
227             my @uris = undef;
228             my %hits;
229             # Opening the file with the hits and URIs data
230             if (defined ($config{tmpfile}))
231             {
232             my $tmp_file = $config{tmpfile};
233             tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) ||
234             die ("Cannot create or open $tmp_file");
235             @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
236             }
237             elsif ($self->uris())
238             {
239             @uris = $self->uris();
240             foreach my $uri (@uris) { $hits{$uri} = 0 }
241             }
242             print "processing ", scalar @uris, " URIs\n" if ($verbose >= 1);
243             print "\n (This may take a long time if you have many files to validate)\n" if ($verbose eq 1);
244             print "\n" if ($verbose > 2); # trying to breathe in the debug volume...
245             use LWP::UserAgent;
246             use URI::Escape;
247             my $max_invalid = undef;
248             if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
249             else {$max_invalid = 0}
250             my $max_documents = undef;
251             if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
252             else {$max_documents = 0}
253             my $name = "";
254             if (exists $config{ServerName}) {$name = $config{ServerName}}
255             @uris = $self->trim_uris(@uris);
256             my @result;
257             my @result_head;
258             my @whatweshow;
259             my $whatweshow_str = "";
260              
261             push @whatweshow, "valid" if ($config{ShowValid} eq "Yes");
262             push @whatweshow, "invalid" if ($config{ShowInvalid} eq "Yes");
263             push @whatweshow, "non-validable" if ($config{ShowAborted} eq "Yes");
264             if (@whatweshow eq 3) {
265             $whatweshow_str = "$whatweshow[0], $whatweshow[1] or $whatweshow[2]";
266             }
267             elsif (@whatweshow eq 2) {
268             $whatweshow_str = "$whatweshow[0] or $whatweshow[1]";
269             }
270             elsif (@whatweshow eq 1) {
271             $whatweshow_str = "$whatweshow[0]";
272             }
273             my $intro="Here are the most popular $whatweshow_str document(s) that I could find in the
274             logs for $name.";
275              
276             my $outro;
277             push @result_head, "Rank";
278             push @result_head, "Hits";
279             push @result_head, "#Error(s)";
280             push @result_head, "Address";
281             my $invalid_census = 0; # number of invalid docs
282             my $last_invalid_position = 0; # latest position at which we found an invalid doc
283             my $total_census = 0; # number of documents checked
284             my $ua = new LWP::UserAgent;
285             # $ua->timeout([30]); # instead of 180. 3 minutes timeout is too long.
286             my $uri = undef;
287             while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
288             {
289             $uri = shift (@uris);
290             $self->new_doc();
291             my $uri_orig = $uri;
292             $total_census++;
293             print " processing #$total_census $uri..." if ($verbose > 1);
294             # escaping URI
295             $uri = uri_escape($uri);
296             # creating the HTTP query string with all parameters
297             my $string=join ("", "http://",$config{ValidatorHost},":",$config{ValidatorPort},
298             $config{ValidatorString},$uri,$config{ValidatorPostString});
299             my $method = $config{ValidatorMethod};
300             my $request = new HTTP::Request("$method", "$string");
301             my $response = new HTTP::Response;
302             $response = $ua->simple_request($request);
303             if ($response->is_success) # not an error, we could contact the server
304             {
305             # set both valid and error number according to response
306             $self->valid($response->header('X-W3C-Validator-Status'));
307             $self->valid_err_num($response->header('X-W3C-Validator-Errors'));
308             # we know the validator has been able to (in)validate if $self->valid is not NULL
309             if ( ($self->valid)) # we got an answer about validation (valid, invalid or abort)
310             {
311             if (
312             (($self->valid =~ /Invalid/i) and ($config{ShowInvalid} eq "Yes"))
313             or (($self->valid =~ /Valid/i) and ($config{ShowValid} eq "Yes"))
314             or (($self->valid =~ /Abort/i) and ($config{ShowAborted} eq "Yes"))
315             ) {
316             my @result_tmp;
317             push @result_tmp, $total_census;
318             push @result_tmp, $hits{$uri_orig};
319             if ($self->valid =~ /Abort/i) {
320             push @result_tmp, "Abort";
321             }
322             else {
323             push @result_tmp, $self->valid_err_num;
324             }
325             push @result_tmp, $uri_orig;
326             push @result, [@result_tmp];
327             $invalid_census++;
328             $last_invalid_position = $total_census;
329             }
330             }
331            
332             printf (" %s!", $self->valid) if ( ($verbose > 1) and (defined ($self->valid)));
333             print " Could not validate (validation failed)!" if (($verbose > 1) and(!defined ($self->valid)));
334              
335             if (($verbose > 1) and ($self->valid_err_num)) # verbose or debug
336             {printf ", %s errors!",$self->valid_err_num}
337             }
338             else {
339             print " Could not validate (no response from validator)!" if ($verbose > 1) ;
340             if ($config{ShowAborted} eq "Yes") {
341             my @result_tmp;
342             push @result_tmp, $total_census;
343             push @result_tmp, $hits{$uri_orig};
344             push @result_tmp, "Abort";
345             push @result_tmp, $uri_orig;
346             push @result, [@result_tmp];
347             $invalid_census++;
348             $last_invalid_position = $total_census;
349             }
350             }
351             print "\n" if ($verbose > 1);
352              
353             $self->valid_head($response->as_string); # for debug
354             if ($verbose > 2) {printf "%s :\n%s", $string, $self->valid_head;} # debug
355             sleep(1); # do not kill validator.w3.org
356              
357             }
358             print "Done!\n" if $verbose;
359             print "invalid_census $invalid_census \n" if ($verbose > 2 );
360             if ($invalid_census) # we found invalid docs
361             {
362             if ($invalid_census eq 1) # let's repect grammar here
363             {
364             $intro=~ s/are/is/;
365             $intro=~ s/ //;
366             $intro=~ s/document\(s\)/document/;
367             }
368             $intro =~s//$invalid_census/;
369             my $ratio = 10000*$invalid_census/$total_census;
370             $ratio = int($ratio)/100;
371             if ($last_invalid_position eq $total_census )
372             # usual case
373             {
374             $outro="Conclusion :
375             I had to check $last_invalid_position document(s) in order to find $invalid_census $whatweshow_str HTML documents.
376             This means that about $ratio\% of your most popular documents were $whatweshow_str.";
377             }
378             else
379             # we didn't find as many invalid docs as requested
380             {
381             if ($max_invalid) {
382             $outro="Conclusion :
383             You asked for $max_invalid $whatweshow_str HTML document but I could only find $invalid_census
384             by processing (all the) $total_census document(s) in your logs.
385             This means that about $ratio\% of your most popular documents were $whatweshow_str.";}
386             else # max_invalid set to 0, user asked for all invalid docs
387             {$outro="Conclusion :
388             I found $invalid_census $whatweshow_str HTML document(s)
389             by processing (all the) $total_census document(s) in your logs.
390             This means that about $ratio\% of your most popular documents were $whatweshow_str.";}
391             }
392             }
393             elsif (!$total_census)
394             {
395             $intro="There was nothing to validate in this log.";
396             $outro="";
397             }
398             else # everything was actually valid!
399             {
400             $intro=~s/ //;
401             $outro="I couldn't find any $whatweshow_str document in this log.";
402             }
403             if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
404             {
405             $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
406             }
407             if (defined ($config{tmpfile}))
408             {
409             untie %hits;
410             }
411             my %returnhash;
412             $returnhash{"name"}="HTMLValidator";
413             $returnhash{"intro"}=$intro;
414             $returnhash{"outro"}=$outro;
415             @{$returnhash{"thead"}}=@result_head;
416             @{$returnhash{"trows"}}=@result;
417             return %returnhash;
418             }
419              
420             package W3C::LogValidator::HTMLValidator;
421              
422             1;
423              
424             __END__