line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Copyright (c) 2002-2005 the World Wide Web Consortium : |
2
|
|
|
|
|
|
|
# Keio University, |
3
|
|
|
|
|
|
|
# European Research Consortium for Informatics and Mathematics |
4
|
|
|
|
|
|
|
# Massachusetts Institute of Technology. |
5
|
|
|
|
|
|
|
# written by Olivier Thereaux for W3C |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# $Id: HTMLValidator.pm,v 1.29 2008/11/18 16:48:56 ot Exp $ |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
package W3C::LogValidator::HTMLValidator; |
10
|
1
|
|
|
1
|
|
907
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
11
|
1
|
|
|
1
|
|
2493
|
use DB_File; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
require Exporter; |
15
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
16
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw() ] ); |
17
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
18
|
|
|
|
|
|
|
our @EXPORT = qw(); |
19
|
|
|
|
|
|
|
our $VERSION = sprintf "%d.%03d",q$Revision: 1.29 $ =~ /(\d+)\.(\d+)/; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
########################### |
24
|
|
|
|
|
|
|
# usual package interface # |
25
|
|
|
|
|
|
|
########################### |
26
|
|
|
|
|
|
|
our $verbose = 1; |
27
|
|
|
|
|
|
|
our %config; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub new |
30
|
|
|
|
|
|
|
{ |
31
|
|
|
|
|
|
|
my $self = {}; |
32
|
|
|
|
|
|
|
my $proto = shift; |
33
|
|
|
|
|
|
|
my $class = ref($proto) || $proto; |
34
|
|
|
|
|
|
|
# mandatory vars for the API |
35
|
|
|
|
|
|
|
$self->{RESULT} = undef; |
36
|
|
|
|
|
|
|
# internal stuff |
37
|
|
|
|
|
|
|
$self->{VALID} = undef; |
38
|
|
|
|
|
|
|
$self->{VALID_ERR_NUM} = undef; |
39
|
|
|
|
|
|
|
$self->{VALID_SUCCESS} = undef; |
40
|
|
|
|
|
|
|
$self->{VALID_HEAD} = undef; |
41
|
|
|
|
|
|
|
# configuration for this module |
42
|
|
|
|
|
|
|
if (@_) {%config = %{(shift)};} |
43
|
|
|
|
|
|
|
$config{ValidatorMethod} = "HEAD" ; |
44
|
|
|
|
|
|
|
$config{ValidatorHost} = "validator.w3.org" if (! exists $config{ValidatorHost}); |
45
|
|
|
|
|
|
|
$config{ValidatorPort} = "80" if (!exists $config{ValidatorPort}); |
46
|
|
|
|
|
|
|
$config{ValidatorString} = "/check\?uri=" if (!exists $config{ValidatorString}); |
47
|
|
|
|
|
|
|
$config{ValidatorPostString} = "\;output=xml" if (!exists $config{ValidatorPostString}); |
48
|
|
|
|
|
|
|
if (exists $config{AuthorizedExtensions}) |
49
|
|
|
|
|
|
|
{ |
50
|
|
|
|
|
|
|
$self->{AUTH_EXT} = $config{AuthorizedExtensions}; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
else |
53
|
|
|
|
|
|
|
{ |
54
|
|
|
|
|
|
|
$self->{AUTH_EXT} = ".html .xhtml .phtml .htm .shtml .php .svg .xml /"; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
$config{ShowInvalid} = "Yes" if (!exists $config{ShowInvalid}); |
57
|
|
|
|
|
|
|
$config{ShowAborted} = "No" if (!exists $config{ShowAborted}); |
58
|
|
|
|
|
|
|
$config{ShowValid} = "No" if (!exists $config{ShowValid}); |
59
|
|
|
|
|
|
|
$config{CheckExtensionlessURIs} = "No" if (!exists $config{CheckExtensionlessURIs}); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
if (exists $config{verbose}) {$verbose = $config{verbose}} |
63
|
|
|
|
|
|
|
@{$self->{URIs}} = undef; |
64
|
|
|
|
|
|
|
bless($self, $class); |
65
|
|
|
|
|
|
|
return $self; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub uris |
69
|
|
|
|
|
|
|
{ |
70
|
|
|
|
|
|
|
my $self = shift; |
71
|
|
|
|
|
|
|
if (@_) { @{$self->{URIs}} = @_ } |
72
|
|
|
|
|
|
|
return @{$self->{URIs}}; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub valid |
77
|
|
|
|
|
|
|
{ |
78
|
|
|
|
|
|
|
my $self = shift; |
79
|
|
|
|
|
|
|
if (@_) { $self->{VALID} = shift } |
80
|
|
|
|
|
|
|
return $self->{VALID}; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub valid_err_num |
84
|
|
|
|
|
|
|
{ |
85
|
|
|
|
|
|
|
my $self = shift; |
86
|
|
|
|
|
|
|
if (@_) { $self->{VALID_ERR_NUM} = shift } |
87
|
|
|
|
|
|
|
return $self->{VALID_ERR_NUM}; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub valid_success |
91
|
|
|
|
|
|
|
{ |
92
|
|
|
|
|
|
|
my $self = shift; |
93
|
|
|
|
|
|
|
if (@_) { $self->{VALID_SUCCESS} = shift } |
94
|
|
|
|
|
|
|
return $self->{VALID_SUCCESS}; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub valid_head |
98
|
|
|
|
|
|
|
{ |
99
|
|
|
|
|
|
|
my $self = shift; |
100
|
|
|
|
|
|
|
if (@_) { $self->{VALID_HEAD} = shift } |
101
|
|
|
|
|
|
|
return $self->{VALID_HEAD}; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub auth_ext |
106
|
|
|
|
|
|
|
{ |
107
|
|
|
|
|
|
|
my $self=shift; |
108
|
|
|
|
|
|
|
if (@_) { $self->{AUTH_EXT} = shift} |
109
|
|
|
|
|
|
|
return $self->{AUTH_EXT}; |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub new_doc{ |
113
|
|
|
|
|
|
|
my $self=shift; |
114
|
|
|
|
|
|
|
$self->{VALID} = undef; |
115
|
|
|
|
|
|
|
$self->{VALID_ERR_NUM} = undef; |
116
|
|
|
|
|
|
|
$self->{VALID_SUCCESS} = undef; |
117
|
|
|
|
|
|
|
$self->{VALID_HEAD} = undef; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub HEAD_check { |
121
|
|
|
|
|
|
|
## Checking whether a document with no extension is actually an HTML/XML document |
122
|
|
|
|
|
|
|
## causes a lot of requests, but internal - should be OK? |
123
|
|
|
|
|
|
|
my $self = shift; |
124
|
|
|
|
|
|
|
my $check_uri; |
125
|
|
|
|
|
|
|
use LWP::UserAgent; |
126
|
|
|
|
|
|
|
if (@_) { $check_uri = shift } |
127
|
|
|
|
|
|
|
my $ua = new LWP::UserAgent; |
128
|
|
|
|
|
|
|
my $method = "HEAD"; |
129
|
|
|
|
|
|
|
my $request = new HTTP::Request("$method", "$check_uri"); |
130
|
|
|
|
|
|
|
my $response = new HTTP::Response; |
131
|
|
|
|
|
|
|
$response = $ua->simple_request($request); |
132
|
|
|
|
|
|
|
my $is_html = 0; |
133
|
|
|
|
|
|
|
if ($response->is_success) # not an error, we could contact the server |
134
|
|
|
|
|
|
|
{ |
135
|
|
|
|
|
|
|
my $type = $response->header('Content-Type'); |
136
|
|
|
|
|
|
|
if ($type =~ /text\/html|application\/xhtml+xml|text\/xml/) #should be enough for a start |
137
|
|
|
|
|
|
|
{ |
138
|
|
|
|
|
|
|
$is_html = 1; |
139
|
|
|
|
|
|
|
# print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
return $is_html; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub trim_uris |
146
|
|
|
|
|
|
|
{ |
147
|
|
|
|
|
|
|
my $self = shift; |
148
|
|
|
|
|
|
|
my @authorized_extensions = split(" ", $self->auth_ext); |
149
|
|
|
|
|
|
|
my @trimmed_uris; |
150
|
|
|
|
|
|
|
my $exclude_regexp = ""; |
151
|
|
|
|
|
|
|
my @excluded_areas; |
152
|
|
|
|
|
|
|
$exclude_regexp = $config{ExcludeAreas}; |
153
|
|
|
|
|
|
|
if ($exclude_regexp){ |
154
|
|
|
|
|
|
|
$exclude_regexp =~ s/\//\\\//g ; |
155
|
|
|
|
|
|
|
@excluded_areas = split(" ", $exclude_regexp); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
else { print "nothing to exclude\n" if ($verbose >2);} |
158
|
|
|
|
|
|
|
my $uri; |
159
|
|
|
|
|
|
|
my %HTTPcodes; |
160
|
|
|
|
|
|
|
if (defined ($config{tmpfile_HTTP_codes})) |
161
|
|
|
|
|
|
|
{ |
162
|
|
|
|
|
|
|
my $tmp_file_HTTP_codes = $config{tmpfile_HTTP_codes}; |
163
|
|
|
|
|
|
|
tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes", O_RDONLY) || |
164
|
|
|
|
|
|
|
die ("Cannot create or open $tmp_file_HTTP_codes"); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
while ($uri = shift) |
169
|
|
|
|
|
|
|
{ |
170
|
|
|
|
|
|
|
my $uri_ext = ""; |
171
|
|
|
|
|
|
|
my $match = 0; |
172
|
|
|
|
|
|
|
if ($uri =~ /(\.[0-9a-zA-Z]+)(\?.*)?$/) |
173
|
|
|
|
|
|
|
{ |
174
|
|
|
|
|
|
|
$uri_ext = $1; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
elsif ($uri =~ /\/$/) { $uri_ext = "/";} |
177
|
|
|
|
|
|
|
elsif (($uri_ext eq "") and $config{CheckExtensionlessURIs}) {$match = 1; } |
178
|
|
|
|
|
|
|
# we keep URIs without extension, if asked to |
179
|
|
|
|
|
|
|
# otherwise, we check their mime type through the wire |
180
|
|
|
|
|
|
|
elsif ($self->HEAD_check($uri) ) {$match = 1; } |
181
|
|
|
|
|
|
|
if ($match eq 0){ |
182
|
|
|
|
|
|
|
foreach my $ext (@authorized_extensions) |
183
|
|
|
|
|
|
|
{ |
184
|
|
|
|
|
|
|
if (($ext eq $uri_ext) or ($ext eq "*")) { $match = 1; } |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
if ($match) |
188
|
|
|
|
|
|
|
{ |
189
|
|
|
|
|
|
|
foreach my $area (@excluded_areas) |
190
|
|
|
|
|
|
|
{ |
191
|
|
|
|
|
|
|
if ($uri =~ /$area/) |
192
|
|
|
|
|
|
|
{ |
193
|
|
|
|
|
|
|
my $slasharea = $area; |
194
|
|
|
|
|
|
|
$slasharea =~ s/\\\//\//g; |
195
|
|
|
|
|
|
|
$slasharea =~ s/\\././g; |
196
|
|
|
|
|
|
|
print "Ignoring $uri matching $slasharea \n" if ($verbose > 2) ; |
197
|
|
|
|
|
|
|
$match = 0; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
if (defined $HTTPcodes{$uri}) |
203
|
|
|
|
|
|
|
{ |
204
|
|
|
|
|
|
|
if (($HTTPcodes{$uri} ne "200") and ($HTTPcodes{$uri} =~ /\d+/)) |
205
|
|
|
|
|
|
|
{ |
206
|
|
|
|
|
|
|
$match = 0; |
207
|
|
|
|
|
|
|
if ($verbose > 2) { |
208
|
|
|
|
|
|
|
print "$uri returned code $HTTPcodes{$uri}, ignoring \n"; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
push @trimmed_uris,$uri if ($match); |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
print "trimmed list to ", scalar @trimmed_uris. " URIs\n"; |
215
|
|
|
|
|
|
|
return @trimmed_uris; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
######################################### |
219
|
|
|
|
|
|
|
# Actual subroutine to check the list of uris # |
220
|
|
|
|
|
|
|
######################################### |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub process_list |
223
|
|
|
|
|
|
|
{ |
224
|
|
|
|
|
|
|
my $self = shift; |
225
|
|
|
|
|
|
|
print "Now using the HTML Validator module... " if $verbose; |
226
|
|
|
|
|
|
|
print "\n" if ($verbose > 1); |
227
|
|
|
|
|
|
|
my @uris = undef; |
228
|
|
|
|
|
|
|
my %hits; |
229
|
|
|
|
|
|
|
# Opening the file with the hits and URIs data |
230
|
|
|
|
|
|
|
if (defined ($config{tmpfile})) |
231
|
|
|
|
|
|
|
{ |
232
|
|
|
|
|
|
|
my $tmp_file = $config{tmpfile}; |
233
|
|
|
|
|
|
|
tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) || |
234
|
|
|
|
|
|
|
die ("Cannot create or open $tmp_file"); |
235
|
|
|
|
|
|
|
@uris = sort { $hits{$b} <=> $hits{$a} } keys %hits; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
elsif ($self->uris()) |
238
|
|
|
|
|
|
|
{ |
239
|
|
|
|
|
|
|
@uris = $self->uris(); |
240
|
|
|
|
|
|
|
foreach my $uri (@uris) { $hits{$uri} = 0 } |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
print "processing ", scalar @uris, " URIs\n" if ($verbose >= 1); |
243
|
|
|
|
|
|
|
print "\n (This may take a long time if you have many files to validate)\n" if ($verbose eq 1); |
244
|
|
|
|
|
|
|
print "\n" if ($verbose > 2); # trying to breathe in the debug volume... |
245
|
|
|
|
|
|
|
use LWP::UserAgent; |
246
|
|
|
|
|
|
|
use URI::Escape; |
247
|
|
|
|
|
|
|
my $max_invalid = undef; |
248
|
|
|
|
|
|
|
if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}} |
249
|
|
|
|
|
|
|
else {$max_invalid = 0} |
250
|
|
|
|
|
|
|
my $max_documents = undef; |
251
|
|
|
|
|
|
|
if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}} |
252
|
|
|
|
|
|
|
else {$max_documents = 0} |
253
|
|
|
|
|
|
|
my $name = ""; |
254
|
|
|
|
|
|
|
if (exists $config{ServerName}) {$name = $config{ServerName}} |
255
|
|
|
|
|
|
|
@uris = $self->trim_uris(@uris); |
256
|
|
|
|
|
|
|
my @result; |
257
|
|
|
|
|
|
|
my @result_head; |
258
|
|
|
|
|
|
|
my @whatweshow; |
259
|
|
|
|
|
|
|
my $whatweshow_str = ""; |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
push @whatweshow, "valid" if ($config{ShowValid} eq "Yes"); |
262
|
|
|
|
|
|
|
push @whatweshow, "invalid" if ($config{ShowInvalid} eq "Yes"); |
263
|
|
|
|
|
|
|
push @whatweshow, "non-validable" if ($config{ShowAborted} eq "Yes"); |
264
|
|
|
|
|
|
|
if (@whatweshow eq 3) { |
265
|
|
|
|
|
|
|
$whatweshow_str = "$whatweshow[0], $whatweshow[1] or $whatweshow[2]"; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
elsif (@whatweshow eq 2) { |
268
|
|
|
|
|
|
|
$whatweshow_str = "$whatweshow[0] or $whatweshow[1]"; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
elsif (@whatweshow eq 1) { |
271
|
|
|
|
|
|
|
$whatweshow_str = "$whatweshow[0]"; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
my $intro="Here are the most popular $whatweshow_str document(s) that I could find in the |
274
|
|
|
|
|
|
|
logs for $name."; |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
my $outro; |
277
|
|
|
|
|
|
|
push @result_head, "Rank"; |
278
|
|
|
|
|
|
|
push @result_head, "Hits"; |
279
|
|
|
|
|
|
|
push @result_head, "#Error(s)"; |
280
|
|
|
|
|
|
|
push @result_head, "Address"; |
281
|
|
|
|
|
|
|
my $invalid_census = 0; # number of invalid docs |
282
|
|
|
|
|
|
|
my $last_invalid_position = 0; # latest position at which we found an invalid doc |
283
|
|
|
|
|
|
|
my $total_census = 0; # number of documents checked |
284
|
|
|
|
|
|
|
my $ua = new LWP::UserAgent; |
285
|
|
|
|
|
|
|
# $ua->timeout([30]); # instead of 180. 3 minutes timeout is too long. |
286
|
|
|
|
|
|
|
my $uri = undef; |
287
|
|
|
|
|
|
|
while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) ) |
288
|
|
|
|
|
|
|
{ |
289
|
|
|
|
|
|
|
$uri = shift (@uris); |
290
|
|
|
|
|
|
|
$self->new_doc(); |
291
|
|
|
|
|
|
|
my $uri_orig = $uri; |
292
|
|
|
|
|
|
|
$total_census++; |
293
|
|
|
|
|
|
|
print " processing #$total_census $uri..." if ($verbose > 1); |
294
|
|
|
|
|
|
|
# escaping URI |
295
|
|
|
|
|
|
|
$uri = uri_escape($uri); |
296
|
|
|
|
|
|
|
# creating the HTTP query string with all parameters |
297
|
|
|
|
|
|
|
my $string=join ("", "http://",$config{ValidatorHost},":",$config{ValidatorPort}, |
298
|
|
|
|
|
|
|
$config{ValidatorString},$uri,$config{ValidatorPostString}); |
299
|
|
|
|
|
|
|
my $method = $config{ValidatorMethod}; |
300
|
|
|
|
|
|
|
my $request = new HTTP::Request("$method", "$string"); |
301
|
|
|
|
|
|
|
my $response = new HTTP::Response; |
302
|
|
|
|
|
|
|
$response = $ua->simple_request($request); |
303
|
|
|
|
|
|
|
if ($response->is_success) # not an error, we could contact the server |
304
|
|
|
|
|
|
|
{ |
305
|
|
|
|
|
|
|
# set both valid and error number according to response |
306
|
|
|
|
|
|
|
$self->valid($response->header('X-W3C-Validator-Status')); |
307
|
|
|
|
|
|
|
$self->valid_err_num($response->header('X-W3C-Validator-Errors')); |
308
|
|
|
|
|
|
|
# we know the validator has been able to (in)validate if $self->valid is not NULL |
309
|
|
|
|
|
|
|
if ( ($self->valid)) # we got an answer about validation (valid, invalid or abort) |
310
|
|
|
|
|
|
|
{ |
311
|
|
|
|
|
|
|
if ( |
312
|
|
|
|
|
|
|
(($self->valid =~ /Invalid/i) and ($config{ShowInvalid} eq "Yes")) |
313
|
|
|
|
|
|
|
or (($self->valid =~ /Valid/i) and ($config{ShowValid} eq "Yes")) |
314
|
|
|
|
|
|
|
or (($self->valid =~ /Abort/i) and ($config{ShowAborted} eq "Yes")) |
315
|
|
|
|
|
|
|
) { |
316
|
|
|
|
|
|
|
my @result_tmp; |
317
|
|
|
|
|
|
|
push @result_tmp, $total_census; |
318
|
|
|
|
|
|
|
push @result_tmp, $hits{$uri_orig}; |
319
|
|
|
|
|
|
|
if ($self->valid =~ /Abort/i) { |
320
|
|
|
|
|
|
|
push @result_tmp, "Abort"; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
else { |
323
|
|
|
|
|
|
|
push @result_tmp, $self->valid_err_num; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
push @result_tmp, $uri_orig; |
326
|
|
|
|
|
|
|
push @result, [@result_tmp]; |
327
|
|
|
|
|
|
|
$invalid_census++; |
328
|
|
|
|
|
|
|
$last_invalid_position = $total_census; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
printf (" %s!", $self->valid) if ( ($verbose > 1) and (defined ($self->valid))); |
333
|
|
|
|
|
|
|
print " Could not validate (validation failed)!" if (($verbose > 1) and(!defined ($self->valid))); |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
if (($verbose > 1) and ($self->valid_err_num)) # verbose or debug |
336
|
|
|
|
|
|
|
{printf ", %s errors!",$self->valid_err_num} |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
else { |
339
|
|
|
|
|
|
|
print " Could not validate (no response from validator)!" if ($verbose > 1) ; |
340
|
|
|
|
|
|
|
if ($config{ShowAborted} eq "Yes") { |
341
|
|
|
|
|
|
|
my @result_tmp; |
342
|
|
|
|
|
|
|
push @result_tmp, $total_census; |
343
|
|
|
|
|
|
|
push @result_tmp, $hits{$uri_orig}; |
344
|
|
|
|
|
|
|
push @result_tmp, "Abort"; |
345
|
|
|
|
|
|
|
push @result_tmp, $uri_orig; |
346
|
|
|
|
|
|
|
push @result, [@result_tmp]; |
347
|
|
|
|
|
|
|
$invalid_census++; |
348
|
|
|
|
|
|
|
$last_invalid_position = $total_census; |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
print "\n" if ($verbose > 1); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
$self->valid_head($response->as_string); # for debug |
354
|
|
|
|
|
|
|
if ($verbose > 2) {printf "%s :\n%s", $string, $self->valid_head;} # debug |
355
|
|
|
|
|
|
|
sleep(1); # do not kill validator.w3.org |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
print "Done!\n" if $verbose; |
359
|
|
|
|
|
|
|
print "invalid_census $invalid_census \n" if ($verbose > 2 ); |
360
|
|
|
|
|
|
|
if ($invalid_census) # we found invalid docs |
361
|
|
|
|
|
|
|
{ |
362
|
|
|
|
|
|
|
if ($invalid_census eq 1) # let's repect grammar here |
363
|
|
|
|
|
|
|
{ |
364
|
|
|
|
|
|
|
$intro=~ s/are/is/; |
365
|
|
|
|
|
|
|
$intro=~ s/ //; |
366
|
|
|
|
|
|
|
$intro=~ s/document\(s\)/document/; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
$intro =~s//$invalid_census/; |
369
|
|
|
|
|
|
|
my $ratio = 10000*$invalid_census/$total_census; |
370
|
|
|
|
|
|
|
$ratio = int($ratio)/100; |
371
|
|
|
|
|
|
|
if ($last_invalid_position eq $total_census ) |
372
|
|
|
|
|
|
|
# usual case |
373
|
|
|
|
|
|
|
{ |
374
|
|
|
|
|
|
|
$outro="Conclusion : |
375
|
|
|
|
|
|
|
I had to check $last_invalid_position document(s) in order to find $invalid_census $whatweshow_str HTML documents. |
376
|
|
|
|
|
|
|
This means that about $ratio\% of your most popular documents were $whatweshow_str."; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
else |
379
|
|
|
|
|
|
|
# we didn't find as many invalid docs as requested |
380
|
|
|
|
|
|
|
{ |
381
|
|
|
|
|
|
|
if ($max_invalid) { |
382
|
|
|
|
|
|
|
$outro="Conclusion : |
383
|
|
|
|
|
|
|
You asked for $max_invalid $whatweshow_str HTML document but I could only find $invalid_census |
384
|
|
|
|
|
|
|
by processing (all the) $total_census document(s) in your logs. |
385
|
|
|
|
|
|
|
This means that about $ratio\% of your most popular documents were $whatweshow_str.";} |
386
|
|
|
|
|
|
|
else # max_invalid set to 0, user asked for all invalid docs |
387
|
|
|
|
|
|
|
{$outro="Conclusion : |
388
|
|
|
|
|
|
|
I found $invalid_census $whatweshow_str HTML document(s) |
389
|
|
|
|
|
|
|
by processing (all the) $total_census document(s) in your logs. |
390
|
|
|
|
|
|
|
This means that about $ratio\% of your most popular documents were $whatweshow_str.";} |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
elsif (!$total_census) |
394
|
|
|
|
|
|
|
{ |
395
|
|
|
|
|
|
|
$intro="There was nothing to validate in this log."; |
396
|
|
|
|
|
|
|
$outro=""; |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
else # everything was actually valid! |
399
|
|
|
|
|
|
|
{ |
400
|
|
|
|
|
|
|
$intro=~s/ //; |
401
|
|
|
|
|
|
|
$outro="I couldn't find any $whatweshow_str document in this log."; |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents |
404
|
|
|
|
|
|
|
{ |
405
|
|
|
|
|
|
|
$outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?"; |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
if (defined ($config{tmpfile})) |
408
|
|
|
|
|
|
|
{ |
409
|
|
|
|
|
|
|
untie %hits; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
my %returnhash; |
412
|
|
|
|
|
|
|
$returnhash{"name"}="HTMLValidator"; |
413
|
|
|
|
|
|
|
$returnhash{"intro"}=$intro; |
414
|
|
|
|
|
|
|
$returnhash{"outro"}=$outro; |
415
|
|
|
|
|
|
|
@{$returnhash{"thead"}}=@result_head; |
416
|
|
|
|
|
|
|
@{$returnhash{"trows"}}=@result; |
417
|
|
|
|
|
|
|
return %returnhash; |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
package W3C::LogValidator::HTMLValidator; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
1; |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
__END__ |