line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Link::Tester::Complex; |
2
|
|
|
|
|
|
|
$REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ ); |
3
|
|
|
|
|
|
|
|
4
|
3
|
|
|
3
|
|
18
|
use Carp qw(carp cluck croak); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
378
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
WWW::Link::Tester::Complex - a careful tester for broken links |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use WWW::Link::Test::Complex |
13
|
|
|
|
|
|
|
$ua=create_a_user_agent(); |
14
|
|
|
|
|
|
|
$link=get_a_link_object(); |
15
|
|
|
|
|
|
|
WWW::Link::Test::Complex::test_link($ua, $link); |
16
|
|
|
|
|
|
|
WWW::Link::Tester::Simple::Test($url) |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
This is a link testing module based on the work of Phil Mitchell at |
22
|
|
|
|
|
|
|
Harvard College. The aim is to test very carefully if a link is |
23
|
|
|
|
|
|
|
really there. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
N.B. I have done the minimum reasonable edits on the file so that any |
26
|
|
|
|
|
|
|
later improvements can be easily added. This means that the module |
27
|
|
|
|
|
|
|
contains and sections of code which are not relevant to |
28
|
|
|
|
|
|
|
LinkController. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 ROBOT LOGIC |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
This system should be controlled by the robot logic of the user agent it |
33
|
|
|
|
|
|
|
uses provided that the robot returns a 4xx response code. |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head1 AUTHOR |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
Copyright (c) 2000 by the President and Fellows of Harvard College |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
40
|
|
|
|
|
|
|
it under the terms of the GNU General Public License as published by |
41
|
|
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or (at |
42
|
|
|
|
|
|
|
your option) any later version. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Please see the source code for further details |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
############################################################################ |
49
|
|
|
|
|
|
|
# |
50
|
|
|
|
|
|
|
# Copyright (c) 2000 by the President and Fellows of Harvard College |
51
|
|
|
|
|
|
|
# |
52
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
53
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
54
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or (at |
55
|
|
|
|
|
|
|
# your option) any later version. |
56
|
|
|
|
|
|
|
# |
57
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, but |
58
|
|
|
|
|
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of |
59
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
60
|
|
|
|
|
|
|
# General Public License for more details. |
61
|
|
|
|
|
|
|
# |
62
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
63
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
64
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
65
|
|
|
|
|
|
|
# USA. |
66
|
|
|
|
|
|
|
# |
67
|
|
|
|
|
|
|
# Contact information: |
68
|
|
|
|
|
|
|
# |
69
|
|
|
|
|
|
|
# Phil Mitchell |
70
|
|
|
|
|
|
|
# Office for Information Systems |
71
|
|
|
|
|
|
|
# Harvard University |
72
|
|
|
|
|
|
|
# philip_mitchell at harvard.edu |
73
|
|
|
|
|
|
|
# |
74
|
|
|
|
|
|
|
############################################################################# |
75
|
|
|
|
|
|
|
# |
76
|
|
|
|
|
|
|
# When called without args, this script reads a list of URLs, one per line, |
77
|
|
|
|
|
|
|
# from $INPUT_FILE, extracts the url from each record, and tries to access |
78
|
|
|
|
|
|
|
# the url using the appropriate protocol. This includes following redirects |
79
|
|
|
|
|
|
|
# until either: |
80
|
|
|
|
|
|
|
# 1. the target page is successfully received; or |
81
|
|
|
|
|
|
|
# 2. a page cycle is detected; or |
82
|
|
|
|
|
|
|
# 3. a bad server or page request is detected; or |
83
|
|
|
|
|
|
|
# 4. a maximum number of redirects ($MAX_REDIRECTS) is exceeded. |
84
|
|
|
|
|
|
|
# |
85
|
|
|
|
|
|
|
#...deleted... |
86
|
|
|
|
|
|
|
# |
87
|
|
|
|
|
|
|
# Protocols supported: http, https, ftp, gopher, file, telnet. |
88
|
|
|
|
|
|
|
# |
89
|
|
|
|
|
|
|
# Status codes: |
90
|
|
|
|
|
|
|
# Success: All successful response codes have the form: |
91
|
|
|
|
|
|
|
# 2xx. Because we limit the size of responses we accept, we get a |
92
|
|
|
|
|
|
|
# lot of 206's in addition to 200's. |
93
|
|
|
|
|
|
|
# UNSUPPORTED_PROTOCOL: |
94
|
|
|
|
|
|
|
# Linkcheck handles {http, https, ftp, gopher, file, telnet}. Other |
95
|
|
|
|
|
|
|
# protocols will get this error. More commonly, it is the result of a |
96
|
|
|
|
|
|
|
# typo (eg. "thttp://"). |
97
|
|
|
|
|
|
|
# MALFORMED_URL: The url is syntactically incorrect. EG., |
98
|
|
|
|
|
|
|
# "http:/www.domain.com". |
99
|
|
|
|
|
|
|
# TELNET_FAILURE: Couldn't open the requested telnet connection. |
100
|
|
|
|
|
|
|
# HTTP_0_9_FAIL: Failed HTTP/0.9 connection (0.9 does not return |
101
|
|
|
|
|
|
|
# status codes). |
102
|
|
|
|
|
|
|
# REDIRECT_LIMIT_EXCEEDED: |
103
|
|
|
|
|
|
|
# Too many redirections. This error code should not normally be |
104
|
|
|
|
|
|
|
# received, it is in place to catch infinite redirect cycles. |
105
|
|
|
|
|
|
|
# UNKNOWN_ERROR: Rarely, LWP or HTTP modules will die, reporting an |
106
|
|
|
|
|
|
|
# error that is not useful to us. This error code should |
107
|
|
|
|
|
|
|
# not normally be received; it |
108
|
|
|
|
|
|
|
# will generally be corrected in subsequent passes. |
109
|
|
|
|
|
|
|
# |
110
|
|
|
|
|
|
|
# There are various configurable parameters documented below. In |
111
|
|
|
|
|
|
|
# addition to setting the input and output filenames, the most |
112
|
|
|
|
|
|
|
# important ones are those that control the timeout, the number of |
113
|
|
|
|
|
|
|
# retries, and the time between retries. These settings have an |
114
|
|
|
|
|
|
|
# important effect on the accuracy of results. |
115
|
|
|
|
|
|
|
# |
116
|
|
|
|
|
|
|
# Accuracy of results: |
117
|
|
|
|
|
|
|
# |
118
|
|
|
|
|
|
|
# Informal tests (results can be found at the end of this script) have |
119
|
|
|
|
|
|
|
# shown that: (1) a timeout of 30 sec is adequate; increasing to 60 |
120
|
|
|
|
|
|
|
# sec is not useful; 10 seconds is too short. (2) The absolute number |
121
|
|
|
|
|
|
|
# of recheck passes is less important than spreading them over |
122
|
|
|
|
|
|
|
# time. Reasonable results are obtained with 3 recheck passes, each |
123
|
|
|
|
|
|
|
# separated by 8 hours of sleep. |
124
|
|
|
|
|
|
|
# |
125
|
|
|
|
|
|
|
# In our set of about 10,000 urls, a first pass produces about 800 |
126
|
|
|
|
|
|
|
# (8%) bad urls. Subsequent passes will reduce that to about 650 |
127
|
|
|
|
|
|
|
# (6.5%). The use of telnet retry will reach another 25% of those |
128
|
|
|
|
|
|
|
# apparently bad urls. The estimate of total bad urls in our sample is |
129
|
|
|
|
|
|
|
# thus 4.5%. That list of bad urls is consistent across distinct runs |
130
|
|
|
|
|
|
|
# of the link checker at greater than 99%. Handchecking of a large |
131
|
|
|
|
|
|
|
# sample from this final list indicates a high degree of accuracy. |
132
|
|
|
|
|
|
|
# |
133
|
|
|
|
|
|
|
# Notes: |
134
|
|
|
|
|
|
|
# |
135
|
|
|
|
|
|
|
# - A "page cycle" is the use of a redirect or refresh tag to cycle through |
136
|
|
|
|
|
|
|
# a list of one or more pages for data refresh purposes. |
137
|
|
|
|
|
|
|
# |
138
|
|
|
|
|
|
|
# Design Notes: |
139
|
|
|
|
|
|
|
# |
140
|
|
|
|
|
|
|
# - Cookies: This version accepts all cookies. This allows it to handle some |
141
|
|
|
|
|
|
|
# URLs which require cookies. |
142
|
|
|
|
|
|
|
# |
143
|
|
|
|
|
|
|
# - Timeout bug: Due to an apparent bug in the interaction between |
144
|
|
|
|
|
|
|
# Solaris and certain web servers, some http responses come back |
145
|
|
|
|
|
|
|
# improperly terminated. As a result, LWP times out and reports a |
146
|
|
|
|
|
|
|
# server error when a (nearly) valid response has been received. To |
147
|
|
|
|
|
|
|
# avoid this, we open a telnet connection to the relevant port |
148
|
|
|
|
|
|
|
# (usually 80) and do a manual GET on the url. Telnet will also time |
149
|
|
|
|
|
|
|
# out in this case, but telnet.pm provides a dump of the partial |
150
|
|
|
|
|
|
|
# response received, and we use this. |
151
|
|
|
|
|
|
|
# |
152
|
|
|
|
|
|
|
# - WWW unreliability: Any given access to a server on the web is |
153
|
|
|
|
|
|
|
# subject to various kinds of flakiness. To avoid false reports of |
154
|
|
|
|
|
|
|
# bad servers, it is essential to re-test all errors, preferably over |
155
|
|
|
|
|
|
|
# a period of hours or days. This script completes a first pass |
156
|
|
|
|
|
|
|
# through all urls, typically taking 8 hours or more on 10,000 |
157
|
|
|
|
|
|
|
# urls. Then it performs additional ($RECHECKS) passes on all urls |
158
|
|
|
|
|
|
|
# that received error codes. It sleeps ($HOURS_TO_SLEEP) between |
159
|
|
|
|
|
|
|
# passes to improve the chances of getting a valid return code. |
160
|
|
|
|
|
|
|
# |
161
|
|
|
|
|
|
|
# - Redirects and cycles: The challenge is to follow redirects all |
162
|
|
|
|
|
|
|
# the way to the end of the line, but know when to stop. It is |
163
|
|
|
|
|
|
|
# complicated by the fact that some sites use the meta refresh tag |
164
|
|
|
|
|
|
|
# for their redirection, and by the fact that some sites have |
165
|
|
|
|
|
|
|
# infinite loop cycles for page refresh purposes. Five distinct cases |
166
|
|
|
|
|
|
|
# have been identified: |
167
|
|
|
|
|
|
|
# |
168
|
|
|
|
|
|
|
# 1. Proper redirect, using Location header. (Action: Follow redirect.) |
169
|
|
|
|
|
|
|
# 2. Proper meta refresh, on a single page. (Action: Detect cycle |
170
|
|
|
|
|
|
|
# and exit.) |
171
|
|
|
|
|
|
|
# 3. Proper meta refresh, on a cycle of pages. (Action:Detect |
172
|
|
|
|
|
|
|
# cycle and exit.) |
173
|
|
|
|
|
|
|
# 4. Redirect using meta refresh. (Action: Follow redirect.) |
174
|
|
|
|
|
|
|
# 5. Redirect loop on a single page for setting cookies. (Action: |
175
|
|
|
|
|
|
|
# Follow redirect.) |
176
|
|
|
|
|
|
|
# |
177
|
|
|
|
|
|
|
# Maintenance and Future Development Notes: |
178
|
|
|
|
|
|
|
# |
179
|
|
|
|
|
|
|
# - 401's and 403's: Currently does not handle authentication; just |
180
|
|
|
|
|
|
|
# reports these as errors. |
181
|
|
|
|
|
|
|
# |
182
|
|
|
|
|
|
|
# - Cookie warnings: With perl's -w option, many warnings will be |
183
|
|
|
|
|
|
|
# received about Cookies.pm. This seems to be due to the fact that |
184
|
|
|
|
|
|
|
# Cookies.pm does not cleanly handle incorrectly formatted |
185
|
|
|
|
|
|
|
# cookies. As far as I know, these warnings may be safely ignored. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Author: Phil Mitchell |
188
|
|
|
|
|
|
|
# Date: 02/22/01 |
189
|
|
|
|
|
|
|
# Version: 1.5 |
190
|
|
|
|
|
|
|
# |
191
|
|
|
|
|
|
|
############################################################################# |
192
|
|
|
|
|
|
|
|
193
|
3
|
|
|
3
|
|
16
|
use WWW::Link::Tester; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
230
|
|
194
|
|
|
|
|
|
|
@ISA="WWW::Link::Tester"; |
195
|
|
|
|
|
|
|
|
196
|
3
|
|
|
3
|
|
15
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
96
|
|
197
|
3
|
|
|
3
|
|
3629
|
use LWP::UserAgent; |
|
3
|
|
|
|
|
59750
|
|
|
3
|
|
|
|
|
113
|
|
198
|
3
|
|
|
3
|
|
33
|
use HTTP::Response; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
76
|
|
199
|
3
|
|
|
3
|
|
16
|
use HTTP::Message; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
78
|
|
200
|
3
|
|
|
3
|
|
16
|
use HTTP::Status; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
1317
|
|
201
|
3
|
|
|
3
|
|
30
|
use HTTP::Headers; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
141
|
|
202
|
3
|
|
|
3
|
|
18
|
use HTTP::Request; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
76
|
|
203
|
3
|
|
|
3
|
|
4036
|
use HTTP::Cookies; |
|
3
|
|
|
|
|
27558
|
|
|
3
|
|
|
|
|
104
|
|
204
|
3
|
|
|
3
|
|
5056
|
use Net::Telnet; |
|
3
|
|
|
|
|
176731
|
|
|
3
|
|
|
|
|
304
|
|
205
|
|
|
|
|
|
|
#use LWP::Debug qw(+); |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
########################################### |
208
|
|
|
|
|
|
|
# Global variables |
209
|
|
|
|
|
|
|
########################################### |
210
|
|
|
|
|
|
|
|
211
|
3
|
|
|
|
|
1608
|
use vars qw( |
212
|
|
|
|
|
|
|
%url_hash |
213
|
|
|
|
|
|
|
$HTTP_DEFAULT_PORT |
214
|
|
|
|
|
|
|
$HTTP_VERSION |
215
|
|
|
|
|
|
|
$ADMIN_EMAIL |
216
|
|
|
|
|
|
|
$MAX_REDIRECTS |
217
|
|
|
|
|
|
|
$RECHECKS |
218
|
|
|
|
|
|
|
$HOURS_TO_SLEEP |
219
|
|
|
|
|
|
|
$AGENT_TIMEOUT |
220
|
|
|
|
|
|
|
$AGENT_MAX_RESPONSE |
221
|
|
|
|
|
|
|
$INPUT_FILE |
222
|
|
|
|
|
|
|
$OUTPUT_FILE |
223
|
|
|
|
|
|
|
$TMP_FILE |
224
|
|
|
|
|
|
|
$TELNET_LOGFILE |
225
|
|
|
|
|
|
|
$ADMIN_LOGFILE |
226
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED |
227
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL |
228
|
|
|
|
|
|
|
$MALFORMED_URL |
229
|
|
|
|
|
|
|
$HTTP_0_9_OKAY |
230
|
|
|
|
|
|
|
$HTTP_0_9_FAIL |
231
|
|
|
|
|
|
|
$UNKNOWN_ERROR |
232
|
|
|
|
|
|
|
$VERBOSE |
233
|
|
|
|
|
|
|
$DEBUG |
234
|
|
|
|
|
|
|
$LOGGING |
235
|
|
|
|
|
|
|
$TELNET_SUCCESS |
236
|
|
|
|
|
|
|
$TELNET_FAILURE |
237
|
|
|
|
|
|
|
$agent |
238
|
|
|
|
|
|
|
$telnetAgent |
239
|
|
|
|
|
|
|
$cookieJar |
240
|
|
|
|
|
|
|
$redirectCount |
241
|
3
|
|
|
3
|
|
28
|
); |
|
3
|
|
|
|
|
9
|
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
########################################### |
246
|
|
|
|
|
|
|
# Configurable parameters |
247
|
|
|
|
|
|
|
########################################### |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
$ADMIN_EMAIL = ''; # If non-empty, script will send confirmation and result stats. |
250
|
|
|
|
|
|
|
$AGENT_TIMEOUT = 10; # In seconds, time for http agent to wait. 10 secs is often too |
251
|
|
|
|
|
|
|
# short, leads to spurious reports of server errors. Longer than |
252
|
|
|
|
|
|
|
# 30 secs not usually helpful. |
253
|
|
|
|
|
|
|
$AGENT_MAX_RESPONSE = 524288; # In bytes, max response to accept. Mainly want to |
254
|
|
|
|
|
|
|
# avoid being swamped by something huge. |
255
|
|
|
|
|
|
|
$MAX_REDIRECTS = 15; # Number of redirects to tolerate before giving up. Should never hit |
256
|
|
|
|
|
|
|
# this limit; it's here to avoid infinite loop. |
257
|
|
|
|
|
|
|
$RECHECKS = 3; # Number of recheck passes to recheck urls that return error codes. Note |
258
|
|
|
|
|
|
|
# that every server error automatically gets one retry via telnet. |
259
|
|
|
|
|
|
|
$HOURS_TO_SLEEP = 0; # Number of hours to sleep between recheck passes. |
260
|
|
|
|
|
|
|
$HTTP_DEFAULT_PORT = 80; |
261
|
|
|
|
|
|
|
$HTTP_VERSION = 'HTTP/1.0'; # Perl's HTTP module defaults to 0.9 |
262
|
|
|
|
|
|
|
$INPUT_FILE = "CURRENT.URLS.TXT"; |
263
|
|
|
|
|
|
|
$INPUT_FILE = "smalltest.txt"; |
264
|
|
|
|
|
|
|
$OUTPUT_FILE = "OUT.URLS.TXT"; |
265
|
|
|
|
|
|
|
$ADMIN_LOGFILE = "admin_logfile.txt"; # Log for result stats. |
266
|
|
|
|
|
|
|
$VERBOSE = 1; # If 1, print processing status to stdout |
267
|
|
|
|
|
|
|
$DEBUG = 0; # If 1, provides additional output to stdout; mainly HTTP headers. |
268
|
|
|
|
|
|
|
$LOGGING = 1; # Enable logging to $ADMIN_LOGFILE. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
########################################### |
271
|
|
|
|
|
|
|
# Misc. initializations |
272
|
|
|
|
|
|
|
########################################### |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
$TMP_FILE = "tmp.txt"; |
275
|
|
|
|
|
|
|
$TELNET_LOGFILE = "telnet_logfile.txt"; # Used internally to buffer data. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# Response codes. All successful response codes have the form: 2xx. |
278
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED = 'REDIRECT_LIMIT_EXCEEDED'; |
279
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL = 'UNSUPPORTED_PROTOCOL'; |
280
|
|
|
|
|
|
|
$MALFORMED_URL = 'MALFORMED_URL'; |
281
|
|
|
|
|
|
|
$TELNET_FAILURE = 'TELNET_FAILURE'; |
282
|
|
|
|
|
|
|
$HTTP_0_9_FAIL = 'HTTP_0_9_FAIL'; |
283
|
|
|
|
|
|
|
$UNKNOWN_ERROR = 'UNKNOWN_ERROR'; |
284
|
|
|
|
|
|
|
$TELNET_SUCCESS = 299; # Mimic a successful HTTP code |
285
|
|
|
|
|
|
|
$HTTP_0_9_OKAY = 298; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=head1 test_link |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
This function acts as glue between follow_url and LinkController. It |
290
|
|
|
|
|
|
|
returns a constructed HTTP::Response. This will mean that information |
291
|
|
|
|
|
|
|
is lost since we actually often have created the code from another |
292
|
|
|
|
|
|
|
response. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=cut |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub new { |
297
|
4
|
|
|
4
|
0
|
84
|
my $proto = shift; |
298
|
4
|
|
33
|
|
|
28
|
my $class = ref($proto) || $proto; |
299
|
4
|
|
|
|
|
9
|
my $self = {}; |
300
|
4
|
|
|
|
|
14
|
$self->{"user_agent"}=shift; |
301
|
4
|
|
|
|
|
21
|
bless $self, $class; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
3
|
|
|
3
|
|
19
|
use vars qw($redirect_count $redirects %convert); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
9772
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
%convert=( |
307
|
|
|
|
|
|
|
$REDIRECT_LIMIT_EXCEEDED => RC_REDIRECT_LIMIT_EXCEEDED, |
308
|
|
|
|
|
|
|
$UNSUPPORTED_PROTOCOL => RC_PROTOCOL_UNSUPPORTED, |
309
|
|
|
|
|
|
|
$MALFORMED_URL => RC_PROTOCOL_UNSUPPORTED, |
310
|
|
|
|
|
|
|
$TELNET_FAILURE => RC_NOT_FOUND, |
311
|
|
|
|
|
|
|
$HTTP_0_9_FAIL => RC_INTERNAL_SERVER_ERROR, |
312
|
|
|
|
|
|
|
$UNKNOWN_ERROR => RC_BAD_REQUEST, |
313
|
|
|
|
|
|
|
); |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
sub get_response { |
317
|
14
|
|
|
14
|
0
|
20
|
my $self=shift; |
318
|
14
|
|
|
|
|
18
|
my $link=shift; |
319
|
14
|
|
|
|
|
22
|
$redirects=[]; |
320
|
14
|
|
|
|
|
21
|
$redirect_count=0; |
321
|
14
|
|
|
|
|
40
|
%url_hash=(); |
322
|
14
|
|
|
|
|
38
|
my $code=$self->follow_url($link->url()); |
323
|
14
|
|
|
|
|
25
|
scalar (keys %convert); |
324
|
14
|
|
|
|
|
51
|
CONVERT: while (my ($key,$value) = each %convert) { |
325
|
81
|
100
|
|
|
|
345
|
$code eq $key && do { |
326
|
4
|
|
|
|
|
6
|
$code=$value; |
327
|
4
|
|
|
|
|
9
|
last CONVERT; |
328
|
|
|
|
|
|
|
}; |
329
|
|
|
|
|
|
|
} |
330
|
14
|
50
|
|
|
|
217
|
print STDERR "COMPLEX generated response code $code\n" |
331
|
|
|
|
|
|
|
if $self->{verbose}; |
332
|
|
|
|
|
|
|
#cluck and die here generate coredumps!!!???! in perl 5.6.0 on Linux |
333
|
|
|
|
|
|
|
# cluck STDERR "COMPLEX generated response code $code"; |
334
|
14
|
50
|
|
|
|
57
|
die "non numeric response code generated" . $code |
335
|
|
|
|
|
|
|
unless $code =~ m/[1-9][0-9]+/; |
336
|
14
|
|
|
|
|
52
|
my $response=HTTP::Response->new($code); |
337
|
|
|
|
|
|
|
|
338
|
14
|
50
|
|
|
|
573
|
die "response: $response not reference" unless ref $response ; |
339
|
|
|
|
|
|
|
|
340
|
14
|
|
|
|
|
51
|
return $response, @$redirects |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Set up the web agents and helpers. |
344
|
|
|
|
|
|
|
# $agent = new LWP::UserAgent; |
345
|
|
|
|
|
|
|
# $agent->timeout($AGENT_TIMEOUT); |
346
|
|
|
|
|
|
|
# $agent->max_size($AGENT_MAX_RESPONSE); |
347
|
|
|
|
|
|
|
$cookieJar = new HTTP::Cookies; |
348
|
|
|
|
|
|
|
$telnetAgent = new Net::Telnet(Timeout => $AGENT_TIMEOUT, |
349
|
|
|
|
|
|
|
Errmode => 'return'); |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
my ($url, $result, $newResult, %results, $outputStr, $urlCount, |
352
|
|
|
|
|
|
|
$count, $recheckCount, %resultSummary); |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
########################################### |
355
|
|
|
|
|
|
|
# check_for_meta_refresh |
356
|
|
|
|
|
|
|
########################################### |
357
|
|
|
|
|
|
|
# Routine that searches input string for something of the form: |
358
|
|
|
|
|
|
|
# |
359
|
|
|
|
|
|
|
# It is tolerant of extra whitespace, single or no quotes instead of |
360
|
|
|
|
|
|
|
# doublequotes, spaces around equals signs, and extra verbiage, and is |
361
|
|
|
|
|
|
|
# case-insensitive. |
362
|
|
|
|
|
|
|
# Call with: String of content to be searched |
363
|
|
|
|
|
|
|
# Returns: url, if a meta refresh is found; otherwise returns |
364
|
|
|
|
|
|
|
# empty string. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
sub check_for_meta_refresh { |
367
|
8
|
50
|
|
8
|
0
|
85
|
if ($DEBUG) { print "check_for_meta_refresh()...\n"; } |
|
0
|
|
|
|
|
0
|
|
368
|
8
|
|
|
|
|
8
|
my $inputStr = shift; |
369
|
8
|
50
|
|
|
|
17
|
if ($inputStr =~ |
370
|
|
|
|
|
|
|
m{ #" |
371
|
|
|
|
|
|
|
]+? url |
372
|
|
|
|
|
|
|
\s* = \s* ["']? ([^"' >]+) ["']? [^>]+? > |
373
|
|
|
|
|
|
|
}ix) |
374
|
|
|
|
|
|
|
{ |
375
|
0
|
|
|
|
|
0
|
return $1; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
else { |
378
|
8
|
|
|
|
|
16
|
return ""; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
}#end check_for_meta_refresh |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
########################################### |
383
|
|
|
|
|
|
|
# follow_url |
384
|
|
|
|
|
|
|
########################################### |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# Tries to access a given url. The main case is HTTP protocol, but |
387
|
|
|
|
|
|
|
# also handles any protocol handled by LWP, plus telnet. For telnet, |
388
|
|
|
|
|
|
|
# just tries to open a connection. For HTTP, follows redirects until |
389
|
|
|
|
|
|
|
# a final status code is received or until $MAX_REDIRECTS is |
390
|
|
|
|
|
|
|
# exceeded. Accepts all cookies. To avoid infinite loops, detects page |
391
|
|
|
|
|
|
|
# refresh cycles. |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# Call with: url, and optional second arg of referring url which is |
394
|
|
|
|
|
|
|
# used to absolutize url. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# Returns: HTTP status code, or internal response codes (see above). |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
sub follow_url { |
399
|
30
|
|
|
30
|
0
|
39
|
my $self=shift; |
400
|
30
|
|
|
|
|
65
|
my $agent=$self->{"user_agent"}; |
401
|
30
|
|
|
|
|
46
|
my ($url, $referrer) = @_; |
402
|
30
|
|
|
|
|
38
|
my $VERBOSE=$self->{"verbose"}; |
403
|
|
|
|
|
|
|
|
404
|
30
|
50
|
|
|
|
90
|
return $MALFORMED_URL unless $url; |
405
|
30
|
|
|
|
|
36
|
my ($response, $protocol, $host, $port, $ping, $telnetResult, |
406
|
|
|
|
|
|
|
$request, $statusCode, $new_url); |
407
|
30
|
50
|
33
|
|
|
105
|
if ($VERBOSE || $DEBUG) { print "follow_url(): $url\n"; } |
|
0
|
|
|
|
|
0
|
|
408
|
30
|
|
|
|
|
93
|
$url_hash{$url} = 1; # Track all urls in each run, to detect cycles. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# Note: It is crucial to hash this url BEFORE absolutizing it, b/c |
411
|
|
|
|
|
|
|
# we will test for cycles before absolutizing. |
412
|
|
|
|
|
|
|
|
413
|
30
|
100
|
|
|
|
53
|
if ($referrer) { $url = make_url_absolute($url, $referrer); } |
|
16
|
|
|
|
|
34
|
|
414
|
30
|
100
|
|
|
|
74
|
if (keys(%url_hash) > $MAX_REDIRECTS) { |
415
|
1
|
50
|
|
|
|
7
|
if ($VERBOSE) { print "Redirect limit exceeded.\n"; } |
|
0
|
|
|
|
|
0
|
|
416
|
1
|
|
|
|
|
118
|
return $REDIRECT_LIMIT_EXCEEDED; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# EXTRACT PROTOCOL, HOST, AND (OPTIONAL) PORT. |
420
|
29
|
|
|
|
|
184
|
$url =~ m{ ^\s* ([a-z]+) :// ([^/:]+) }ix; |
421
|
29
|
100
|
66
|
|
|
142
|
if (!($1 && $2)) { |
422
|
3
|
50
|
|
|
|
8
|
if ($VERBOSE) { print "URL not well-formed.\n"; } |
|
0
|
|
|
|
|
0
|
|
423
|
3
|
|
|
|
|
9
|
return $MALFORMED_URL; |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
else { |
426
|
26
|
|
|
|
|
46
|
$protocol = $1; |
427
|
26
|
|
|
|
|
45
|
$host = $2; |
428
|
|
|
|
|
|
|
} |
429
|
26
|
|
|
|
|
125
|
$url =~ m{ \w+ :// [^/]+ : (\d+) }x; # Extract port |
430
|
26
|
50
|
|
|
|
58
|
if ($1) { $port = $1; } |
|
26
|
|
|
|
|
51
|
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
# HANDLE TELNET REQUESTS -- just see if we can open the connection. |
433
|
26
|
50
|
|
|
|
50
|
if ($protocol =~ /^telnet$/i) { |
434
|
0
|
0
|
|
|
|
0
|
if ($port) { |
435
|
0
|
|
|
|
|
0
|
$ping = $telnetAgent->open(Host => $host, |
436
|
|
|
|
|
|
|
Port => $port); |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
else { |
439
|
0
|
|
|
|
|
0
|
$ping = $telnetAgent->open(Host => $host); |
440
|
|
|
|
|
|
|
} |
441
|
0
|
0
|
|
|
|
0
|
if (!$ping) { return $TELNET_FAILURE; } |
|
0
|
|
|
|
|
0
|
|
442
|
0
|
|
|
|
|
0
|
else { return $TELNET_SUCCESS; } |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# HANDLE ALL OTHER REQUESTS (HTTP, HTTPS, FTP, GOPHER, FILE) |
446
|
26
|
50
|
|
|
|
73
|
if (!$agent->is_protocol_supported($protocol)) { |
447
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Protocol not supported.\n"; } |
|
0
|
|
|
|
|
0
|
|
448
|
0
|
|
|
|
|
0
|
return $UNSUPPORTED_PROTOCOL; |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
# Use eval to avoid aborting if LWP or HTTP sends "die". |
451
|
26
|
|
|
|
|
237
|
eval { |
452
|
26
|
|
|
|
|
88
|
$request = HTTP::Request->new(GET => $url); |
453
|
26
|
|
|
|
|
2780
|
$request->protocol($HTTP_VERSION); |
454
|
26
|
|
|
|
|
255
|
$cookieJar->add_cookie_header($request); |
455
|
26
|
50
|
|
|
|
4130
|
if ($DEBUG) { print "\nRequest: \n", $request->as_string; } |
|
0
|
|
|
|
|
0
|
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Use simple_request so we don't follow redirects automatically |
458
|
26
|
|
|
|
|
76
|
$response = $agent->simple_request($request); |
459
|
26
|
|
|
|
|
2067
|
$cookieJar->extract_cookies($response); |
460
|
26
|
|
|
|
|
2309
|
$statusCode = $response->code; |
461
|
|
|
|
|
|
|
}; |
462
|
26
|
50
|
|
|
|
236
|
if ($@) { |
463
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "LWP or HTTP error: $@\n"; } |
|
0
|
|
|
|
|
0
|
|
464
|
0
|
0
|
|
|
|
0
|
if ($LOGGING) { print STDERR "LWP or HTTP error: $@\n"; } |
|
0
|
|
|
|
|
0
|
|
465
|
0
|
|
|
|
|
0
|
return $UNKNOWN_ERROR; |
466
|
|
|
|
|
|
|
} |
467
|
26
|
50
|
|
|
|
51
|
if ($DEBUG) { print "Status: $statusCode\n"; } |
|
0
|
|
|
|
|
0
|
|
468
|
26
|
50
|
|
|
|
44
|
if ($DEBUG) { print "\nResponse Header: \n", $response->headers->as_string; } |
|
0
|
|
|
|
|
0
|
|
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# Note: In case of timeout, agent sets $statusCode to server error. |
471
|
26
|
100
|
|
|
|
128
|
if ($statusCode =~ /2../) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
472
|
8
|
50
|
|
|
|
17
|
if ($VERBOSE) { print "Good response, checking for meta refresh tag...\n"; } |
|
0
|
|
|
|
|
0
|
|
473
|
8
|
|
|
|
|
30
|
$new_url = check_for_meta_refresh($response->content); |
474
|
8
|
50
|
|
|
|
18
|
if ($new_url ne "") { |
475
|
0
|
0
|
|
|
|
0
|
if (exists($url_hash{$new_url})) { |
476
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "This url already visited ... returning $statusCode.\n"; } |
|
0
|
|
|
|
|
0
|
|
477
|
0
|
|
|
|
|
0
|
return $statusCode; } |
478
|
|
|
|
|
|
|
else { |
479
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Refresh to: $new_url\n"; } |
|
0
|
|
|
|
|
0
|
|
480
|
0
|
|
|
|
|
0
|
return $self->follow_url($new_url, $url); |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
} |
483
|
8
|
|
|
|
|
60
|
else { return $statusCode;} |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
elsif ($statusCode =~ /3../) { |
486
|
16
|
|
|
|
|
19
|
$redirect_count++; |
487
|
16
|
50
|
|
|
|
32
|
if ($VERBOSE) { print "Proper redirect...\n"; } |
|
0
|
|
|
|
|
0
|
|
488
|
|
|
|
|
|
|
# Note that we don't check for page cycles here. Some sites |
489
|
|
|
|
|
|
|
# will redirect to the same page while setting cookies, but |
490
|
|
|
|
|
|
|
# eventually they'll stop. |
491
|
16
|
|
|
|
|
38
|
$new_url = $response->headers->header('Location'); |
492
|
16
|
|
|
|
|
741
|
push @$redirects, $new_url; |
493
|
16
|
50
|
|
|
|
112
|
if ($VERBOSE) { print "Redirect to: $new_url\n"; } |
|
0
|
|
|
|
|
0
|
|
494
|
16
|
|
|
|
|
158
|
return $self->follow_url($new_url, $url); |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
elsif ($statusCode =~ /4../) { |
497
|
2
|
50
|
|
|
|
8
|
if ($VERBOSE) { print "Client error...\n"; } |
|
0
|
|
|
|
|
0
|
|
498
|
2
|
|
|
|
|
15
|
return $statusCode; |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
elsif ($statusCode =~ /5../) { |
501
|
0
|
0
|
|
|
|
0
|
if ($VERBOSE) { print "Server error...\n"; } |
|
0
|
|
|
|
|
0
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# You might be tempted to do a retry right here. It is problematic |
504
|
|
|
|
|
|
|
# b/c you need to do another follow_url, but that will clash with |
505
|
|
|
|
|
|
|
# url_hash -- it will look like a page cycle. But if you do the |
506
|
|
|
|
|
|
|
# retry by hand w/ a simple request, you don't handle all the |
507
|
|
|
|
|
|
|
# cases properly. What we do is retry once using telnet, and leave |
508
|
|
|
|
|
|
|
# other retries to subsequent passes following main loop. |
509
|
|
|
|
|
|
|
|
510
|
0
|
0
|
|
|
|
0
|
if ($protocol =~ /^http$/i) { # Only works for HTTP requests. |
511
|
0
|
|
|
|
|
0
|
$telnetResult = |
512
|
|
|
|
|
|
|
$self->telnet_http_retry($host, $url, $request, $port); |
513
|
0
|
0
|
|
|
|
0
|
if ($telnetResult ne 'FAIL') { |
514
|
0
|
|
|
|
|
0
|
$statusCode = $telnetResult; |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
} |
517
|
0
|
|
|
|
|
0
|
return $statusCode; |
518
|
|
|
|
|
|
|
} # end 5xx case. |
519
|
|
|
|
|
|
|
else { # Everything else case. |
520
|
0
|
|
|
|
|
0
|
return $statusCode; |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
} # end sub follow_url |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
########################################### |
526
|
|
|
|
|
|
|
# get_location_header |
527
|
|
|
|
|
|
|
########################################### |
528
|
|
|
|
|
|
|
# Extracts the url from the Location field of an HTTP redirect. |
529
|
|
|
|
|
|
|
# Call with: ref to array of header lines, w or w/o body at end. |
530
|
|
|
|
|
|
|
# Returns: URL found in Location header, or empty string. |
531
|
|
|
|
|
|
|
sub get_location_header { |
532
|
|
|
|
|
|
|
|
533
|
0
|
0
|
0
|
0
|
0
|
0
|
if ($VERBOSE || $DEBUG) { print "Looking for location header... \n"; } |
|
0
|
|
|
|
|
0
|
|
534
|
0
|
|
|
|
|
0
|
my ($headersRef) = @_; |
535
|
0
|
|
|
|
|
0
|
my $line; |
536
|
|
|
|
|
|
|
|
537
|
0
|
|
|
|
|
0
|
while ($line = shift @$headersRef) { |
538
|
0
|
0
|
|
|
|
0
|
if ($DEBUG) { print "Checking line: $line\n"; } |
|
0
|
|
|
|
|
0
|
|
539
|
0
|
0
|
|
|
|
0
|
last if $line =~ /^\s$/; |
540
|
0
|
0
|
|
|
|
0
|
if ($line =~ m{^Location: \s* (\S+)}x) { |
541
|
0
|
0
|
|
|
|
0
|
if ($DEBUG) { print "Line found: $line\n"; } |
|
0
|
|
|
|
|
0
|
|
542
|
0
|
|
|
|
|
0
|
return $1; |
543
|
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
|
} |
545
|
0
|
|
|
|
|
0
|
return ""; |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
} # end sub get_location_header |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
########################################### |
550
|
|
|
|
|
|
|
# make_url_absolute |
551
|
|
|
|
|
|
|
########################################### |
552
|
|
|
|
|
|
|
# Make a relative url absolute by appending it to path of old url. |
553
|
|
|
|
|
|
|
# Call with: a fully qualified url as second arg, which will provide |
554
|
|
|
|
|
|
|
# path info for relative url which is first arg. |
555
|
|
|
|
|
|
|
# Returns: new absolute url |
556
|
|
|
|
|
|
|
sub make_url_absolute { |
557
|
|
|
|
|
|
|
|
558
|
16
|
50
|
|
16
|
0
|
33
|
if ($DEBUG) { print "make_url_absolute()...\n"; } |
|
0
|
|
|
|
|
0
|
|
559
|
16
|
|
|
|
|
22
|
my ($new_url, $old_url) = @_; |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# Test to see if it's already absolute (starts w/ a syntactically correct scheme) |
562
|
16
|
50
|
|
|
|
82
|
if ($new_url =~ m{^[a-z]+://}i) { |
563
|
16
|
|
|
|
|
34
|
return $new_url; |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
|
566
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Adding path to relative url: $new_url\n"; } |
|
0
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# Case 1: new url is relative to root; it starts with slash, and |
568
|
|
|
|
|
|
|
# should be appended to raw domain name. |
569
|
0
|
0
|
|
|
|
|
if ($new_url =~ m{^/} ) { |
|
|
0
|
|
|
|
|
|
570
|
0
|
|
|
|
|
|
$old_url =~ m{ (\w+ :// [^/]+) }x; |
571
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 1: append to $1\n"; } |
|
0
|
|
|
|
|
|
|
572
|
0
|
|
|
|
|
|
return $1 . $new_url; |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
# For cases 2 & 3, assume new url is relative to current directory; |
575
|
|
|
|
|
|
|
# Case 2: old url contains a trailing slash, eg. http://www.fib.com/bigfib/; |
576
|
|
|
|
|
|
|
# may or may not contain trailing filename |
577
|
|
|
|
|
|
|
elsif ($old_url =~ m{ (\w+://\S+/) }x ) { |
578
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 2: append to $1\n"; } |
|
0
|
|
|
|
|
|
|
579
|
0
|
|
|
|
|
|
return $1 . $new_url; |
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
# Case 3: old url has no trailing slash, eg. http://www.fab.net |
582
|
|
|
|
|
|
|
else { |
583
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { print "Case 3: append to $old_url/\n"; } |
|
0
|
|
|
|
|
|
|
584
|
0
|
|
|
|
|
|
return "$old_url/$new_url"; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
} # End make_url_absolute |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
########################################### |
590
|
|
|
|
|
|
|
# telnet_http_retry |
591
|
|
|
|
|
|
|
########################################### |
592
|
|
|
|
|
|
|
# Open a telnet connection to a host and try an HTTP GET for an |
593
|
|
|
|
|
|
|
# url. The response is processed according to status code similarly to |
594
|
|
|
|
|
|
|
# follow_url, and calls follow_url to handle redirects. Uses an LWP |
595
|
|
|
|
|
|
|
# request object b/c that's a convenient way to stick cookies into the |
596
|
|
|
|
|
|
|
# request string. |
597
|
|
|
|
|
|
|
# Note: Handles the Solaris/LWP bug (cf notes above) by reading the |
598
|
|
|
|
|
|
|
# telnet.pm input_log if telnet times out. |
599
|
|
|
|
|
|
|
# Call with: hostname, absolute url, LWP request object, and optional |
600
|
|
|
|
|
|
|
# port (default is $HTTP_DEFAULT_PORT). |
601
|
|
|
|
|
|
|
# Returns: status code, or 'FAIL' if can't make telnet connection. |
602
|
|
|
|
|
|
|
sub telnet_http_retry { |
603
|
0
|
|
|
0
|
0
|
|
my $self=shift; |
604
|
0
|
0
|
0
|
|
|
|
if ($VERBOSE || $DEBUG) { |
605
|
0
|
|
|
|
|
|
print "Telnet HTTP retry...\n"; |
606
|
|
|
|
|
|
|
} |
607
|
0
|
|
|
|
|
|
my ($host, $url, $request, $port) = @_; |
608
|
0
|
|
|
|
|
|
my ($telnetAgent, @lines, @buffer, $statusLine, $line, $logfileHandle, |
609
|
|
|
|
|
|
|
$httpVersion, $statusCode, $message, $contentStr, $new_url); |
610
|
0
|
0
|
|
|
|
|
open(LOGFILE, "+>$TELNET_LOGFILE") || warn "Can't open $TELNET_LOGFILE.\n"; |
611
|
0
|
0
|
0
|
|
|
|
if (!$port || $port !~ /^\d+$/) { |
612
|
0
|
|
|
|
|
|
$port = $HTTP_DEFAULT_PORT; |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
# Create agent and open connection. |
615
|
0
|
|
|
|
|
|
$telnetAgent = Net::Telnet->new(Host => $host, |
616
|
|
|
|
|
|
|
Port => $port, |
617
|
|
|
|
|
|
|
Input_log => $TELNET_LOGFILE, |
618
|
|
|
|
|
|
|
Timeout => $AGENT_TIMEOUT, |
619
|
|
|
|
|
|
|
Errmode => "return"); |
620
|
0
|
0
|
|
|
|
|
return 'FAIL' unless $telnetAgent; # Can't open telnet connection. |
621
|
0
|
|
|
|
|
|
$telnetAgent->max_buffer_length($AGENT_MAX_RESPONSE); |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# Send the request. |
624
|
0
|
|
|
|
|
|
$telnetAgent->print($request->as_string, "\n"); |
625
|
|
|
|
|
|
|
# Get the response as array of lines. |
626
|
0
|
|
|
|
|
|
while (@buffer = $telnetAgent->getlines) { |
627
|
0
|
|
|
|
|
|
push (@lines, @buffer); |
628
|
|
|
|
|
|
|
} |
629
|
0
|
0
|
|
|
|
|
if ($telnetAgent->timed_out) { |
630
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
631
|
0
|
|
|
|
|
|
print "Telnet http timed out. Using input log...\n"; |
632
|
|
|
|
|
|
|
} |
633
|
0
|
|
|
|
|
|
undef @lines; |
634
|
0
|
|
|
|
|
|
while () { |
635
|
0
|
|
|
|
|
|
push (@lines, $_); |
636
|
|
|
|
|
|
|
} |
637
|
0
|
0
|
|
|
|
|
close LOGFILE or warn "Problem closing $TELNET_LOGFILE.\n"; |
638
|
|
|
|
|
|
|
} |
639
|
0
|
0
|
|
|
|
|
if (!@lines) { |
640
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
641
|
0
|
|
|
|
|
|
print "No data received.\n"; |
642
|
|
|
|
|
|
|
} |
643
|
0
|
|
|
|
|
|
return 'FAIL'; |
644
|
|
|
|
|
|
|
} |
645
|
0
|
0
|
|
|
|
|
if ($DEBUG) { |
646
|
0
|
|
|
|
|
|
print @lines,"\n"; |
647
|
|
|
|
|
|
|
} |
648
|
0
|
|
|
|
|
|
$statusLine = shift @lines; |
649
|
|
|
|
|
|
|
# We can only process status line and headers if the response is HTTP/1.0 or |
650
|
|
|
|
|
|
|
# better. This regexp copied from LWP::Protocol::http.pm. |
651
|
0
|
0
|
|
|
|
|
if ($statusLine =~ /^(HTTP\/\d+\.\d+)[ \t]+(\d+)[ \t]*([^\012]*)\012/) { |
652
|
|
|
|
|
|
|
# HTTP/1.0 response or better |
653
|
0
|
|
|
|
|
|
($httpVersion, $statusCode, $message) = ($1, $2, $3); |
654
|
0
|
|
|
|
|
|
chomp $message; |
655
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
656
|
0
|
|
|
|
|
|
print "Status line: $httpVersion $statusCode $message \n\n"; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
0
|
0
|
|
|
|
|
if ($statusCode =~ /2../) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
660
|
0
|
|
|
|
|
|
while ($line = shift @lines) { # Flatten array of lines. |
661
|
0
|
|
|
|
|
|
$contentStr .= $line; |
662
|
|
|
|
|
|
|
} |
663
|
0
|
|
|
|
|
|
$new_url = check_for_meta_refresh($contentStr); |
664
|
0
|
0
|
|
|
|
|
if ($new_url ne "") { |
665
|
0
|
0
|
|
|
|
|
if (exists($url_hash{$new_url})) { |
666
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
667
|
0
|
|
|
|
|
|
print "This url already visited ... returning $statusCode.\n"; |
668
|
|
|
|
|
|
|
} |
669
|
0
|
|
|
|
|
|
return $statusCode; |
670
|
|
|
|
|
|
|
} else { |
671
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
672
|
0
|
|
|
|
|
|
print "Refresh to: $new_url\n"; |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
# Return whatever status code we get from new url |
675
|
0
|
|
|
|
|
|
return $self->follow_url($new_url, $url); |
676
|
|
|
|
|
|
|
} |
677
|
|
|
|
|
|
|
} else { |
678
|
0
|
|
|
|
|
|
return $statusCode; |
679
|
|
|
|
|
|
|
} |
680
|
|
|
|
|
|
|
} elsif ($statusCode =~ /3../) { |
681
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
682
|
0
|
|
|
|
|
|
print "Proper redirect...\n"; |
683
|
|
|
|
|
|
|
} |
684
|
0
|
|
|
|
|
|
$new_url = get_location_header(\@lines); |
685
|
0
|
0
|
|
|
|
|
if ($new_url ne "") { |
686
|
0
|
0
|
|
|
|
|
if (exists($url_hash{$new_url})) { |
687
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
688
|
0
|
|
|
|
|
|
print "This url already visited ... returning $statusCode.\n"; |
689
|
|
|
|
|
|
|
} |
690
|
0
|
|
|
|
|
|
return $statusCode; |
691
|
|
|
|
|
|
|
} else { |
692
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
693
|
0
|
|
|
|
|
|
print "Redirect to: $new_url\n"; |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
# Return whatever status code we get from new url |
696
|
0
|
|
|
|
|
|
return $self->follow_url($new_url, $url); |
697
|
|
|
|
|
|
|
} |
698
|
|
|
|
|
|
|
} else { |
699
|
0
|
|
|
|
|
|
return $statusCode; |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
} elsif ($statusCode =~ m{4.. | 5..}x) { |
702
|
0
|
|
|
|
|
|
return $statusCode; |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
} # if valid status line |
705
|
|
|
|
|
|
|
else { |
706
|
0
|
|
|
|
|
|
unshift(@lines, $statusLine); |
707
|
|
|
|
|
|
|
} |
708
|
|
|
|
|
|
|
# If no status line, could be HTTP/0.9 server, which just sends |
709
|
|
|
|
|
|
|
# back content. If it contains a tag like , assume it's |
710
|
|
|
|
|
|
|
# okay. |
711
|
0
|
0
|
|
|
|
|
if ($VERBOSE) { |
712
|
0
|
|
|
|
|
|
print "Assuming HTTP/0.9 or less... \n"; |
713
|
|
|
|
|
|
|
} |
714
|
0
|
|
|
|
|
|
while ($line = shift @lines) { # Flatten array of lines. |
715
|
0
|
|
|
|
|
|
$contentStr .= $line; |
716
|
|
|
|
|
|
|
} |
717
|
0
|
0
|
|
|
|
|
if ($contentStr =~ /
|
718
|
0
|
|
|
|
|
|
return $HTTP_0_9_OKAY; |
719
|
|
|
|
|
|
|
} else { |
720
|
0
|
|
|
|
|
|
return $HTTP_0_9_FAIL; |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
} # end sub telnet_http_retry |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
########################################### |
726
|
|
|
|
|
|
|
# END (Unused snippets and test results, below) |
727
|
|
|
|
|
|
|
########################################### |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# NOTES: |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# 1. It would be nice to have a robust facility for absolutizing |
732
|
|
|
|
|
|
|
# URLs. I tried using URI.pm for this purpose and found it to be not |
733
|
|
|
|
|
|
|
# robust. EG., it allows the construction of: http:/www.yahoo.com, |
734
|
|
|
|
|
|
|
# which is not well-formed. |
735
|
|
|
|
|
|
|
# 2. Tolerance of meta refresh tag match? |
736
|
|
|
|
|
|
|
# 3. some duplicate code went from follow_url to the |
737
|
|
|
|
|
|
|
# telnet_http_retry; could be factored. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
1; #Spoilt children / happy / required even |