File Coverage

blib/lib/WWW/Link/Tester/Complex.pm
Criterion Covered Total %
statement 116 251 46.2
branch 36 158 22.7
condition 4 18 22.2
subroutine 18 20 90.0
pod 0 7 0.0
total 174 454 38.3


line stmt bran cond sub pod time code
1             package WWW::Link::Tester::Complex;
2             $REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
3              
4 3     3   18 use Carp qw(carp cluck croak);
  3         7  
  3         378  
5              
6             =head1 NAME
7              
8             WWW::Link::Tester::Complex - a careful tester for broken links
9              
10             =head1 SYNOPSIS
11              
12             use WWW::Link::Test::Complex
13             $ua=create_a_user_agent();
14             $link=get_a_link_object();
15             WWW::Link::Test::Complex::test_link($ua, $link);
16             WWW::Link::Tester::Simple::Test($url)
17              
18              
19             =head1 DESCRIPTION
20              
21             This is a link testing module based on the work of Phil Mitchell at
22             Harvard College. The aim is to test very carefully if a link is
23             really there.
24              
25             N.B. I have done the minimum reasonable edits on the file so that any
26             later improvements can be easily added. This means that the module
27             contains and sections of code which are not relevant to
28             LinkController.
29              
30             =head1 ROBOT LOGIC
31              
32             This system should be controlled by the robot logic of the user agent it
33             uses provided that the robot returns a 4xx response code.
34              
35             =head1 AUTHOR
36              
37             Copyright (c) 2000 by the President and Fellows of Harvard College
38              
39             This program is free software; you can redistribute it and/or modify
40             it under the terms of the GNU General Public License as published by
41             the Free Software Foundation; either version 2 of the License, or (at
42             your option) any later version.
43              
44             Please see the source code for further details
45              
46             =cut
47              
48             ############################################################################
49             #
50             # Copyright (c) 2000 by the President and Fellows of Harvard College
51             #
52             # This program is free software; you can redistribute it and/or modify
53             # it under the terms of the GNU General Public License as published by
54             # the Free Software Foundation; either version 2 of the License, or (at
55             # your option) any later version.
56             #
57             # This program is distributed in the hope that it will be useful, but
58             # WITHOUT ANY WARRANTY; without even the implied warranty of
59             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
60             # General Public License for more details.
61             #
62             # You should have received a copy of the GNU General Public License
63             # along with this program; if not, write to the Free Software
64             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
65             # USA.
66             #
67             # Contact information:
68             #
69             # Phil Mitchell
70             # Office for Information Systems
71             # Harvard University
72             # philip_mitchell at harvard.edu
73             #
74             #############################################################################
75             #
76             # When called without args, this script reads a list of URLs, one per line,
77             # from $INPUT_FILE, extracts the url from each record, and tries to access
78             # the url using the appropriate protocol. This includes following redirects
79             # until either:
80             # 1. the target page is successfully received; or
81             # 2. a page cycle is detected; or
82             # 3. a bad server or page request is detected; or
83             # 4. a maximum number of redirects ($MAX_REDIRECTS) is exceeded.
84             #
85             #...deleted...
86             #
87             # Protocols supported: http, https, ftp, gopher, file, telnet.
88             #
89             # Status codes:
90             # Success: All successful response codes have the form:
91             # 2xx. Because we limit the size of responses we accept, we get a
92             # lot of 206's in addition to 200's.
93             # UNSUPPORTED_PROTOCOL:
94             # Linkcheck handles {http, https, ftp, gopher, file, telnet}. Other
95             # protocols will get this error. More commonly, it is the result of a
96             # typo (eg. "thttp://").
97             # MALFORMED_URL: The url is syntactically incorrect. EG.,
98             # "http:/www.domain.com".
99             # TELNET_FAILURE: Couldn't open the requested telnet connection.
100             # HTTP_0_9_FAIL: Failed HTTP/0.9 connection (0.9 does not return
101             # status codes).
102             # REDIRECT_LIMIT_EXCEEDED:
103             # Too many redirections. This error code should not normally be
104             # received, it is in place to catch infinite redirect cycles.
105             # UNKNOWN_ERROR: Rarely, LWP or HTTP modules will die, reporting an
106             # error that is not useful to us. This error code should
107             # not normally be received; it
108             # will generally be corrected in subsequent passes.
109             #
110             # There are various configurable parameters documented below. In
111             # addition to setting the input and output filenames, the most
112             # important ones are those that control the timeout, the number of
113             # retries, and the time between retries. These settings have an
114             # important effect on the accuracy of results.
115             #
116             # Accuracy of results:
117             #
118             # Informal tests (results can be found at the end of this script) have
119             # shown that: (1) a timeout of 30 sec is adequate; increasing to 60
120             # sec is not useful; 10 seconds is too short. (2) The absolute number
121             # of recheck passes is less important than spreading them over
122             # time. Reasonable results are obtained with 3 recheck passes, each
123             # separated by 8 hours of sleep.
124             #
125             # In our set of about 10,000 urls, a first pass produces about 800
126             # (8%) bad urls. Subsequent passes will reduce that to about 650
127             # (6.5%). The use of telnet retry will reach another 25% of those
128             # apparently bad urls. The estimate of total bad urls in our sample is
129             # thus 4.5%. That list of bad urls is consistent across distinct runs
130             # of the link checker at greater than 99%. Handchecking of a large
131             # sample from this final list indicates a high degree of accuracy.
132             #
133             # Notes:
134             #
135             # - A "page cycle" is the use of a redirect or refresh tag to cycle through
136             # a list of one or more pages for data refresh purposes.
137             #
138             # Design Notes:
139             #
140             # - Cookies: This version accepts all cookies. This allows it to handle some
141             # URLs which require cookies.
142             #
143             # - Timeout bug: Due to an apparent bug in the interaction between
144             # Solaris and certain web servers, some http responses come back
145             # improperly terminated. As a result, LWP times out and reports a
146             # server error when a (nearly) valid response has been received. To
147             # avoid this, we open a telnet connection to the relevant port
148             # (usually 80) and do a manual GET on the url. Telnet will also time
149             # out in this case, but telnet.pm provides a dump of the partial
150             # response received, and we use this.
151             #
152             # - WWW unreliability: Any given access to a server on the web is
153             # subject to various kinds of flakiness. To avoid false reports of
154             # bad servers, it is essential to re-test all errors, preferably over
155             # a period of hours or days. This script completes a first pass
156             # through all urls, typically taking 8 hours or more on 10,000
157             # urls. Then it performs additional ($RECHECKS) passes on all urls
158             # that received error codes. It sleeps ($HOURS_TO_SLEEP) between
159             # passes to improve the chances of getting a valid return code.
160             #
161             # - Redirects and cycles: The challenge is to follow redirects all
162             # the way to the end of the line, but know when to stop. It is
163             # complicated by the fact that some sites use the meta refresh tag
164             # for their redirection, and by the fact that some sites have
165             # infinite loop cycles for page refresh purposes. Five distinct cases
166             # have been identified:
167             #
168             # 1. Proper redirect, using Location header. (Action: Follow redirect.)
169             # 2. Proper meta refresh, on a single page. (Action: Detect cycle
170             # and exit.)
171             # 3. Proper meta refresh, on a cycle of pages. (Action:Detect
172             # cycle and exit.)
173             # 4. Redirect using meta refresh. (Action: Follow redirect.)
174             # 5. Redirect loop on a single page for setting cookies. (Action:
175             # Follow redirect.)
176             #
177             # Maintenance and Future Development Notes:
178             #
179             # - 401's and 403's: Currently does not handle authentication; just
180             # reports these as errors.
181             #
182             # - Cookie warnings: With perl's -w option, many warnings will be
183             # received about Cookies.pm. This seems to be due to the fact that
184             # Cookies.pm does not cleanly handle incorrectly formatted
185             # cookies. As far as I know, these warnings may be safely ignored.
186              
187             # Author: Phil Mitchell
188             # Date: 02/22/01
189             # Version: 1.5
190             #
191             #############################################################################
192              
193 3     3   16 use WWW::Link::Tester;
  3         6  
  3         230  
194             @ISA="WWW::Link::Tester";
195              
196 3     3   15 use strict;
  3         6  
  3         96  
197 3     3   3629 use LWP::UserAgent;
  3         59750  
  3         113  
198 3     3   33 use HTTP::Response;
  3         8  
  3         76  
199 3     3   16 use HTTP::Message;
  3         9  
  3         78  
200 3     3   16 use HTTP::Status;
  3         6  
  3         1317  
201 3     3   30 use HTTP::Headers;
  3         6  
  3         141  
202 3     3   18 use HTTP::Request;
  3         6  
  3         76  
203 3     3   4036 use HTTP::Cookies;
  3         27558  
  3         104  
204 3     3   5056 use Net::Telnet;
  3         176731  
  3         304  
205             #use LWP::Debug qw(+);
206              
207             ###########################################
208             # Global variables
209             ###########################################
210              
211 3         1608 use vars qw(
212             %url_hash
213             $HTTP_DEFAULT_PORT
214             $HTTP_VERSION
215             $ADMIN_EMAIL
216             $MAX_REDIRECTS
217             $RECHECKS
218             $HOURS_TO_SLEEP
219             $AGENT_TIMEOUT
220             $AGENT_MAX_RESPONSE
221             $INPUT_FILE
222             $OUTPUT_FILE
223             $TMP_FILE
224             $TELNET_LOGFILE
225             $ADMIN_LOGFILE
226             $REDIRECT_LIMIT_EXCEEDED
227             $UNSUPPORTED_PROTOCOL
228             $MALFORMED_URL
229             $HTTP_0_9_OKAY
230             $HTTP_0_9_FAIL
231             $UNKNOWN_ERROR
232             $VERBOSE
233             $DEBUG
234             $LOGGING
235             $TELNET_SUCCESS
236             $TELNET_FAILURE
237             $agent
238             $telnetAgent
239             $cookieJar
240             $redirectCount
241 3     3   28 );
  3         9  
242              
243              
244              
245             ###########################################
246             # Configurable parameters
247             ###########################################
248              
249             $ADMIN_EMAIL = ''; # If non-empty, script will send confirmation and result stats.
250             $AGENT_TIMEOUT = 10; # In seconds, time for http agent to wait. 10 secs is often too
251             # short, leads to spurious reports of server errors. Longer than
252             # 30 secs not usually helpful.
253             $AGENT_MAX_RESPONSE = 524288; # In bytes, max response to accept. Mainly want to
254             # avoid being swamped by something huge.
255             $MAX_REDIRECTS = 15; # Number of redirects to tolerate before giving up. Should never hit
256             # this limit; it's here to avoid infinite loop.
257             $RECHECKS = 3; # Number of recheck passes to recheck urls that return error codes. Note
258             # that every server error automatically gets one retry via telnet.
259             $HOURS_TO_SLEEP = 0; # Number of hours to sleep between recheck passes.
260             $HTTP_DEFAULT_PORT = 80;
261             $HTTP_VERSION = 'HTTP/1.0'; # Perl's HTTP module defaults to 0.9
262             $INPUT_FILE = "CURRENT.URLS.TXT";
263             $INPUT_FILE = "smalltest.txt";
264             $OUTPUT_FILE = "OUT.URLS.TXT";
265             $ADMIN_LOGFILE = "admin_logfile.txt"; # Log for result stats.
266             $VERBOSE = 1; # If 1, print processing status to stdout
267             $DEBUG = 0; # If 1, provides additional output to stdout; mainly HTTP headers.
268             $LOGGING = 1; # Enable logging to $ADMIN_LOGFILE.
269              
270             ###########################################
271             # Misc. initializations
272             ###########################################
273              
274             $TMP_FILE = "tmp.txt";
275             $TELNET_LOGFILE = "telnet_logfile.txt"; # Used internally to buffer data.
276              
277             # Response codes. All successful response codes have the form: 2xx.
278             $REDIRECT_LIMIT_EXCEEDED = 'REDIRECT_LIMIT_EXCEEDED';
279             $UNSUPPORTED_PROTOCOL = 'UNSUPPORTED_PROTOCOL';
280             $MALFORMED_URL = 'MALFORMED_URL';
281             $TELNET_FAILURE = 'TELNET_FAILURE';
282             $HTTP_0_9_FAIL = 'HTTP_0_9_FAIL';
283             $UNKNOWN_ERROR = 'UNKNOWN_ERROR';
284             $TELNET_SUCCESS = 299; # Mimic a successful HTTP code
285             $HTTP_0_9_OKAY = 298;
286              
287             =head1 test_link
288              
289             This function acts as glue between follow_url and LinkController. It
290             returns a constructed HTTP::Response. This will mean that information
291             is lost since we actually often have created the code from another
292             response.
293              
294             =cut
295              
296             sub new {
297 4     4 0 84 my $proto = shift;
298 4   33     28 my $class = ref($proto) || $proto;
299 4         9 my $self = {};
300 4         14 $self->{"user_agent"}=shift;
301 4         21 bless $self, $class;
302             }
303              
304 3     3   19 use vars qw($redirect_count $redirects %convert);
  3         5  
  3         9772  
305              
306             %convert=(
307             $REDIRECT_LIMIT_EXCEEDED => RC_REDIRECT_LIMIT_EXCEEDED,
308             $UNSUPPORTED_PROTOCOL => RC_PROTOCOL_UNSUPPORTED,
309             $MALFORMED_URL => RC_PROTOCOL_UNSUPPORTED,
310             $TELNET_FAILURE => RC_NOT_FOUND,
311             $HTTP_0_9_FAIL => RC_INTERNAL_SERVER_ERROR,
312             $UNKNOWN_ERROR => RC_BAD_REQUEST,
313             );
314              
315              
316             sub get_response {
317 14     14 0 20 my $self=shift;
318 14         18 my $link=shift;
319 14         22 $redirects=[];
320 14         21 $redirect_count=0;
321 14         40 %url_hash=();
322 14         38 my $code=$self->follow_url($link->url());
323 14         25 scalar (keys %convert);
324 14         51 CONVERT: while (my ($key,$value) = each %convert) {
325 81 100       345 $code eq $key && do {
326 4         6 $code=$value;
327 4         9 last CONVERT;
328             };
329             }
330 14 50       217 print STDERR "COMPLEX generated response code $code\n"
331             if $self->{verbose};
332             #cluck and die here generate coredumps!!!???! in perl 5.6.0 on Linux
333             # cluck STDERR "COMPLEX generated response code $code";
334 14 50       57 die "non numeric response code generated" . $code
335             unless $code =~ m/[1-9][0-9]+/;
336 14         52 my $response=HTTP::Response->new($code);
337              
338 14 50       573 die "response: $response not reference" unless ref $response ;
339              
340 14         51 return $response, @$redirects
341             }
342              
343             # Set up the web agents and helpers.
344             # $agent = new LWP::UserAgent;
345             # $agent->timeout($AGENT_TIMEOUT);
346             # $agent->max_size($AGENT_MAX_RESPONSE);
347             $cookieJar = new HTTP::Cookies;
348             $telnetAgent = new Net::Telnet(Timeout => $AGENT_TIMEOUT,
349             Errmode => 'return');
350              
351             my ($url, $result, $newResult, %results, $outputStr, $urlCount,
352             $count, $recheckCount, %resultSummary);
353              
354             ###########################################
355             # check_for_meta_refresh
356             ###########################################
357             # Routine that searches input string for something of the form:
358             #
359             # It is tolerant of extra whitespace, single or no quotes instead of
360             # doublequotes, spaces around equals signs, and extra verbiage, and is
361             # case-insensitive.
362             # Call with: String of content to be searched
363             # Returns: url, if a meta refresh is found; otherwise returns
364             # empty string.
365              
366             sub check_for_meta_refresh {
367 8 50   8 0 85 if ($DEBUG) { print "check_for_meta_refresh()...\n"; }
  0         0  
368 8         8 my $inputStr = shift;
369 8 50       17 if ($inputStr =~
370             m{ #"
371             ]+? url
372             \s* = \s* ["']? ([^"' >]+) ["']? [^>]+? >
373             }ix)
374             {
375 0         0 return $1;
376             }
377             else {
378 8         16 return "";
379             }
380             }#end check_for_meta_refresh
381              
382             ###########################################
383             # follow_url
384             ###########################################
385              
386             # Tries to access a given url. The main case is HTTP protocol, but
387             # also handles any protocol handled by LWP, plus telnet. For telnet,
388             # just tries to open a connection. For HTTP, follows redirects until
389             # a final status code is received or until $MAX_REDIRECTS is
390             # exceeded. Accepts all cookies. To avoid infinite loops, detects page
391             # refresh cycles.
392              
393             # Call with: url, and optional second arg of referring url which is
394             # used to absolutize url.
395              
396             # Returns: HTTP status code, or internal response codes (see above).
397              
398             sub follow_url {
399 30     30 0 39 my $self=shift;
400 30         65 my $agent=$self->{"user_agent"};
401 30         46 my ($url, $referrer) = @_;
402 30         38 my $VERBOSE=$self->{"verbose"};
403              
404 30 50       90 return $MALFORMED_URL unless $url;
405 30         36 my ($response, $protocol, $host, $port, $ping, $telnetResult,
406             $request, $statusCode, $new_url);
407 30 50 33     105 if ($VERBOSE || $DEBUG) { print "follow_url(): $url\n"; }
  0         0  
408 30         93 $url_hash{$url} = 1; # Track all urls in each run, to detect cycles.
409              
410             # Note: It is crucial to hash this url BEFORE absolutizing it, b/c
411             # we will test for cycles before absolutizing.
412              
413 30 100       53 if ($referrer) { $url = make_url_absolute($url, $referrer); }
  16         34  
414 30 100       74 if (keys(%url_hash) > $MAX_REDIRECTS) {
415 1 50       7 if ($VERBOSE) { print "Redirect limit exceeded.\n"; }
  0         0  
416 1         118 return $REDIRECT_LIMIT_EXCEEDED;
417             }
418              
419             # EXTRACT PROTOCOL, HOST, AND (OPTIONAL) PORT.
420 29         184 $url =~ m{ ^\s* ([a-z]+) :// ([^/:]+) }ix;
421 29 100 66     142 if (!($1 && $2)) {
422 3 50       8 if ($VERBOSE) { print "URL not well-formed.\n"; }
  0         0  
423 3         9 return $MALFORMED_URL;
424             }
425             else {
426 26         46 $protocol = $1;
427 26         45 $host = $2;
428             }
429 26         125 $url =~ m{ \w+ :// [^/]+ : (\d+) }x; # Extract port
430 26 50       58 if ($1) { $port = $1; }
  26         51  
431              
432             # HANDLE TELNET REQUESTS -- just see if we can open the connection.
433 26 50       50 if ($protocol =~ /^telnet$/i) {
434 0 0       0 if ($port) {
435 0         0 $ping = $telnetAgent->open(Host => $host,
436             Port => $port);
437             }
438             else {
439 0         0 $ping = $telnetAgent->open(Host => $host);
440             }
441 0 0       0 if (!$ping) { return $TELNET_FAILURE; }
  0         0  
442 0         0 else { return $TELNET_SUCCESS; }
443             }
444              
445             # HANDLE ALL OTHER REQUESTS (HTTP, HTTPS, FTP, GOPHER, FILE)
446 26 50       73 if (!$agent->is_protocol_supported($protocol)) {
447 0 0       0 if ($VERBOSE) { print "Protocol not supported.\n"; }
  0         0  
448 0         0 return $UNSUPPORTED_PROTOCOL;
449             }
450             # Use eval to avoid aborting if LWP or HTTP sends "die".
451 26         237 eval {
452 26         88 $request = HTTP::Request->new(GET => $url);
453 26         2780 $request->protocol($HTTP_VERSION);
454 26         255 $cookieJar->add_cookie_header($request);
455 26 50       4130 if ($DEBUG) { print "\nRequest: \n", $request->as_string; }
  0         0  
456              
457             # Use simple_request so we don't follow redirects automatically
458 26         76 $response = $agent->simple_request($request);
459 26         2067 $cookieJar->extract_cookies($response);
460 26         2309 $statusCode = $response->code;
461             };
462 26 50       236 if ($@) {
463 0 0       0 if ($VERBOSE) { print "LWP or HTTP error: $@\n"; }
  0         0  
464 0 0       0 if ($LOGGING) { print STDERR "LWP or HTTP error: $@\n"; }
  0         0  
465 0         0 return $UNKNOWN_ERROR;
466             }
467 26 50       51 if ($DEBUG) { print "Status: $statusCode\n"; }
  0         0  
468 26 50       44 if ($DEBUG) { print "\nResponse Header: \n", $response->headers->as_string; }
  0         0  
469              
470             # Note: In case of timeout, agent sets $statusCode to server error.
471 26 100       128 if ($statusCode =~ /2../) {
    100          
    50          
    0          
472 8 50       17 if ($VERBOSE) { print "Good response, checking for meta refresh tag...\n"; }
  0         0  
473 8         30 $new_url = check_for_meta_refresh($response->content);
474 8 50       18 if ($new_url ne "") {
475 0 0       0 if (exists($url_hash{$new_url})) {
476 0 0       0 if ($VERBOSE) { print "This url already visited ... returning $statusCode.\n"; }
  0         0  
477 0         0 return $statusCode; }
478             else {
479 0 0       0 if ($VERBOSE) { print "Refresh to: $new_url\n"; }
  0         0  
480 0         0 return $self->follow_url($new_url, $url);
481             }
482             }
483 8         60 else { return $statusCode;}
484             }
485             elsif ($statusCode =~ /3../) {
486 16         19 $redirect_count++;
487 16 50       32 if ($VERBOSE) { print "Proper redirect...\n"; }
  0         0  
488             # Note that we don't check for page cycles here. Some sites
489             # will redirect to the same page while setting cookies, but
490             # eventually they'll stop.
491 16         38 $new_url = $response->headers->header('Location');
492 16         741 push @$redirects, $new_url;
493 16 50       112 if ($VERBOSE) { print "Redirect to: $new_url\n"; }
  0         0  
494 16         158 return $self->follow_url($new_url, $url);
495             }
496             elsif ($statusCode =~ /4../) {
497 2 50       8 if ($VERBOSE) { print "Client error...\n"; }
  0         0  
498 2         15 return $statusCode;
499             }
500             elsif ($statusCode =~ /5../) {
501 0 0       0 if ($VERBOSE) { print "Server error...\n"; }
  0         0  
502              
503             # You might be tempted to do a retry right here. It is problematic
504             # b/c you need to do another follow_url, but that will clash with
505             # url_hash -- it will look like a page cycle. But if you do the
506             # retry by hand w/ a simple request, you don't handle all the
507             # cases properly. What we do is retry once using telnet, and leave
508             # other retries to subsequent passes following main loop.
509              
510 0 0       0 if ($protocol =~ /^http$/i) { # Only works for HTTP requests.
511 0         0 $telnetResult =
512             $self->telnet_http_retry($host, $url, $request, $port);
513 0 0       0 if ($telnetResult ne 'FAIL') {
514 0         0 $statusCode = $telnetResult;
515             }
516             }
517 0         0 return $statusCode;
518             } # end 5xx case.
519             else { # Everything else case.
520 0         0 return $statusCode;
521             }
522              
523             } # end sub follow_url
524              
525             ###########################################
526             # get_location_header
527             ###########################################
528             # Extracts the url from the Location field of an HTTP redirect.
529             # Call with: ref to array of header lines, w or w/o body at end.
530             # Returns: URL found in Location header, or empty string.
531             sub get_location_header {
532              
533 0 0 0 0 0 0 if ($VERBOSE || $DEBUG) { print "Looking for location header... \n"; }
  0         0  
534 0         0 my ($headersRef) = @_;
535 0         0 my $line;
536              
537 0         0 while ($line = shift @$headersRef) {
538 0 0       0 if ($DEBUG) { print "Checking line: $line\n"; }
  0         0  
539 0 0       0 last if $line =~ /^\s$/;
540 0 0       0 if ($line =~ m{^Location: \s* (\S+)}x) {
541 0 0       0 if ($DEBUG) { print "Line found: $line\n"; }
  0         0  
542 0         0 return $1;
543             }
544             }
545 0         0 return "";
546              
547             } # end sub get_location_header
548              
549             ###########################################
550             # make_url_absolute
551             ###########################################
552             # Make a relative url absolute by appending it to path of old url.
553             # Call with: a fully qualified url as second arg, which will provide
554             # path info for relative url which is first arg.
555             # Returns: new absolute url
556             sub make_url_absolute {
557              
558 16 50   16 0 33 if ($DEBUG) { print "make_url_absolute()...\n"; }
  0         0  
559 16         22 my ($new_url, $old_url) = @_;
560              
561             # Test to see if it's already absolute (starts w/ a syntactically correct scheme)
562 16 50       82 if ($new_url =~ m{^[a-z]+://}i) {
563 16         34 return $new_url;
564             }
565            
566 0 0         if ($VERBOSE) { print "Adding path to relative url: $new_url\n"; }
  0            
567             # Case 1: new url is relative to root; it starts with slash, and
568             # should be appended to raw domain name.
569 0 0         if ($new_url =~ m{^/} ) {
    0          
570 0           $old_url =~ m{ (\w+ :// [^/]+) }x;
571 0 0         if ($VERBOSE) { print "Case 1: append to $1\n"; }
  0            
572 0           return $1 . $new_url;
573             }
574             # For cases 2 & 3, assume new url is relative to current directory;
575             # Case 2: old url contains a trailing slash, eg. http://www.fib.com/bigfib/;
576             # may or may not contain trailing filename
577             elsif ($old_url =~ m{ (\w+://\S+/) }x ) {
578 0 0         if ($VERBOSE) { print "Case 2: append to $1\n"; }
  0            
579 0           return $1 . $new_url;
580             }
581             # Case 3: old url has no trailing slash, eg. http://www.fab.net
582             else {
583 0 0         if ($VERBOSE) { print "Case 3: append to $old_url/\n"; }
  0            
584 0           return "$old_url/$new_url";
585             }
586            
587             } # End make_url_absolute
588              
589             ###########################################
590             # telnet_http_retry
591             ###########################################
592             # Open a telnet connection to a host and try an HTTP GET for an
593             # url. The response is processed according to status code similarly to
594             # follow_url, and calls follow_url to handle redirects. Uses an LWP
595             # request object b/c that's a convenient way to stick cookies into the
596             # request string.
597             # Note: Handles the Solaris/LWP bug (cf notes above) by reading the
598             # telnet.pm input_log if telnet times out.
599             # Call with: hostname, absolute url, LWP request object, and optional
600             # port (default is $HTTP_DEFAULT_PORT).
601             # Returns: status code, or 'FAIL' if can't make telnet connection.
602             sub telnet_http_retry {
603 0     0 0   my $self=shift;
604 0 0 0       if ($VERBOSE || $DEBUG) {
605 0           print "Telnet HTTP retry...\n";
606             }
607 0           my ($host, $url, $request, $port) = @_;
608 0           my ($telnetAgent, @lines, @buffer, $statusLine, $line, $logfileHandle,
609             $httpVersion, $statusCode, $message, $contentStr, $new_url);
610 0 0         open(LOGFILE, "+>$TELNET_LOGFILE") || warn "Can't open $TELNET_LOGFILE.\n";
611 0 0 0       if (!$port || $port !~ /^\d+$/) {
612 0           $port = $HTTP_DEFAULT_PORT;
613             }
614             # Create agent and open connection.
615 0           $telnetAgent = Net::Telnet->new(Host => $host,
616             Port => $port,
617             Input_log => $TELNET_LOGFILE,
618             Timeout => $AGENT_TIMEOUT,
619             Errmode => "return");
620 0 0         return 'FAIL' unless $telnetAgent; # Can't open telnet connection.
621 0           $telnetAgent->max_buffer_length($AGENT_MAX_RESPONSE);
622              
623             # Send the request.
624 0           $telnetAgent->print($request->as_string, "\n");
625             # Get the response as array of lines.
626 0           while (@buffer = $telnetAgent->getlines) {
627 0           push (@lines, @buffer);
628             }
629 0 0         if ($telnetAgent->timed_out) {
630 0 0         if ($VERBOSE) {
631 0           print "Telnet http timed out. Using input log...\n";
632             }
633 0           undef @lines;
634 0           while () {
635 0           push (@lines, $_);
636             }
637 0 0         close LOGFILE or warn "Problem closing $TELNET_LOGFILE.\n";
638             }
639 0 0         if (!@lines) {
640 0 0         if ($VERBOSE) {
641 0           print "No data received.\n";
642             }
643 0           return 'FAIL';
644             }
645 0 0         if ($DEBUG) {
646 0           print @lines,"\n";
647             }
648 0           $statusLine = shift @lines;
649             # We can only process status line and headers if the response is HTTP/1.0 or
650             # better. This regexp copied from LWP::Protocol::http.pm.
651 0 0         if ($statusLine =~ /^(HTTP\/\d+\.\d+)[ \t]+(\d+)[ \t]*([^\012]*)\012/) {
652             # HTTP/1.0 response or better
653 0           ($httpVersion, $statusCode, $message) = ($1, $2, $3);
654 0           chomp $message;
655 0 0         if ($VERBOSE) {
656 0           print "Status line: $httpVersion $statusCode $message \n\n";
657             }
658              
659 0 0         if ($statusCode =~ /2../) {
    0          
    0          
660 0           while ($line = shift @lines) { # Flatten array of lines.
661 0           $contentStr .= $line;
662             }
663 0           $new_url = check_for_meta_refresh($contentStr);
664 0 0         if ($new_url ne "") {
665 0 0         if (exists($url_hash{$new_url})) {
666 0 0         if ($VERBOSE) {
667 0           print "This url already visited ... returning $statusCode.\n";
668             }
669 0           return $statusCode;
670             } else {
671 0 0         if ($VERBOSE) {
672 0           print "Refresh to: $new_url\n";
673             }
674             # Return whatever status code we get from new url
675 0           return $self->follow_url($new_url, $url);
676             }
677             } else {
678 0           return $statusCode;
679             }
680             } elsif ($statusCode =~ /3../) {
681 0 0         if ($VERBOSE) {
682 0           print "Proper redirect...\n";
683             }
684 0           $new_url = get_location_header(\@lines);
685 0 0         if ($new_url ne "") {
686 0 0         if (exists($url_hash{$new_url})) {
687 0 0         if ($VERBOSE) {
688 0           print "This url already visited ... returning $statusCode.\n";
689             }
690 0           return $statusCode;
691             } else {
692 0 0         if ($VERBOSE) {
693 0           print "Redirect to: $new_url\n";
694             }
695             # Return whatever status code we get from new url
696 0           return $self->follow_url($new_url, $url);
697             }
698             } else {
699 0           return $statusCode;
700             }
701             } elsif ($statusCode =~ m{4.. | 5..}x) {
702 0           return $statusCode;
703             }
704             } # if valid status line
705             else {
706 0           unshift(@lines, $statusLine);
707             }
708             # If no status line, could be HTTP/0.9 server, which just sends
709             # back content. If it contains a tag like , assume it's
710             # okay.
711 0 0         if ($VERBOSE) {
712 0           print "Assuming HTTP/0.9 or less... \n";
713             }
714 0           while ($line = shift @lines) { # Flatten array of lines.
715 0           $contentStr .= $line;
716             }
717 0 0         if ($contentStr =~ /
718 0           return $HTTP_0_9_OKAY;
719             } else {
720 0           return $HTTP_0_9_FAIL;
721             }
722              
723             } # end sub telnet_http_retry
724              
725             ###########################################
726             # END (Unused snippets and test results, below)
727             ###########################################
728              
729             # NOTES:
730              
731             # 1. It would be nice to have a robust facility for absolutizing
732             # URLs. I tried using URI.pm for this purpose and found it to be not
733             # robust. EG., it allows the construction of: http:/www.yahoo.com,
734             # which is not well-formed.
735             # 2. Tolerance of meta refresh tag match?
736             # 3. some duplicate code went from follow_url to the
737             # telnet_http_retry; could be factored.
738              
739              
740             1; #Spoilt children / happy / required even