File Coverage

blib/lib/Combine/UA.pm
Criterion Covered Total %
statement 12 88 13.6
branch 0 40 0.0
condition 0 9 0.0
subroutine 4 7 57.1
pod 0 3 0.0
total 16 147 10.8


line stmt bran cond sub pod time code
1             # Copyright (c) 1996-1998 LUB NetLab
2             #
3             # This program is free software; you can redistribute it and/or modify
4             # it under the terms of the GNU General Public License as published by
5             # the Free Software Foundation; either version 1, or (at your option)
6             # any later version.
7             #
8             # This program is distributed in the hope that it will be useful,
9             # but WITHOUT ANY WARRANTY; without even the implied warranty of
10             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11             # GNU General Public License for more details.
12             #
13             # You should have received a copy of the GNU General Public License
14             # along with this program; if not, write to the Free Software
15             # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16             #
17             #
18             # NO WARRANTY
19             #
20             # BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
21             # FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
22             # OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
23             # PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
24             # OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
25             # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
26             # TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
27             # PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
28             # REPAIR OR CORRECTION.
29             #
30             # IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
31             # WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
32             # REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
33             # INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
34             # OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
35             # TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
36             # YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
37             # PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
38             # POSSIBILITY OF SUCH DAMAGES.
39             #
40             # Copyright (c) 1996-1998 LUB NetLab
41              
42             # $Id: UA.pm 257 2008-09-03 08:23:32Z anders $
43              
44              
45             # COMB/XWI/UA.pm - harvesting robots with XWI interface
46             # v0.01 by Yong Cao, 1997-08-08
47              
48             package Combine::UA;
49              
50 1     1   632 use strict;
  1         1  
  1         39  
51 1     1   511 use Combine::Config;
  1         4  
  1         30  
52 1     1   1289 use LWP::UserAgent;
  1         55460  
  1         39  
53 1     1   11 use HTTP::Date;
  1         2  
  1         1060  
54              
55             my $expGar;
56             my $userAgentGetIfModifiedSince;
57              
58             sub TruncatingUserAgent {
59              
60             # This function returns an LWP::UserAgent that truncates incoming data
61             # when a number of bytes, that's dictated by Combine's configuration set,
62             # has been received.
63             #
64             # Experiments (1999-02-02) have shown that the truncation is approximate
65             # in that the resulting document size may vary up or down a few percents
66             # or kilobytes.
67              
68 0     0 0   my $ua = new LWP::UserAgent();
69             # $ua->max_size(COMB::Config::GetMaxDocSize()); #Problem with webservers returning 206 partial content in a multipart
70 0           $ua->timeout(Combine::Config::Get('UAtimeout'));
71 0           $ua->agent("Combine/3 http://combine.it.lth.se/");
72 0           $ua->from(Combine::Config::Get('Operator-Email'));
73 0           $ua->default_header('Accept-Encoding' => 'gzip');
74 0 0         if (Combine::Config::Get('httpProxy')) {
75 0           $ua->proxy(['http', 'https'], Combine::Config::Get('httpProxy'));
76             }
77 0           $expGar = Combine::Config::Get('WaitIntervalExpirationGuaranteed');
78 0           $userAgentGetIfModifiedSince = Combine::Config::Get('UserAgentGetIfModifiedSince');
79 0           return $ua;
80             }
81              
82              
83             sub fetch { # use get-if-modified-since
84 0     0 0   my ($xwi, $since) = @_;
85 0           my ($url_str, $ua, $req, $resp, $code, $msg, $method, $type, $ext);
86 0           $ua = TruncatingUserAgent();
87             #FIX! $since = $jcf->ftime unless $since;
88 0 0         $since = time - $expGar unless $since;
89 0           $url_str = $xwi->url;
90 0           $type = ''; #FIX $jcf->typ;
91 0           $method = "GET";
92 0 0         if ( $type ) {
93 0 0         $method = "HEAD" unless defined(${Combine::Config::Get('converters')}{$type});
  0            
94             } else {
95 0 0         if ( $url_str =~ m/\.([^\/\s\.]+)\s*$/ ) {
96 0           $ext = $1;
97 0           $ext =~ tr/A-Z/a-z/;
98 0 0         $method = "HEAD" if defined(${Combine::Config::Get('binext')}{$ext});
  0            
99             }
100             }
101 0 0         if ( $method eq "HEAD" ) {
102 0           $req = new HTTP::Request 'HEAD'=> $url_str;
103 0 0         $req->header('If-Modified-Since' => &time2str($since))
104             if $userAgentGetIfModifiedSince;
105 0 0         if (Combine::Config::Get('UserAgentFollowRedirects')) { $resp = $ua->request($req); }
  0            
106 0           else { $resp = $ua->simple_request($req); }
107 0           $code = $resp->code;
108 0           $msg = $resp->message();
109 0           $method = "";
110 0 0         if ( $code eq "200" ) {
111 0           $type = $resp->header("content-type");
112 0 0 0       $method = "GET" if $type and defined(${Combine::Config::Get('converters')}{$type});
  0            
113             }
114             }
115 0 0         if ( $method eq "GET" ) {
116 0           $req = new HTTP::Request 'GET'=> $url_str;
117 0 0         $req->header('If-Modified-Since' => &time2str($since))
118             if $userAgentGetIfModifiedSince;
119 0 0         if (Combine::Config::Get('UserAgentFollowRedirects')) { $resp = $ua->request($req); }
  0            
120 0           else { $resp = $ua->simple_request($req); }
121 0           $code = $resp->code;
122 0           $msg = $resp->message();
123             # print "$url_str; " . &time2str($since) ."; $code; $msg\n";
124             }
125              
126 0           my @cs=$resp->header('Content-Type');
127 0           foreach my $c (@cs) {
128 0           $xwi->meta_add('content-type',$c);
129             }
130              
131 0           $xwi->stat($code);
132             #BEHÖVS??? $xwi->url($url_str);
133 0           $xwi->server($resp->header("server"));
134 0           $xwi->etag($resp->header("etag"));
135 0           my $t = $resp->content_type;
136 0           $xwi->type($t);
137 0           $t = $resp->content_language;
138 0 0         if (defined($t)) {$xwi->meta_add('content-language',$t);}
  0            
139 0           $xwi->length($resp->header("content-length"));
140 0           $xwi->location($resp->header("location"));
141 0           $xwi->base($resp->base);
142             #Numeric gives error message '... too small'
143             # $xwi->expiryDate(&check_date($resp->expires));
144 0           $xwi->modifiedDate(&check_date($resp->header("last-modified")));
145 0           $xwi->expiryDate(&check_date($resp->header("expires")));
146             #? $xwi->checkedDate(&check_date($resp->header("date")));
147 0 0         $xwi->checkedDate(time) unless $xwi->checkedDate;
148 0 0 0       if ($code eq "200" or $code eq "206") {
149 0 0 0       if ( $method eq "GET" and length($resp->content_ref) > 0 ) {
150 0           $xwi->truncated($resp->headers()->header('X-Content-Range'));
151             }
152 0 0         if ($resp->decoded_content( 'ref' => 1 )) {
153 0           $xwi->content($resp->decoded_content( 'ref' => 1 ));
154             } else {
155 0           $xwi->content($resp->content_ref);
156             #CHECK if gzip encoded anyhow?
157             }
158             }
159 0           return ($code, $msg);
160             }
161              
162             sub check_date { # makes sure the date is in a correct format (UnixTime)
163 0     0 0   my ($str) = @_;
164 0           my $tim = undef;
165 0 0         if ( $str ) {
166 0           eval { $tim = &str2time( $str ) };
  0            
167 0           return $tim;
168             }
169             }
170              
171             1;