File Coverage

blib/lib/LWP/UserAgent/Cached.pm
Criterion Covered Total %
statement 129 134 96.2
branch 48 56 85.7
condition 13 18 72.2
subroutine 16 16 100.0
pod 6 6 100.0
total 212 230 92.1


line stmt bran cond sub pod time code
1             package LWP::UserAgent::Cached;
2              
3 2     2   62406 use strict;
  2         6  
  2         79  
4 2     2   12 use Carp;
  2         3  
  2         178  
5 2     2   22 use Digest::MD5;
  2         5  
  2         64  
6 2     2   2258 use HTTP::Response;
  2         43278  
  2         69  
7 2     2   16 use base 'LWP::UserAgent';
  2         4  
  2         5465  
8              
9             our $VERSION = '0.06';
10              
11             sub new {
12 3     3 1 155340 my ($class, %opts) = @_;
13            
14 3         8 my $cache_dir = delete $opts{cache_dir};
15 3         7 my $nocache_if = delete $opts{nocache_if};
16 3         6 my $recache_if = delete $opts{recache_if};
17 3         9 my $on_uncached = delete $opts{on_uncached};
18 3         7 my $cachename_spec = delete $opts{cachename_spec};
19 3         25 my $self = $class->SUPER::new(%opts, parse_head => 0);
20            
21 3         106 $self->{cache_dir} = $cache_dir;
22 3         10 $self->{nocache_if} = $nocache_if;
23 3         6 $self->{recache_if} = $recache_if;
24 3         6 $self->{on_uncached} = $on_uncached;
25 3         6 $self->{cachename_spec} = $cachename_spec;
26            
27 3         13 return $self;
28             }
29              
30             # generate getters and setters
31             foreach my $opt_name (qw(cache_dir nocache_if recache_if on_uncached cachename_spec)) {
32 2     2   95690 no strict 'refs';
  2         4  
  2         2882  
33             *$opt_name = sub {
34 21     21   4989 my $self = shift;
35 21 100       56 if (@_) {
36 13         31 my $opt_val = $self->{$opt_name};
37 13         24 $self->{$opt_name} = shift;
38 13         45 return $opt_val;
39             }
40            
41 8         28 return $self->{$opt_name};
42             }
43             }
44              
45             sub parse_head {
46 3     3 1 107705 my ($self, $bool) = @_;
47            
48 3 50       15 if ($bool) {
49 0         0 die "parse_head() is disabled, because it may cause encoding troubles while saving cache";
50             }
51            
52 3         20 $self->SUPER::parse_head($bool);
53             }
54              
55             sub simple_request {
56 29     29 1 33456 my $self = shift;
57 29 50       106 unless (defined $self->{cache_dir}) {
58 0         0 return $self->SUPER::simple_request(@_);
59             }
60            
61 29         48 my $request = $_[0];
62 29         48 eval{ $self->prepare_request($request) };
  29         95  
63 29         14660 my $fpath = $self->_get_cache_name($request);
64 29         2380 my $response;
65             my $no_collision_suffix;
66            
67 29 100       98 unless ($self->{was_redirect}) {
68 23         38 @{$self->{last_cached}} = ();
  23         59  
69 23         32 @{$self->{last_used_cache}} = ();
  23         51  
70             }
71            
72 29 100       1230 if (-e $fpath) {
73 17 100       61 unless ($response = $self->_parse_cached_response($fpath, $request)) {
74             # collision
75 3 100       379 if (my @cache_list = <$fpath-*>) {
76 2         7 foreach my $cache_file (@cache_list) {
77 3 100       9 if ($response = $self->_parse_cached_response($cache_file, $request)) {
78 1         2 $fpath = $cache_file;
79 1         3 last;
80             }
81             }
82            
83 2 100       9 unless ($response) {
84 1         7 $no_collision_suffix = sprintf('-%03d', substr($cache_list[-1], -3) + 1);
85             }
86             }
87             else {
88 1         3 $no_collision_suffix = '-001';
89             }
90             }
91            
92 17 100 100     124 if ($response && defined($self->{recache_if}) && $self->{recache_if}->($response, $fpath, $request)) {
      66        
93 1         1583 $response = undef;
94             }
95             }
96            
97 29 100       75 unless ($response) {
98 15 100       47 if (defined $self->{on_uncached}) {
99 1         5 $self->{on_uncached}->($request);
100             }
101            
102 15         93 $response = $self->SUPER::simple_request(@_);
103            
104 15 100 66     6210 if (!defined($self->{nocache_if}) || !$self->{nocache_if}->($response)) {
105 14 100       87 if (defined $no_collision_suffix) {
106 2         4 $fpath .= $no_collision_suffix;
107             }
108            
109 14 50       1635 if (open my $fh, '>:raw', $fpath) {
110 14         62 print $fh $request->url, "\n";
111 14         279 print $fh $response->as_string;
112 14         1716 close $fh;
113            
114 14         23 push @{$self->{last_cached}}, $fpath;
  14         51  
115 14         27 push @{$self->{last_used_cache}}, $fpath;
  14         73  
116             }
117             else {
118 0         0 carp "open('$fpath', 'w'): $!";
119             }
120             }
121             }
122             else {
123 14         16 push @{$self->{last_used_cache}}, $fpath;
  14         43  
124             }
125            
126 29   66     131 $self->{was_redirect} = $response->is_redirect && _in($request->method, $self->requests_redirectable);
127 29         303 return $response;
128             }
129              
130             sub last_cached {
131 3     3 1 1869 my $self = shift;
132 3         142 return exists $self->{last_cached} ?
133 3 50       13 @{$self->{last_cached}} : ();
134             }
135              
136             sub last_used_cache {
137 2     2 1 5 my $self = shift;
138 2         11 return exists $self->{last_used_cache} ?
139 2 50       9 @{$self->{last_used_cache}} : ();
140             }
141              
142             sub uncache {
143 1     1 1 52 my $self = shift;
144 1         5 unlink $_ for $self->last_cached;
145             }
146              
147             sub _get_cache_name {
148 30     30   148 my ($self, $request) = @_;
149            
150 30 100 66     119 if (defined($self->{cachename_spec}) && %{$self->{cachename_spec}}) {
  5         37  
151 5         19 my $tmp_request = $request->clone();
152 5         745 my $leave_only_specified;
153 5 100       22 if (exists $self->{cachename_spec}{_headers}) {
154 3 50       15 ref $self->{cachename_spec}{_headers} eq 'ARRAY'
155             or croak 'cachename_spec->{_headers} should be array ref';
156 3         5 $leave_only_specified = 1;
157             }
158            
159 5         17 foreach my $hname ($tmp_request->headers->header_field_names) {
160 15 100 66     425 if (exists $self->{cachename_spec}{$hname}) {
    100          
161 5 100       17 if (defined $self->{cachename_spec}{$hname}) {
162 4         12 $tmp_request->headers->header($hname, $self->{cachename_spec}{$hname});
163             }
164             else {
165 1         4 $tmp_request->headers->remove_header($hname);
166             }
167             }
168             elsif ($leave_only_specified && !_in($hname, $self->{cachename_spec}{_headers})) {
169 9         25 $tmp_request->headers->remove_header($hname);
170             }
171             }
172            
173 5 100       139 if (exists $self->{cachename_spec}{_body}) {
174 3         12 $tmp_request->content($self->{cachename_spec}{_body});
175             }
176            
177 5         66 return $self->{cache_dir} . '/' . Digest::MD5::md5_hex($tmp_request->as_string);
178             }
179            
180 25         115 return $self->{cache_dir} . '/' . Digest::MD5::md5_hex($request->as_string);
181             }
182              
183             sub _parse_cached_response {
184 20     20   45 my ($self, $cache_file, $request) = @_;
185            
186 20         26 my $fh;
187 20 50       938 unless (open $fh, '<:raw', $cache_file) {
188 0         0 carp "open('$cache_file', 'r'): $!";
189 0         0 return;
190             }
191            
192 20         312 my $url = <$fh>;
193 20         120 $url =~ s/\s+$//;
194 20 100       84 if ($url ne $request->url) {
195 5         131 close $fh;
196 5         32 return;
197             }
198            
199 15         238 local $/ = undef;
200 15         179 my $response_str = <$fh>;
201 15         438 close $fh;
202            
203 15         107 my $response = HTTP::Response->parse($response_str);
204 15         1628 $response->request($request);
205            
206 15 50       161 if ($self->cookie_jar) {
207 15         133 $self->cookie_jar->extract_cookies($response);
208             }
209            
210 15         2610 return $response;
211             }
212              
213             sub _in($$) {
214 15     15   873 my ($what, $where) = @_;
215            
216 15         38 foreach my $item (@$where) {
217 7 100       45 return 1 if ($what eq $item);
218             }
219            
220 9         41 return 0;
221             }
222              
223             1;
224              
225             =pod
226              
227             =head1 NAME
228              
229             LWP::UserAgent::Cached - LWP::UserAgent with simple caching mechanism
230              
231             =head1 SYNOPSIS
232              
233             use LWP::UserAgent::Cached;
234            
235             my $ua = LWP::UserAgent::Cached->new(cache_dir => '/tmp/lwp-cache');
236             my $resp = $ua->get('http://google.com/'); # makes http request
237            
238             ...
239            
240             $resp = $ua->get('http://google.com/'); # no http request - will get it from the cache
241              
242             =head1 DESCRIPTION
243              
244             When you process content from some website, you fetch pages one by one and extract some data from each
245             page with a regexp, a DOM parser or something else. Sometimes we make errors in our data extractors and realize this
246             only when all 1_000_000 pages have been processed. We have to fix our extraction logic and start the whole process from the
247             beginning. Please STOP! How about a cache? Yes, you can cache all responses, and the second, third and later attempts will
248             be very fast.
249              
250             LWP::UserAgent::Cached is yet another LWP::UserAgent subclass with cache support. It stores
251             the cache in files on the local filesystem and, if a response is already available in the cache, returns it instead of making an HTTP request.
252             This module was written because the other available alternatives didn't meet my needs:
253              
254             =over
255              
256             =item L<LWP::UserAgent::WithCache>
257              
258             caches responses on local filesystem and gets it from the cache only if online document was not modified
259              
260             =item L<LWP::UserAgent::Cache::Memcached>
261              
262             same as above but stores cache in memory
263              
264             =item L<LWP::UserAgent::Snapshot>
265              
266             can record responses in the cache or get responses from the cache, but not both for one useragent
267              
268             =item L<LWP::UserAgent::OfflineCache>
269              
270             seems it may cache responses and get responses from the cache, but has too many dependencies and an unclear
271             `delay' parameter
272              
273             =back
274              
275             =head1 METHODS
276              
277             All LWP::UserAgent methods and several new ones.
278              
279             =head2 new(...)
280              
281             Creates a new LWP::UserAgent::Cached object. Since LWP::UserAgent::Cached is an LWP::UserAgent subclass it accepts all the same
282             parameters, but in addition it has some new optional parameters:
283              
284             L</cache_dir>
285              
286             L</nocache_if>
287              
288             L</recache_if>
289              
290             L</on_uncached>
291              
292             L</cachename_spec>
293              
294             LWP::UserAgent::Cached creation example:
295              
296             my $ua = LWP::UserAgent::Cached->new(cache_dir => 'cache/lwp', nocache_if => sub {
297             my $response = shift;
298             return $response->code >= 500; # do not cache any bad response
299             }, recache_if => sub {
300             my ($response, $path, $request) = @_;
301             return $response->code == 404 && -M $path > 1; # recache any 404 response older than 1 day
302             }, on_uncached => sub {
303             my $request = shift;
304             sleep 5 if $request->uri =~ '/category/\d+'; # delay before http requests inside "/category"
305             }, cachename_spec => {
306             'User-Agent' => undef, # omit agent while calculating cache name
307             });
308              
309             =head2 cache_dir() or cache_dir($dir)
310              
311             Gets or sets path to the directory where cache will be stored.
312             If not set, the useragent will behave as LWP::UserAgent without cache support.
313              
314             =head2 nocache_if() or nocache_if($sub)
315              
316             Gets or sets reference to subroutine which will be called after receiving each non-cached response. First parameter
317             of this subroutine will be HTTP::Response object. This subroutine should return true if this response should
318             not be cached and false otherwise. If not set all responses will be cached.
319              
320             =head2 recache_if() or recache_if($sub)
321              
322             Gets or sets reference to subroutine which will be called for each response available in the cache. First parameter
323             of this subroutine will be HTTP::Response object, second - path to file with cache, third - HTTP::Request object.
324             This subroutine should return true if response needs to be recached (new HTTP request will be made) and false otherwise.
325             This $sub will be called only if the response is already available in the cache. Here you can also modify the request for your needs.
326             This will not change name of the file with cache.
327              
328             =head2 on_uncached() or on_uncached($sub)
329              
330             Gets or sets reference to subroutine which will be called for each non-cached http request, before the actual request.
331             First parameter of this subroutine will be HTTP::Request object. Here you can also modify request for your needs.
332             This will not change name of the file with cache.
333              
334             =head2 cachename_spec() or cachename_spec($spec)
335              
336             Gets or sets hash reference to the cache naming specification. In fact the cache name for each request is based on the request content.
337             Internally it is md5_hex($request->as_string). But what if some of the request headers in your program change dynamically, e.g.
338             User-Agent or Cookie? In such a case caching will not work properly for you. We need some way to omit these headers when calculating
339             the cache name. This option is what you need. The specification hash should contain header names and the header values which will be used
340             (instead of the values in the request) while calculating the cache name.
341              
342             For example, we already have a cache where the 'User-Agent' value in the headers was 'Mozilla/5.0', but in the current version of the program
343             it changes for each request. So we specify that for cache name calculation 'User-Agent' should always be 'Mozilla/5.0'. The cached
344             request had no 'Accept' header, but in the current version it has one. So we specify that this header should not be included in cache name
345             calculation.
346              
347             cachename_spec => {
348             'User-Agent' => 'Mozilla/5.0',
349             'Accept' => undef
350             }
351              
352             Specification hash may contain two special keys: '_body' and '_headers'. With '_body' key you can specify body content in the request
353             for cache name calculation. For example to not include body content in cache name calculation set '_body' to undef or empty string.
354             With '_headers' key you can specify which headers should be included in $request for cache name calculation. For example you can say to
355             include only 'Host' and 'Referer'. '_headers' value should be array reference:
356              
357             cachename_spec => {
358             _body => undef, # omit body
359             _headers => ['Host'], # include only host with value from request
360             # It will be smth like:
361             # md5_hex("METHOD url\r\nHost: host\r\n\r\n")
362             # method and url will be included in any case
363             }
364              
365             Another example. Omit body, include only 'Host' and 'User-Agent' headers, use the 'Host' value from the request and the specified 'User-Agent' value;
366             in addition include the referrer with the specified value ('Referer' is not listed in '_headers', but values from the main specification hash have
367             higher priority):
368              
369             cachename_spec => {
370             _body => '',
371             _headers => ['Host', 'User-Agent'],
372             'User-Agent' => 'Mozilla/5.0',
373             'Referer' => 'http://www.com'
374             }
375              
376             One more example. Calculate cache name based only on method and url:
377              
378             cachename_spec => {
379             _body =>'',
380             _headers => []
381             }
382              
383             =head2 last_cached()
384              
385             Returns list of paths to the cache files stored by the last non-cached response. The list may contain more than one
386             element if there was a redirect.
387              
388             =head2 last_used_cache()
389              
390             Returns list of paths to the cache files used in the last response. This includes files just stored (last_cached)
391             and files that may already exist (cached earlier). The list may contain more than one element if there was a redirect.
392              
393             =head2 uncache()
394              
395             Removes last response from the cache. Use case example:
396              
397             my $page = $ua->get($url)->decoded_content;
398             if ($page =~ /Access for this ip was blocked/) {
399             $ua->uncache();
400             }
401              
402             =head1 Proxy and cache name
403              
404             Here you can see how changing the proxy of the useragent affects the cache name
405              
406             =head2 HTTP proxy
407              
408             HTTP proxy support works out of the box and causes no problems. Changing of proxy server will not affect cache name
409              
410             =head2 HTTPS proxy
411              
412             Proper HTTPS proxy support added in LWP since 6.06 and causes no problems. Changing of proxy server will not affect cache name
413              
414             =head2 CONNECT proxy
415              
416             CONNECT proxy support may be added using L<LWP::Protocol::connect>. The problem is that this module uses
417             LWP's request() for creation of the CONNECT tunnel, so this response will be cached. But in fact it shouldn't. To work around this
418             you need to install a C<nocache_if> hook
419              
420             $ua->nocache_if(sub {
421             my $resp = shift;
422             # do not cache creation of tunnel
423             $resp->request->method eq 'CONNECT';
424             });
425              
426             After that it works without problems. Changing of proxy server will not affect cache name
427              
428             =head2 SOCKS proxy
429              
430             SOCKS proxy support may be added using L<LWP::Protocol::socks> and causes no problems.
431             Changing of proxy server will not affect cache name
432              
433             =head1 SEE ALSO
434              
435             L<LWP::UserAgent>
436              
437             =head1 COPYRIGHT
438              
439             Copyright Oleg G .
440              
441             This library is free software; you can redistribute it and/or
442             modify it under the same terms as Perl itself.
443              
444             =cut