File Coverage

blib/lib/LWP/UserAgent/Cached.pm
Criterion Covered Total %
statement 128 133 96.2
branch 48 56 85.7
condition 14 18 77.7
subroutine 16 16 100.0
pod 6 6 100.0
total 212 229 92.5


line stmt bran cond sub pod time code
1             package LWP::UserAgent::Cached;
2              
3 2     2   140606 use strict;
  2         14  
  2         49  
4 2     2   9 use Carp;
  2         2  
  2         90  
5 2     2   9 use Digest::MD5;
  2         3  
  2         46  
6 2     2   421 use HTTP::Response;
  2         22465  
  2         74  
7 2     2   12 use base 'LWP::UserAgent';
  2         4  
  2         1257  
8              
9             our $VERSION = '0.08';
10              
11             sub new {
12 3     3 1 13501 my ($class, %opts) = @_;
13            
14 3         10 my $cache_dir = delete $opts{cache_dir};
15 3         7 my $nocache_if = delete $opts{nocache_if};
16 3         8 my $recache_if = delete $opts{recache_if};
17 3         6 my $on_uncached = delete $opts{on_uncached};
18 3         8 my $cachename_spec = delete $opts{cachename_spec};
19 3         26 my $self = $class->SUPER::new(%opts, parse_head => 0);
20            
21 3         149 $self->{cache_dir} = $cache_dir;
22 3         7 $self->{nocache_if} = $nocache_if;
23 3         6 $self->{recache_if} = $recache_if;
24 3         17 $self->{on_uncached} = $on_uncached;
25 3         6 $self->{cachename_spec} = $cachename_spec;
26            
27 3         32 return $self;
28             }
29              
30             # generate getters and setters
31             foreach my $opt_name (qw(cache_dir nocache_if recache_if on_uncached cachename_spec)) {
32 2     2   32572 no strict 'refs';
  2         5  
  2         1986  
33             *$opt_name = sub {
34 21     21   6967 my $self = shift;
35 21 100       59 if (@_) {
36 13         40 my $opt_val = $self->{$opt_name};
37 13         22 $self->{$opt_name} = shift;
38 13         205 return $opt_val;
39             }
40            
41 8         29 return $self->{$opt_name};
42             }
43             }
44              
45             sub parse_head {
46 3     3 1 10042 my ($self, $bool) = @_;
47            
48 3 50       21 if ($bool) {
49 0         0 die "parse_head() is disabled, because it may cause encoding troubles while saving cache";
50             }
51            
52 3         24 $self->SUPER::parse_head($bool);
53             }
54              
55             sub simple_request {
56 34     34 1 36965 my $self = shift;
57 34 50       101 unless (defined $self->{cache_dir}) {
58 0         0 return $self->SUPER::simple_request(@_);
59             }
60            
61 34         45 my $request = $_[0];
62 34         222 $request = $self->prepare_request($request);
63 34         21165 my $fpath = $self->_get_cache_name($request);
64 34         4878 my $response;
65             my $no_collision_suffix;
66            
67 34 100       91 unless ($self->{was_redirect}) {
68 28         47 @{$self->{last_cached}} = ();
  28         58  
69 28         36 @{$self->{last_used_cache}} = ();
  28         58  
70             }
71            
72 34 100       1122 if (-e $fpath) {
73 19 100       81 unless ($response = $self->_parse_cached_response($fpath, $request)) {
74             # collision
75 3 100       233 if (my @cache_list = <$fpath-*>) {
76 2         8 foreach my $cache_file (@cache_list) {
77 3 100       9 if ($response = $self->_parse_cached_response($cache_file, $request)) {
78 1         3 $fpath = $cache_file;
79 1         2 last;
80             }
81             }
82            
83 2 100       6 unless ($response) {
84 1         8 $no_collision_suffix = sprintf('-%03d', substr($cache_list[-1], -3) + 1);
85             }
86             }
87             else {
88 1         5 $no_collision_suffix = '-001';
89             }
90             }
91            
92 19 100 100     105 if ($response && defined($self->{recache_if}) && $self->{recache_if}->($response, $fpath, $request)) {
      66        
93 1         1733 $response = undef;
94             }
95             }
96            
97 34 100       138 unless ($response) {
98 18 100       57 if (defined $self->{on_uncached}) {
99 1         5 $self->{on_uncached}->($request);
100             }
101            
102 18         310 $response = $self->send_request(@_);
103            
104 18 100 100     764 if (!defined($self->{nocache_if}) || !$self->{nocache_if}->($response)) {
105 17 100       71 if (defined $no_collision_suffix) {
106 2         161 $fpath .= $no_collision_suffix;
107             }
108            
109 17 50       1893 if (open my $fh, '>:raw', $fpath) {
110 17         102 print $fh $request->url, "\n";
111 17         793 print $fh $response->as_string("\n");
112 17         2521 close $fh;
113            
114 17         333 push @{$self->{last_cached}}, $fpath;
  17         70  
115 17         32 push @{$self->{last_used_cache}}, $fpath;
  17         81  
116             }
117             else {
118 0         0 carp "open('$fpath', 'w'): $!";
119             }
120             }
121             }
122             else {
123 16         22 push @{$self->{last_used_cache}}, $fpath;
  16         38  
124             }
125            
126 34   66     131 $self->{was_redirect} = $response->is_redirect && _in($request->method, $self->requests_redirectable);
127 34         368 return $response;
128             }
129              
130             sub last_cached {
131 3     3 1 6445 my $self = shift;
132             return exists $self->{last_cached} ?
133 3 50       55 @{$self->{last_cached}} : ();
  3         75  
134             }
135              
136             sub last_used_cache {
137 2     2 1 7 my $self = shift;
138             return exists $self->{last_used_cache} ?
139 2 50       7 @{$self->{last_used_cache}} : ();
  2         11  
140             }
141              
142             sub uncache {
143 1     1 1 48 my $self = shift;
144 1         4 unlink $_ for $self->last_cached;
145             }
146              
147             sub _get_cache_name {
148 35     35   319 my ($self, $request) = @_;
149            
150 35 100 66     123 if (defined($self->{cachename_spec}) && %{$self->{cachename_spec}}) {
  5         19  
151 5         14 my $tmp_request = $request->clone();
152 5         1230 my $leave_only_specified;
153 5 100       19 if (exists $self->{cachename_spec}{_headers}) {
154 3 50       10 ref $self->{cachename_spec}{_headers} eq 'ARRAY'
155             or croak 'cachename_spec->{_headers} should be array ref';
156 3         5 $leave_only_specified = 1;
157             }
158            
159 5         12 foreach my $hname ($tmp_request->headers->header_field_names) {
160 15 100 66     352 if (exists $self->{cachename_spec}{$hname}) {
    100          
161 5 100       11 if (defined $self->{cachename_spec}{$hname}) {
162 4         11 $tmp_request->headers->header($hname, $self->{cachename_spec}{$hname});
163             }
164             else {
165 1         3 $tmp_request->headers->remove_header($hname);
166             }
167             }
168             elsif ($leave_only_specified && !_in($hname, $self->{cachename_spec}{_headers})) {
169 9         18 $tmp_request->headers->remove_header($hname);
170             }
171             }
172            
173 5 100       164 if (exists $self->{cachename_spec}{_body}) {
174 3         8 $tmp_request->content($self->{cachename_spec}{_body});
175             }
176            
177 5         58 return $self->{cache_dir} . '/' . Digest::MD5::md5_hex($tmp_request->as_string("\n"));
178             }
179            
180 30         162 return $self->{cache_dir} . '/' . Digest::MD5::md5_hex($request->as_string("\n"));
181             }
182              
183             sub _parse_cached_response {
184 22     22   56 my ($self, $cache_file, $request) = @_;
185            
186 22         107 my $fh;
187 22 50       810 unless (open $fh, '<:raw', $cache_file) {
188 0         0 carp "open('$cache_file', 'r'): $!";
189 0         0 return;
190             }
191            
192 22         288 my $url = <$fh>;
193 22         168 $url =~ s/\s+$//;
194 22 100       88 if ($url ne $request->url) {
195 5         116 close $fh;
196 5         32 return;
197             }
198            
199 17         577 local $/ = undef;
200 17         218 my $response_str = <$fh>;
201 17         181 close $fh;
202            
203 17         283 my $response = HTTP::Response->parse($response_str);
204 17         3086 $response->request($request);
205            
206 17 50       167 if ($self->cookie_jar) {
207 17         182 $self->cookie_jar->extract_cookies($response);
208             }
209            
210 17         3427 return $response;
211             }
212              
213             sub _in($$) {
214 15     15   202 my ($what, $where) = @_;
215            
216 15         24 foreach my $item (@$where) {
217 7 100       161 return 1 if ($what eq $item);
218             }
219            
220 9         22 return 0;
221             }
222              
223             1;
224              
225             =pod
226              
227             =head1 NAME
228              
229             LWP::UserAgent::Cached - LWP::UserAgent with simple caching mechanism
230              
231             =head1 SYNOPSIS
232              
233             use LWP::UserAgent::Cached;
234            
235             my $ua = LWP::UserAgent::Cached->new(cache_dir => '/tmp/lwp-cache');
236             my $resp = $ua->get('http://google.com/'); # makes http request
237            
238             ...
239            
240             $resp = $ua->get('http://google.com/'); # no http request - will get it from the cache
241              
242             =head1 DESCRIPTION
243              
244             When you process content from some website, you will get page one by one and extract some data from this
245             page with regexp, DOM parser or smth else. Sometimes we makes errors in our data extractors and realize this
246             only when all 1_000_000 pages were processed. We should fix our extraction logic and start all process from the
247             beginning. Please STOP! How about cache? Yes, you can cache all responses and second, third and other attempts will
248             be very fast.
249              
250             LWP::UserAgent::Cached is yet another LWP::UserAgent subclass with cache support. It stores
251             cache in the files on local filesystem and if response already available in the cache returns it instead of making HTTP request.
252             This module was writed because other available alternatives didn't meet my needs:
253              
254             =over
255              
256             =item L
257              
258             caches responses on local filesystem and gets it from the cache only if online document was not modified
259              
260             =item L
261              
262             same as above but stores cache in memory
263              
264             =item L
265              
266             can record responses in the cache or get responses from the cache, but not both for one useragent
267              
268             =item L
269              
270             seems it may cache responses and get responses from the cache, but has too much dependencies and unclear
271             `delay' parameter
272              
273             =back
274              
275             =head1 METHODS
276              
277             All LWP::UserAgent methods and several new.
278              
279             =head2 new(...)
280              
281             Creates new LWP::UserAgent::Cached object. Since LWP::UserAgent::Cached is LWP::UserAgent subclass it has all same
282             parameters, but in additional it has some new optional pararmeters:
283              
284             L
285              
286             L
287              
288             L
289              
290             L
291              
292             L
293              
294             LWP::UserAgent::Cached creation example:
295              
296             my $ua = LWP::UserAgent::Cached->new(cache_dir => 'cache/lwp', nocache_if => sub {
297             my $response = shift;
298             return $response->code >= 500; # do not cache any bad response
299             }, recache_if => sub {
300             my ($response, $path, $request) = @_;
301             return $response->code == 404 && -M $path > 1; # recache any 404 response older than 1 day
302             }, on_uncached => sub {
303             my $request = shift;
304             sleep 5 if $request->uri =~ '/category/\d+'; # delay before http requests inside "/category"
305             }, cachename_spec => {
306             'User-Agent' => undef, # omit agent while calculating cache name
307             });
308              
309             =head2 cache_dir() or cache_dir($dir)
310              
311             Gets or sets path to the directory where cache will be stored.
312             If not set useragent will behaves as LWP::UserAgent without cache support.
313              
314             =head2 nocache_if() or nocache_if($sub)
315              
316             Gets or sets reference to subroutine which will be called after receiving each non-cached response. First parameter
317             of this subroutine will be HTTP::Response object. This subroutine should return true if this response should
318             not be cached and false otherwise. If not set all responses will be cached.
319              
320             =head2 recache_if() or recache_if($sub)
321              
322             Gets or sets reference to subroutine which will be called for each response available in the cache. First parameter
323             of this subroutine will be HTTP::Response object, second - path to file with cache, third - HTTP::Request object.
324             This subroutine should return true if response needs to be recached (new HTTP request will be made) and false otherwise.
325             This $sub will be called only if response already available in the cache. Here you can also modify request for your needs.
326             This will not change name of the file with cache.
327              
328             =head2 on_uncached() or on_uncached($sub)
329              
330             Gets or sets reference to subroutine which will be called for each non-cached http request, before actually request.
331             First parameter of this subroutine will be HTTP::Request object. Here you can also modify request for your needs.
332             This will not change name of the file with cache.
333              
334             =head2 cachename_spec() or cachename_spec($spec)
335              
336             Gets or sets hash reference to cache naming specification. In fact cache naming for each request based on request content.
337             Internally it is md5_hex($request->as_string("\n")). But what if some of request headers in your program changed dinamically, e.g.
338             User-Agent or Cookie? In such case caching will not work properly for you. We need some way to omit this headers when calculating
339             cache name. This option is what you need. Specification hash should contain header name and header value which will be used
340             (instead of values in request) while calculating cache name.
341              
342             For example we already have cache where 'User-Agent' value in the headers was 'Mozilla/5.0', but in the current version of the program
343             it will be changed for each request. So we force specified that for cache name calculation 'User-Agent' should be 'Mozilla/5.0'. Cached
344             request had not 'Accept' header, but in the current version it has. So we force specified do not include this header for cache name
345             calculation.
346              
347             cachename_spec => {
348             'User-Agent' => 'Mozilla/5.0',
349             'Accept' => undef
350             }
351              
352             Specification hash may contain two special keys: '_body' and '_headers'. With '_body' key you can specify body content in the request
353             for cache name calculation. For example to not include body content in cache name calculation set '_body' to undef or empty string.
354             With '_headers' key you can specify which headers should be included in $request for cache name calculation. For example you can say to
355             include only 'Host' and 'Referer'. '_headers' value should be array reference:
356              
357             cachename_spec => {
358             _body => undef, # omit body
359             _headers => ['Host'], # include only host with value from request
360             # It will be smth like:
361             # md5_hex("METHOD url\r\nHost: host\r\n\r\n")
362             # method and url will be included in any case
363             }
364              
365             Another example. Omit body, include only 'Host' and 'User-Agent' headers, use 'Host' value from request and specified 'User-Agent' value,
366             in addition include referrer with specified value ('Referer' not specified in '_headers', but values from main specification hash has
367             higher priority):
368              
369             cachename_spec => {
370             _body => '',
371             _headers => ['Host', 'User-Agent'],
372             'User-Agent' => 'Mozilla/5.0',
373             'Referer' => 'http://www.com'
374             }
375              
376             One more example. Calculate cache name based only on method and url:
377              
378             cachename_spec => {
379             _body =>'',
380             _headers => []
381             }
382              
383             =head2 last_cached()
384              
385             Returns list with pathes to files with cache stored by last noncached response. List may contain more than one
386             element if there was redirect.
387              
388             =head2 last_used_cache()
389              
390             Returns list with pathes to files with cache used in last response. This includes files just stored (last_cached)
391             and files that may be already exists (cached earlier). List may contain more than one element if there was redirect.
392              
393             =head2 uncache()
394              
395             Removes last response from the cache. Use case example:
396              
397             my $page = $ua->get($url)->decoded_content;
398             if ($page =~ /Access for this ip was blocked/) {
399             $ua->uncache();
400             }
401              
402             =head1 Proxy and cache name
403              
404             Here you can see how changing of proxy for useragent will affect cache name
405              
406             =head2 HTTP proxy
407              
408             HTTP proxy support works out of the box and causes no problems. Changing of proxy server will not affect cache name
409              
410             =head2 HTTPS proxy
411              
412             Proper HTTPS proxy support added in LWP since 6.06 and causes no problems. Changing of proxy server will not affect cache name
413              
414             =head2 CONNECT proxy
415              
416             CONNECT proxy support may be added using L. The problem is that this module uses
417             LWP's request() for creation of CONNECT tunnel, so this response will be cached. But in fact it shouldn't. To workaround this
418             you need to install C hook
419              
420             $ua->nocache_if(sub {
421             my $resp = shift;
422             # do not cache creation of tunnel
423             $resp->request->method eq 'CONNECT';
424             });
425              
426             After that it works without problems. Changing of proxy server will not affect cache name
427              
428             =head2 SOCKS proxy
429              
430             SOCKS proxy support may be added using L and causes no problems.
431             Changing of proxy server will not affect cache name
432              
433             =head1 SEE ALSO
434              
435             L
436              
437             =head1 COPYRIGHT
438              
439             Copyright Oleg G .
440              
441             This library is free software; you can redistribute it and/or
442             modify it under the same terms as Perl itself.
443              
444             =cut