File Coverage

blib/lib/HTTP/Cache/Transparent.pm
Criterion Covered Total %
statement 144 165 87.2
branch 47 90 52.2
condition 17 37 45.9
subroutine 20 21 95.2
pod 1 1 100.0
total 229 314 72.9


line stmt bran cond sub pod time code
1             package HTTP::Cache::Transparent;
2              
3 2     2   125054 use strict;
  2         4  
  2         110  
4              
5             our $VERSION = '1.3';
6              
7             =head1 NAME
8              
9             HTTP::Cache::Transparent - Cache the result of http get-requests persistently.
10              
11             =head1 SYNOPSIS
12              
13             use LWP::Simple;
14             use HTTP::Cache::Transparent;
15              
16             HTTP::Cache::Transparent::init( {
17             BasePath => '/tmp/cache',
18             } );
19              
20             my $data = get( 'http://www.sn.no' );
21              
22             =head1 DESCRIPTION
23              
24             An implementation of http get that keeps a local cache of fetched
25             pages to avoid fetching the same data from the server if it hasn't
26             been updated. The cache is stored on disk and is thus persistent
27             between invocations.
28              
29             Uses the http-headers If-Modified-Since and ETag to let the server
30             decide if the version in the cache is up-to-date or not.
31              
32             The cache is implemented by modifying the LWP::UserAgent class to
33             seamlessly cache the result of all requests that can be cached.
34              
35             =head1 INITIALIZING THE CACHE
36              
37             HTTP::Cache::Transparent provides an init-method that sets the
38             parameters for the cache and overloads a method in LWP::UserAgent
39             to activate the cache.After init has been called, the normal
40             LWP-methods (LWP::Simple as well as the more full-fledged
41             LWP::Request methods) should be used as usual.
42              
43             =over 4
44              
45             =cut
46              
47 2     2   10 use Carp;
  2         4  
  2         164  
48 2     2   540 use LWP::UserAgent;
  2         36595  
  2         80  
49 2     2   13 use HTTP::Status qw/RC_NOT_MODIFIED RC_OK RC_PARTIAL_CONTENT status_message/;
  2         2  
  2         294  
50              
51 2     2   11 use Digest::MD5 qw/md5_hex/;
  2         2  
  2         91  
52 2     2   1097 use IO::File;
  2         7739  
  2         245  
53 2     2   1069 use File::Copy;
  2         5395  
  2         126  
54 2     2   12 use File::Path;
  2         2  
  2         119  
55 2     2   9 use Cwd;
  2         3  
  2         620  
56              
57             # These are the response-headers that we should store in the
58             # cache-entry and recreate when we return a cached response.
59             my @cache_headers = qw/Content-Type Content-Encoding
60             Content-Length Content-Range
61             Last-Modified/;
62              
63             my $basepath;
64             my $maxage;
65             my $verbose;
66             my $noupdate;
67             my $approvecontent;
68              
69             my $org_simple_request;
70              
71             =item init
72              
73             Initialize the HTTP cache. Takes a single parameter which is a
74             hashref containing named arguments to the object.
75              
76             HTTP::Cache::Transparent::init( {
77              
78             # Directory to store the cache in.
79             BasePath => "/tmp/cache",
80              
81             # How many hours should items be kept in the cache
82             # after they were last requested?
83             # Default is 8*24.
84             MaxAge => 8*24,
85              
86             # Print progress-messages to STDERR.
87             # Default is 0.
88             Verbose => 1,
89              
90             # If a request is made for a url that has been requested
91             # from the server less than NoUpdate seconds ago, the
92             # response will be generated from the cache without
93             # contacting the server.
94             # Default is 0.
95             NoUpdate => 15*60,
96              
97             # When a url has been downloaded and the response indicates that
98             # has been modified compared to the content in the cache,
99             # the ApproveContent callback is called with the HTTP::Response.
100             # The callback shall return true if the response shall be used and
101             # stored in the cache or false if the response shall be discarded
102             # and the response in the cache used instead.
103             # This mechanism can be used to work around servers that return errors
104             # intermittently. The default is to accept all responses.
105             ApproveContent => sub { return $_[0]->is_success },
106             } );
107              
108             The directory where the cache is stored must be writable. It must also only
109             contain files created by HTTP::Cache::Transparent.
110              
111             =cut
112              
113             my $initialized = 0;
114             sub init {
115 1     1 1 1005 my( $arg ) = @_;
116              
117             defined( $arg->{BasePath} )
118 1 50       4 or croak( "You must specify a BasePath" );
119              
120 1         2 $basepath = $arg->{BasePath};
121              
122 1 50       12 if( not -d $basepath ) {
123 0         0 eval { mkpath($basepath) };
  0         0  
124 0 0       0 if ($@) {
125 0         0 print STDERR "$basepath is not a directory and cannot be created: $@\n";
126 0         0 exit 1;
127             }
128             }
129              
130             # Append a trailing slash if it is missing.
131 1         7 $basepath =~ s%([^/])$%$1/%;
132              
133 1   50     6 $maxage = $arg->{MaxAge} || 8*24;
134 1   50     4 $verbose = $arg->{Verbose} || 0;
135 1   50     3 $noupdate = $arg->{NoUpdate} || 0;
136 1   50 0   11 $approvecontent = $arg->{ApproveContent} || sub { return 1; };
  0         0  
137              
138             # Make sure that LWP::Simple does not use its simplified
139             # get-method that bypasses LWP::UserAgent.
140 1         2 $LWP::Simple::FULL_LWP++;
141              
142 1 50       4 unless ($initialized++) {
143 1         1 $org_simple_request = \&LWP::UserAgent::simple_request;
144              
145 2     2   18 no warnings;
  2         3  
  2         3497  
146 1         4 *LWP::UserAgent::simple_request = \&_simple_request_cache
147             }
148             }
149              
150             =item Initializing from use-line
151              
152             An alternative way of initializing HTTP::Cache::Transparent is to supply
153             parameters in the use-line. This allows you to write
154              
155             use HTTP::Cache::Transparent ( BasePath => '/tmp/cache' );
156              
157             which is exactly equivalent to
158              
159             use HTTP::Cache::Transparent;
160             HTTP::Cache::Transparent::init( BasePath => '/tmp/cache' );
161              
162             The advantage to using this method is that you can do
163              
164             perl -MHTTP::Cache::Transparent=BasePath,/tmp/cache myscript.pl
165              
166             or even set the environment variable PERL5OPT
167              
168             PERL5OPT=-MHTTP::Cache::Transparent=BasePath,/tmp/cache
169             myscript.pl
170              
171             and have all the http-requests performed by myscript.pl go through the
172             cache without changing myscript.pl
173              
174             =back
175              
176             =cut
177              
178             sub import {
179 2     2   22 my( $module, %args ) = @_;
180 2 50       2528 return if (scalar(keys(%args)) == 0);
181              
182 0         0 HTTP::Cache::Transparent::init( \%args );
183             }
184              
185             END {
186 2     2   1261 _remove_old_entries();
187             }
188              
189             sub _simple_request_cache {
190 3     3   10011753 my($self, $r, $content_cb, $read_size_hint) = @_;
191              
192 3         7 my $res;
193              
194 3 50 33     10 if( $r->method eq "GET" and
      33        
195             not defined( $r->header( 'If-Modified-Since' ) ) and
196             not defined( $content_cb ) ) {
197 3 50       220 print STDERR "Fetching " . $r->uri
198             if( $verbose );
199              
200 3         9 my $url = $r->uri->as_string;
201 3         74 my $key = $url;
202 3 50       10 $key .= "\n" . $r->header('Range')
203             if defined $r->header('Range');
204              
205 3         382 my $filename = $basepath . _urlhash( $url );
206              
207 3         4 my $fh;
208             my $meta;
209              
210 3 100       83 if( -s $filename ) {
211 2 50       21 $fh = new IO::File "< $filename"
212             or die "Failed to read from $filename";
213              
214 2         154 $meta = _read_meta( $fh );
215              
216 2 50       10 if( $meta->{Url} eq $url ) {
217             $meta->{'Range'} = ""
218 2 50       10 unless defined( $meta->{'Range'} );
219              
220             # Check that the Range is the same for this request as
221             # for the one in the cache.
222 2 50 33     8 if( (not defined( $r->header( 'Range' ) ) ) or
223             $r->header( 'Range' ) eq $meta->{'Range'} ) {
224             $r->header( 'If-Modified-Since', $meta->{'Last-Modified'} )
225 2 50       79 if exists( $meta->{'Last-Modified'} );
226              
227             $r->header( 'If-None-Match', $meta->{ETag} )
228 2 50       79 if( exists( $meta->{ETag} ) );
229             }
230             }
231             else {
232 0         0 warn "Cache collision: $url and $meta->{Url} have the same md5sum";
233             }
234             }
235              
236 3 100 100     93 if( defined( $meta->{'X-HCT-LastUpdated'} ) and
237             $noupdate > (time - $meta->{'X-HCT-LastUpdated'} ) ) {
238 1 50       3 print STDERR " from cache without checking with server.\n"
239             if $verbose;
240              
241 1         9 $res = HTTP::Response->new( $meta->{Code} );
242 1         49 $res->request($r);
243 1         10 _get_from_cachefile( $filename, $fh, $res, $meta );
244 1 50       61 $fh->close()
245             if defined $fh;;
246              
247             # Set X-No-Server-Contact header as content delivered without contact with external server
248 1         7 $res->header( "X-No-Server-Contact", 1 );
249              
250 1         63 return $res;
251             }
252              
253 2         7 $res = &$org_simple_request( $self, $r );
254              
255 2 100 33     625472 if( $res->code == RC_NOT_MODIFIED ) {
    50          
256 1 50       27 print STDERR " from cache.\n"
257             if( $verbose );
258              
259 1         9 _get_from_cachefile( $filename, $fh, $res, $meta );
260              
261 1 50       38 $fh->close()
262             if defined $fh;;
263              
264             # We need to rewrite the cache-entry to update X-HCT-LastUpdated
265 1         12 _write_cache_entry( $filename, $url, $r, $res );
266 1         136 return $res;
267             }
268             elsif( defined( $meta->{'X-HCT-LastUpdated'} )
269 0         0 and not &{$approvecontent}( $res ) ) {
270 0 0       0 print STDERR " from cache since the response was not approved.\n"
271             if( $verbose );
272              
273 0         0 _get_from_cachefile( $filename, $fh, $res, $meta );
274              
275 0 0       0 $fh->close()
276             if defined $fh;;
277              
278             # Do NOT update the cache!
279              
280 0         0 return $res;
281             }
282             else {
283 1 50       20 $fh->close()
284             if defined $fh;;
285              
286 1         11 my $content = $res->content;
287 1 50       16 $content = "" if not defined $content;
288              
289 1 50 33     5 if( defined( $meta->{MD5} ) and
290             md5_hex( $content ) eq $meta->{MD5} ) {
291 0         0 $res->header( "X-Content-Unchanged", 1 );
292 0 0       0 print STDERR " unchanged"
293             if( $verbose );
294             }
295              
296 1 50       3 print STDERR " from server.\n"
297             if( $verbose );
298              
299 1 50 33     27 _write_cache_entry( $filename, $url, $r, $res )
300             if( $res->code == RC_OK or
301             $res->code == RC_PARTIAL_CONTENT );
302             }
303             }
304             else {
305             # We won't try to cache this request.
306 0         0 $res = &$org_simple_request( $self, $r,
307             $content_cb, $read_size_hint );
308             }
309              
310 1         134 return $res;
311             }
312              
313             sub _get_from_cachefile {
314 2     2   5 my( $filename, $fh, $res, $meta ) = @_;
315              
316 2         3 my $content;
317             my $buf;
318 2         15 while ( $fh->read( $buf, 1024 ) > 0 ) {
319 4         51 $content .= $buf;
320             }
321              
322 2         14 $fh->close();
323              
324 2 50       26 $content = "" if not defined $content;
325              
326             # Set last-accessed for cache-entry.
327 2         3 my $mtime = time;
328 2         77 utime( $mtime, $mtime, $filename );
329              
330             # modify response
331 2 50       11 if( $HTTP::Message::VERSION >= 1.44 ) {
332 2         14 $res->content_ref( \$content );
333             }
334             else {
335 0         0 $res->content( $content );
336             }
337              
338             # For HTTP::Cache::Transparent earlier than 0.4,
339             # there is no Code in the cache.
340 2 50       39 if( defined( $meta->{Code} ) ) {
341 2         7 $res->code( $meta->{Code} );
342             }
343             else {
344 0         0 $res->code( RC_OK );
345             }
346 2   50     18 $res->message(status_message($res->code) || "Unknown code");
347              
348 2         47 foreach my $h (@cache_headers) {
349             $res->header( $h, $meta->{$h} )
350 10 100       174 if defined( $meta->{ $h } );
351             }
352              
353 2         85 $res->header( "X-Cached", 1 );
354 2         145 $res->header( "X-Content-Unchanged", 1 );
355             }
356              
357             # Read metadata and position filehandle at start of data.
358             sub _read_meta {
359 2     2   4 my( $fh ) = @_;
360 2         4 my %meta;
361              
362 2         3 my( $key, $value );
363 2         3 do {
364 18         62 my $line = <$fh>;
365 18         57 ( $key, $value ) = ($line =~ /(\S+)\s+(.*)[\n\r]*/);
366              
367 18 100       70 $meta{$key} = $value
368             if( defined $value );
369             } while( defined( $value ) );
370              
371 2         4 return \%meta;
372             }
373              
374             # Write metadata and position filehandle where data should be written.
375             sub _write_meta {
376 2     2   2 my( $fh, $meta ) = @_;
377              
378 2         4 foreach my $key (sort keys( %{$meta} ) ) {
  2         15  
379 16         45 print $fh "$key $meta->{$key}\n";
380             }
381              
382 2         6 print $fh "\n";
383             }
384              
385             sub _write_cache_entry {
386 2     2   20 my( $filename, $url, $req, $res ) = @_;
387              
388 2         11 my $out_filename = "$filename.tmp$$";
389 2 50       16 my $fh = new IO::File "> $out_filename"
390             or die "Failed to write to $out_filename";
391              
392 2         361 my $meta;
393 2         10 $meta->{Url} = $url;
394 2 50       8 $meta->{ETag} = $res->header('ETag')
395             if defined( $res->header('ETag') );
396              
397 2         130 my $content = $res->content;
398 2 50       27 $content = "" if not defined $content;
399              
400 2         21 $meta->{MD5} = md5_hex( $content );
401 2 50       11 $meta->{Range} = $req->header('Range')
402             if defined( $req->header('Range') );
403 2         64 $meta->{Code} = $res->code;
404 2         16 $meta->{'X-HCT-LastUpdated'} = time;
405              
406 2         4 foreach my $h (@cache_headers) {
407 10 100       311 $meta->{$h} = $res->header( $h )
408             if defined $res->header( $h );
409             }
410              
411 2         92 _write_meta( $fh, $meta );
412              
413 2         3 print $fh $content;
414 2         13 $fh->close;
415              
416 2 50       107 move( $out_filename, $filename ) || unlink $out_filename;
417             }
418              
419             sub _urlhash {
420 3     3   5 my( $url ) = @_;
421              
422 3         23 return md5_hex( $url );
423             }
424              
425             sub _remove_old_entries {
426 2 100 66 2   45 if( defined( $basepath ) and -d( $basepath ) ) {
427 1         10 my $oldcwd = getcwd();
428 1         15 chdir( $basepath );
429              
430 1         74 my @files = glob("*");
431 1         3 foreach my $file (@files) {
432 1 50       7 if( $file !~ m%^[0-9a-f]{32}$% ) {
433 0         0 print STDERR "HTTP::Cache::Transparent: Unknown file found in cache directory: $basepath$file\n";
434             }
435             else {
436 1         7 my $age = (-M $file);
437             # The file may have disappeared if another process has cleaned
438             # the cache.
439 1 50 33     10 if( defined($age) && ( $age*24 > $maxage ) ) {
440 0 0       0 print STDERR "Deleting $file.\n"
441             if( $verbose );
442 0         0 unlink( $file );
443             }
444             }
445             }
446              
447 1         24 chdir( $oldcwd );
448             }
449             }
450              
451             =head1 INSPECTING CACHE BEHAVIOR
452              
453             The HTTP::Cache::Transparent inserts three special headers in the
454             HTTP::Response object. These can be accessed via the
455             HTTP::Response::header()-method.
456              
457             =over 4
458              
459             =item X-Cached
460              
461             This header is inserted and set to 1 if the response is delivered from
462             the cache instead of from the server.
463              
464             =item X-Content-Unchanged
465              
466             This header is inserted and set to 1 if the content returned is the same
467             as the content returned the last time this url was fetched. This header
468             is always inserted and set to 1 when the response is delivered from
469             the cache.
470              
471             =item X-No-Server-Contact
472              
473             This header is inserted and set to 1 if the content returned has been
474             delivered without any contact with the external server, i.e. no
475             conditional or unconditional HTTP GET request has been sent, the content
476             has been delivered directly from cache. This may be useful when seeking
477             to control loading of the external server.
478              
479             =back
480              
481             =head1 LIMITATIONS
482              
483             This module has a number of limitations that you should be aware of
484             before using it.
485              
486             =over 4
487              
488             =item -
489              
490             There is no upper limit to how much diskspace the cache requires. The
491             only limiting mechanism is that data for urls that haven't been requested
492             in the last MaxAge hours will be removed from the cache the next time
493             the program exits.
494              
495             =item -
496              
497             Currently, only get-requests that store the result in memory (i.e. do
498             not use the option to have the result stored directly in a file or
499             delivered via a callback) is cached. I intend to remove this limitation
500             in a future version.
501              
502             =item -
503              
504             The support for Ranges is a bit primitive. It creates a new object in
505             the cache for each unique combination of url and range. This will work ok
506             as long as you always request the same range(s) for a url.
507              
508             =item -
509              
510             The cache doesn't properly check and store all headers in the HTTP
511             request and response. Therefore, if you request the same url repeatedly
512             with different sets of headers (cookies, accept-encoding etc), and these
513             headers affect the response from the server, the cache may return the
514             wrong response.
515              
516             =item -
517              
518             HTTP::Cache::Transparent has not been tested with threads, and will
519             most likely not work if you use them.
520              
521             =back
522              
523             =head1 CACHE FORMAT
524              
525             The cache is stored on disk as one file per cached object. The filename
526             is equal to the md5sum of the url and the Range-header if it exists.
527             The file contains a set of
528             key/value-pairs with metadata (one entry per line) followed by a blank
529             line and then the actual data returned by the server.
530              
531             The last modified date of the cache file is set to the time when the
532             cache object was last requested by a user.
533              
534             =head1 AUTHOR
535              
536             Mattias Holmlund, E$firstname -at- $lastname -dot- seE
537             L
538              
539             =head1 GIT REPOSITORY
540              
541             A git repository containing the source for this module can be found
542             via http://git.holmlund.se/
543              
544             =head1 COPYRIGHT AND LICENSE
545              
546             Copyright (C) 2004-2007 by Mattias Holmlund
547              
548             This library is free software; you can redistribute it and/or modify
549             it under the same terms as Perl itself, either Perl version 5.8.4 or,
550             at your option, any later version of Perl 5 you may have available.
551              
552              
553             =cut
554              
555             1;