File Coverage

lib/Weather/GHCN/CacheURI.pm
Criterion Covered Total %
statement 138 183 75.4
branch 14 42 33.3
condition 5 15 33.3
subroutine 26 33 78.7
pod 9 9 100.0
total 192 282 68.0


line stmt bran cond sub pod time code
1             # Weather::GHCN::CacheURI.pm - class for fetching from a URI, with file caching
2              
3             ## no critic (Documentation::RequirePodAtEnd)
4              
5             =head1 NAME
6              
7             Weather::GHCN::CacheURI - URI page fetch with file-based caching
8              
9             =head1 VERSION
10              
11             version v0.0.011
12              
13             =head1 SYNOPSIS
14              
15             use Weather::GHCN::CacheURI;
16              
17             # put files cached by fetch() in $cachedir and refresh if not changed this year
18             my $cache_uri = Weather::GHCN::CacheURI->new($cachedir, 'yearly');
19              
20             $cache_uri->clean_cache; # empty the cache
21              
22             # this will cause fetch to do a network access
23             my ($from_cache, $content) = $cache_uri->fetch($uri);
24              
25             # depending on the refresh option, this will either fetch the content
26             # from the cache, or get a fresher copy from the network
27             my ($from_cache, $content) = $cache_uri->fetch($uri);
28              
29             # fetch calls these to access the cached file according to the
30             # refresh rule and the state of the cached file and the web page
31              
32             my $content = $cache_uri->loca($uri);
33             $cache_uri->store($uri, $content);
34              
35             =head1 DESCRIPTION
36              
37             This cache module enables callers to fetch web pages and store the
38             content on the filesystem so that it can be retrieved subsequently
39             without a network access.
40              
41             Unlike caching performed by Fetch::URI or LWP, no Etags or
42             Last-Modified-Date or other data is included with the content data.
43             This metadata can be an obstacle to platform portability.
44             Essentially, just utf-8 page content that is stored. That should be
45             neutral enough that the cache file can be used on another platform.
46             This is a benefit to unit testing, because tests can be constructed
47             to fetch pages, and the cached pages can be packaged with the tests.
48             This allows the tests to run faster, and without network access.
49              
50             The approach is simple, and geared towards accessing and caching
51             the content of the NOAA GHCN weather repository. The files in that
52             repository are simple ASCII files with uncomplicated names. The
53             caching algorithm simply strips off the URI path and stores the file
54             using the filename found in the repository; e.g. 'ghcnd-stations.txt' or
55             'CA006105887.dly'. All files are kept in the cache directory, since
56             all filenames are expected to be unique.
57              
58             =cut
59              
60             ## no critic [ValuesAndExpressions::ProhibitVersionStrings]
61             ## no critic [TestingAndDebugging::RequireUseWarnings]
62              
63 6     6   112321 use v5.18; # minimum for Object::Pad
  6         37  
64 6     6   1216 use Object::Pad 0.66 qw( :experimental(init_expr) );
  6         23483  
  6         32  
65              
66             package Weather::GHCN::CacheURI;
67             class Weather::GHCN::CacheURI;
68              
69             our $VERSION = 'v0.0.011';
70              
71              
72 6     6   2356 use Carp qw(carp croak);
  6         1668  
  6         361  
73 6     6   933 use Const::Fast;
  6         5283  
  6         34  
74 6     6   491 use Fcntl qw( :DEFAULT );
  6         13  
  6         2285  
75 6     6   2972 use File::stat;
  6         47965  
  6         33  
76 6     6   2308 use Path::Tiny;
  6         27751  
  6         321  
77 6     6   1148 use Try::Tiny;
  6         4330  
  6         367  
78 6     6   3417 use Time::Piece 1.32;
  6         73570  
  6         40  
79 6     6   2933 use LWP::Simple;
  6         352046  
  6         60  
80              
81             const my $TRUE => 1; # perl's usual TRUE
82             const my $FALSE => not $TRUE; # a dual-var consisting of '' and 0
83             const my $EMPTY => q();
84             const my $FSLASH => q(/);
85             const my $ONE_DAY => 24*60*60; # number of seconds in a day
86              
87             const my $REFRESH_RE => qr{ \A ( yearly | never | always | \d+ ) \Z }xms;
88              
89             # First return value from _fetch methods indicating whether the fetch
90             # was from the cache or the web page URI
91             const my $FROM_CACHE => $TRUE;
92             const my $FROM_URI => $FALSE;
93              
94 0     0 1 0 field $_cachedir :reader :param {};
  0         0  
95 0     0 1 0 field $_refresh :reader :param {};
  0         0  
96              
97             BUILD ($cachedir, $refresh) {
98             $_cachedir //= $cachedir;
99             $_refresh //= lc $refresh;
100             croak '*E* invalid refresh option' unless $_refresh =~ $REFRESH_RE;
101             }
102              
103             =head1 FIELDS
104              
105             =head2 cachedir
106              
107             Returns the cache location defined when the object was instantiated.
108              
109             =head2 refresh
110              
111             Returns the refresh option that was defined when the object was
112             instantiated. See new(),
113              
114             =cut
115              
116             =head1 METHODS
117              
118             =head2 new ($cachedir, $refresh)
119              
120             New instances of this class must be provided a location for the cache
121             files upon creation ($cachedir). This directory must exist or the
122             new() will fail. Similarly, $refresh must be a valid value, one of:
123              
124             =over 4
125              
126             =item refresh 'yearly'
127              
128             The origin HTTP server is contacted and the page refreshed if the
129             cached file has not been changed within the current year. The
130             rationale for this, and for this being the default, is that the GHCN
131             data for the current year will always be incomplete, and that will
132             skew any statistical analysis and so should normally be truncated.
133             If the user needs the data for the current year, they should use a
134             refresh value of 'always' or a number.
135              
136             =item refresh 'never'
137              
138             The origin HTTP is never contacted, regardless of the page being in
139             cache or not. If the page is missing from cache, the fetch method will
140             return undef. If the page is in cache, that page will be returned, no
141             matter how old it is.
142              
143             =item refresh 'always'
144              
145             If a page is in the cache, the origin HTTP server is always checked for
146             a fresher copy
147              
148             =item refresh
149              
150             The origin HTTP server is not contacted if the page is in cache
151             and the cached page was inserted within the last days.
152             Otherwise the server is checked for a fresher page.
153              
154             =back
155              
156             =head2 clean_cache
157              
158             Removes all the files in the cache, but leaves the cache directory.
159             Returns a list of errors for any files that couldn't be removed.
160              
161             =cut
162              
163 1     1 1 1588 method clean_cache () {
  1         3  
  1         2  
164 1         7 my $re = qr{ \A ( ghcnd-\w+[.]txt | \w+[.]dly ) \Z }xms;
165 1         4 my @files = path($_cachedir)->children( $re );
166 1         244 my @errors;
167 1         3 foreach my $f (@files) {
168             try {
169 3     3   125 $f->remove;
170             } catch {
171 0     0   0 push @errors, "*E* unable to remove $f: $_";
172 3         150 };
173             }
174 1         65 return @errors;
175             }
176              
177             =head2 clean_data_cache
178              
179             Removes all the daily weather data files (*.dly) from the cache, but
180             leaves the cache directory. Returns a list of errors for any files
181             that couldn't be removed.
182              
183             =cut
184              
185 1     1 1 11463 method clean_data_cache () {
  1         3  
  1         3  
186             # delete the daily weather data files in the cache
187 1         8 my $re = qr{ \A \w+[.]dly \Z }xms;
188 1         6 my @files = path($_cachedir)->children( $re );
189 1         217 my @errors;
190 1         4 foreach my $f (@files) {
191             try {
192 1     1   68 $f->remove;
193             } catch {
194 0     0   0 push @errors, "*E* unable to remove $f: $_";
195 1         12 };
196             }
197 1         102 return @errors;
198             }
199              
200             =head2 clean_station_cache
201              
202             Removes the station list and station inventory files (ghcnd-*.txt)
203             from the cache, but leaves the cache directory. Returns a list of
204             errors for any files that couldn't be removed.
205              
206             =cut
207              
208 1     1 1 1310 method clean_station_cache () {
  1         2  
  1         2  
209             # delete the station list and inventory files in the cache
210 1         6 my $re = qr{ \A ghcnd-\w+[.]txt \Z }xms;
211 1         5 my @files = path($_cachedir)->children( $re );
212 1         200 my @errors;
213 1         5 foreach my $f (@files) {
214             try {
215 2     2   89 $f->remove;
216             } catch {
217 0     0   0 push @errors, "*E* unable to remove $f: $_";
218 2         93 };
219             }
220 1         70 return @errors;
221             }
222              
223             =head2 fetch ($uri, $refresh="yearly")
224              
225             Fetch the web page given by the URI $uri, returning its content
226             and caching it. If a cached entry for it exists, and is current
227             according to the refresh option, then the cached entry is returned.
228              
229             =cut
230              
231 55     55 1 4426 method fetch ($uri) {
  55         147  
  55         196  
  55         111  
232              
233 55         142 my $from_cache;
234             my $content;
235              
236 55 50       223 carp '*W* no cache directory specified therefore no caching of HTTP queries available'
237             if not $_cachedir;
238              
239 55 50 33     2945 carp '*W* cache location specified but doesn\'t exist, therefore no caching of HTTP queries available'
240             if $_cachedir and not -d $_cachedir;
241              
242 55 50 33     996 if (not $_cachedir or not -d $_cachedir) {
243 0         0 ($from_cache, $content) = $self->_fetch_without_cache($uri);
244 0         0 return ($from_cache, $content);
245             }
246              
247 55 50       435 if ($_refresh eq 'always') {
    100          
    50          
248 0         0 ($from_cache, $content) = $self->_fetch_refresh_always($uri);
249             }
250             elsif ($_refresh eq 'never') {
251 54         294 ($from_cache, $content) = $self->_fetch_refresh_never($uri);
252             }
253             elsif ($_refresh eq 'yearly') {
254 1         8 my $cutoff_mtime = localtime->truncate( to => 'year' );
255 1         361 ($from_cache, $content) = $self->_fetch_refresh_n_days($uri, $cutoff_mtime);
256             } else {
257 0 0       0 croak unless $_refresh =~ m{ \A \d+ \Z }xms;
258 0         0 my $cutoff_mtime = localtime->truncate( to => 'day') - ( $_refresh * $ONE_DAY );
259 0         0 ($from_cache, $content) = $self->_fetch_refresh_n_days($uri, $cutoff_mtime);
260             }
261              
262 55         458 return ($from_cache, $content);
263             }
264              
265             =head2 load ($uri)
266              
267             Load a previously fetched and stored $uri from the file cache and
268             returns the content. Uses Path::Tiny->slurp_utf8, which will lock
269             the file during the operation and which uses a binmode of
270             :unix:encoding(UTF-8) for platform portability of the files.
271              
272             =cut
273              
274 56     56 1 3378 method load ($uri) {
  56         171  
  56         137  
  56         100  
275 56         292 my $file = $self->_path_to_key($uri);
276              
277 56 100 66     1267 if ( defined $file && -f $file ) {
278 55         344 return _read_file($file);
279             } else {
280 1         6 return;
281             }
282             }
283              
284             =head2 store ($uri, $content)
285              
286             Stores content obtained from a URI using fetch() into a file in the
287             cache. The filename is derived from the tail end of the URI.
288              
289             Uses Path::Tiny->spew_utf8, which writes data to the file atomically.
290             The file is written to a temporary file in the cache directory, then
291             renamed over the original.
292              
293             A binmode of :unix:encoding(UTF-8) (i.e. PerlIO::utf8_strict) is
294             used, unless Unicode::UTF8 0.58+ is installed. In that case, the content
295             will be encoded by Unicode::UTF8 and written using spew_raw.
296              
297             The idea is to store data in a platform-neutral fashion, so cached
298             files can be used for unit testing on multiple platforms.
299              
300             =cut
301              
302 1     1 1 11122 method store ($uri, $content) {
  1         3  
  1         4  
  1         2  
  1         2  
303              
304 1 50       25 croak '*E* cache directory doesn\'t exist: ' . $_cachedir
305             unless -d $_cachedir;
306              
307 1         7 my $store_file = $self->_path_to_key($uri);
308 1 50       4 return if not defined $store_file;
309              
310             # path($dir)->make_path( $dir, mode => $_dir_create_mode )
311             # if not -d $dir;
312              
313 1         5 _write_file( $store_file, $content );
314             }
315              
316             # method purge_cache($mtime) {
317             # delete daily data files older than $mtime
318             # }
319              
320             =head2 remove ($uri)
321              
322             Remove the cache file associated with this URI.
323              
324             =cut
325              
326 1     1 1 1113 method remove ($uri) {
  1         4  
  1         3  
  1         3  
327 1 50       4 my $file = $self->_path_to_key($uri)
328             or return;
329 1         84 unlink $file;
330             }
331              
332              
333              
334             #---------------------------------------------------------------------
335             # Private methods
336             #---------------------------------------------------------------------
337              
338 54     54   184 method _fetch_refresh_never ($uri) {
  54         111  
  54         173  
  54         119  
339             # use the cache only
340 54         261 my $key = $self->_uri_to_key($uri);
341 54         312 my $content = $self->load($key);
342 54         1809041 return ($FROM_CACHE, $content);
343             }
344              
345 0     0   0 method _fetch_refresh_always ($uri) {
  0         0  
  0         0  
  0         0  
346             # check for a fresher copy on the server
347 0         0 my $key = $self->_uri_to_key($uri);
348 0         0 my $file = $self->_path_to_key($key);
349              
350 0         0 my $st = stat $file;
351              
352             # if we have a cached file, check to see if the page is newer
353 0 0       0 if ($st) {
354 0 0       0 my ($ctype, $doclen, $mtime, $exp, $svr) = head($uri)
355             or croak '*E* unable to fetch header for: ' . $uri;
356              
357 0 0       0 if ($mtime > $st->mtime) {
358             # page changed since it was cached
359 0         0 my $content = get($uri);
360 0 0       0 $self->store($key, $content) if $content;
361 0         0 return ($FROM_URI, $content);
362             } else {
363             # page is unchanged, so use the cached file
364 0         0 my $content = $self->load($key);
365 0         0 return ($FROM_CACHE, $content);
366             }
367             }
368              
369             # there's no cached file, so get the page from the URI and cache it
370 0         0 my $content = get($uri);
371 0 0       0 $self->store($key, $content) if $content;
372 0         0 return ($FROM_URI, $content);
373             }
374              
375 1     1   4 method _fetch_refresh_n_days ($uri, $cutoff_mtime) {
  1         11  
  1         5  
  1         3  
  1         2  
376             # check whether the cache or page is older than N days
377             # if the cache file is younger than N days ago, use it
378             # otherwise get the latest page from the server
379             # check the server if the file is older than this year
380              
381 1         6 my $key = $self->_uri_to_key($uri);
382 1         5 my $file = $self->_path_to_key($key);
383              
384 1         7 my $st = stat $file;
385              
386 1 50 33     315 if ($st and $st->mtime >= $cutoff_mtime) {
387             # the cached file we have is at or new than the cutoff, so we'll use it
388 1         131 my $content = $self->load($key);
389 1         100498 return ($FROM_CACHE, $content);
390             }
391              
392             # get the mtime for the URI
393 0 0       0 my ($ctype, $doclen, $mtime, $exp, $svr) = head($uri)
394             or croak '*E* unable to fetch header for: ' . $uri;
395              
396             # our cached file is older than the cutoff, but if it's up to date
397             # with the web page then we can use it
398 0 0 0     0 if ($st and $st->mtime >= $mtime) {
399             # web page hasn't changed since it was cached. so we'll use it
400 0         0 my $content = $self->load($key);
401 0         0 return ($FROM_CACHE, $content);
402             }
403              
404             # there's no cached file, or the cached file is out of date, so
405             # we get the page from the URI and cache it
406 0         0 my $content = get($uri);
407 0 0       0 $self->store($key, $content) if $content;
408 0         0 return ($FROM_URI, $content);
409             }
410              
411 0     0   0 method _fetch_without_cache ($uri) {
  0         0  
  0         0  
  0         0  
412             # check for a fresher copy on the server
413 0         0 my $key = $self->_uri_to_key($uri);
414              
415 0         0 my $content = get($uri);
416 0         0 return ($FROM_URI, $content);
417             }
418              
419 120     120   6171 method _uri_to_key ($uri) {
  120         187  
  120         238  
  120         199  
420 120         897 my @parts = split m{ $FSLASH }xms, $uri;
421 120         311 my $key = $parts[-1]; # use the last part as the key
422              
423             # this transformation is for testing using CPAN pages and is not
424             # necessary for the NOAA GHCN pages we actually deal with
425 120         344 $key =~ s{ [:] }{}xmsg;
426              
427 120         415 return $key;
428             }
429              
430 62     62   2802 method _path_to_key ($uri) {
  62         143  
  62         114  
  62         115  
431 62 50       236 return if not defined $uri;
432              
433 62         208 my $key = $self->_uri_to_key( $uri );
434              
435 62         464 my $filepath = path($_cachedir)->child($key)->stringify;
436              
437 62         7139 return $filepath;
438             }
439              
440             ######################################################################
441             # Private subroutines
442             ######################################################################
443              
444 55     55   119 sub _read_file ( $file ) {
  55         165  
  55         111  
445 55         215 return path($file)->slurp_utf8;
446             }
447              
448 1     1   2 sub _write_file ( $file, $data ) {
  1         2  
  1         2  
  1         2  
449 1         4 return path($file)->spew_utf8( $data );
450             }
451              
452              
453             1;
454              
455             __END__