File Coverage

lib/Weather/GHCN/CacheURI.pm
Criterion Covered Total %
statement 125 183 68.3
branch 11 42 26.1
condition 4 15 26.6
subroutine 25 33 75.7
pod 9 9 100.0
total 174 282 61.7


line stmt bran cond sub pod time code
1             # Weather::GHCN::CacheURI.pm - class for fetching from a URI, with file caching
2              
3             ## no critic (Documentation::RequirePodAtEnd)
4              
5             =head1 NAME
6              
7             Weather::GHCN::CacheURI - URI page fetch with file-based caching
8              
9             =head1 VERSION
10              
11             version v0.0.010
12              
13             =head1 SYNOPSIS
14              
15             use Weather::GHCN::CacheURI;
16              
17             # put files cached by fetch() in $cachedir and refresh if not changed this year
18             my $cache_uri = Weather::GHCN::CacheURI->new($cachedir, 'yearly');
19              
20             $cache_uri->clean_cache; # empty the cache
21              
22             # this will cause fetch to do a network access
23             my ($from_cache, $content) = $cache_uri->fetch($uri);
24              
25             # depending on the refresh option, this will either fetch the content
26             # from the cache, or get a fresher copy from the network
27             my ($from_cache, $content) = $cache_uri->fetch($uri);
28              
29             # fetch calls these to access the cached file according to the
30             # refresh rule and the state of the cached file and the web page
31              
32             my $content = $cache_uri->loca($uri);
33             $cache_uri->store($uri, $content);
34              
35             =head1 DESCRIPTION
36              
37             This cache module enables callers to fetch web pages and store the
38             content on the filesystem so that it can be retrieved subsequently
39             without a network access.
40              
41             Unlike caching performed by Fetch::URI or LWP, no Etags or
42             Last-Modified-Date or other data is included with the content data.
43             This metadata can be an obstacle to platform portability.
44             Essentially, just utf-8 page content that is stored. That should be
45             neutral enough that the cache file can be used on another platform.
46             This is a benefit to unit testing, because tests can be constructed
47             to fetch pages, and the cached pages can be packaged with the tests.
48             This allows the tests to run faster, and without network access.
49              
50             The approach is simple, and geared towards accessing and caching
51             the content of the NOAA GHCN weather repository. The files in that
52             repository are simple ASCII files with uncomplicated names. The
53             caching algorithm simply strips off the URI path and stores the file
54             using the filename found in the repository; e.g. 'ghcnd-stations.txt' or
55             'CA006105887.dly'. All files are kept in the cache directory, since
56             all filenames are expected to be unique.
57              
58             =cut
59              
60             ## no critic [ValuesAndExpressions::ProhibitVersionStrings]
61             ## no critic [TestingAndDebugging::RequireUseWarnings]
62              
63 5     5   110353 use v5.18; # minimum for Object::Pad
  5         34  
64 5     5   1271 use Object::Pad 0.66 qw( :experimental(init_expr) );
  5         23026  
  5         26  
65              
66             package Weather::GHCN::CacheURI;
67             class Weather::GHCN::CacheURI;
68              
69             our $VERSION = 'v0.0.010';
70              
71              
72 5     5   1898 use Carp qw(carp croak);
  5         21  
  5         2134  
73 5     5   932 use Const::Fast;
  5         5590  
  5         26  
74 5     5   369 use Fcntl qw( :DEFAULT );
  5         19  
  5         1841  
75 5     5   2550 use File::stat;
  5         40107  
  5         22  
76 5     5   2223 use Path::Tiny;
  5         27713  
  5         280  
77 5     5   1178 use Try::Tiny;
  5         4367  
  5         298  
78 5     5   2757 use Time::Piece 1.32;
  5         61531  
  5         31  
79 5     5   2386 use LWP::Simple;
  5         293458  
  5         40  
80              
81             const my $TRUE => 1; # perl's usual TRUE
82             const my $FALSE => not $TRUE; # a dual-var consisting of '' and 0
83             const my $EMPTY => q();
84             const my $FSLASH => q(/);
85             const my $ONE_DAY => 24*60*60; # number of seconds in a day
86              
87             const my $REFRESH_RE => qr{ \A ( yearly | never | always | \d+ ) \Z }xms;
88              
89             # First return value from _fetch methods indicating whether the fetch
90             # was from the cache or the web page URI
91             const my $FROM_CACHE => $TRUE;
92             const my $FROM_URI => $FALSE;
93              
94 0     0 1 0 field $_cachedir :reader :param {};
  0         0  
95 0     0 1 0 field $_refresh :reader :param {};
  0         0  
96              
97             BUILD ($cachedir, $refresh) {
98             $_cachedir //= $cachedir;
99             $_refresh //= lc $refresh;
100             croak '*E* cache directory does not exist' unless -d $cachedir;
101             croak '*E* invalid refresh option' unless $_refresh =~ $REFRESH_RE;
102             }
103              
104             =head1 FIELDS
105              
106             =head2 cachedir
107              
108             Returns the cache location defined when the object was instantiated.
109              
110             =head2 refresh
111              
112             Returns the refresh option that was defined when the object was
113             instantiated. See new(),
114              
115             =cut
116              
117             =head1 METHODS
118              
119             =head2 new ($cachedir, $refresh)
120              
121             New instances of this class must be provided a location for the cache
122             files upon creation ($cachedir). This directory must exist or the
123             new() will fail. Similarly, $refresh must be a valid value, one of:
124              
125             =over 4
126              
127             =item refresh 'yearly'
128              
129             The origin HTTP server is contacted and the page refreshed if the
130             cached file has not been changed within the current year. The
131             rationale for this, and for this being the default, is that the GHCN
132             data for the current year will always be incomplete, and that will
133             skew any statistical analysis and so should normally be truncated.
134             If the user needs the data for the current year, they should use a
135             refresh value of 'always' or a number.
136              
137             =item refresh 'never'
138              
139             The origin HTTP is never contacted, regardless of the page being in
140             cache or not. If the page is missing from cache, the fetch method will
141             return undef. If the page is in cache, that page will be returned, no
142             matter how old it is.
143              
144             =item refresh 'always'
145              
146             If a page is in the cache, the origin HTTP server is always checked for
147             a fresher copy
148              
149             =item refresh
150              
151             The origin HTTP server is not contacted if the page is in cache
152             and the cached page was inserted within the last days.
153             Otherwise the server is checked for a fresher page.
154              
155             =back
156              
157             =head2 clean_cache
158              
159             Removes all the files in the cache, but leaves the cache directory.
160             Returns a list of errors for any files that couldn't be removed.
161              
162             =cut
163              
164 1     1 1 1481 method clean_cache () {
  1         3  
  1         2  
165 1         6 my $re = qr{ \A ( ghcnd-\w+[.]txt | \w+[.]dly ) \Z }xms;
166 1         4 my @files = path($_cachedir)->children( $re );
167 1         233 my @errors;
168 1         4 foreach my $f (@files) {
169             try {
170 3     3   129 $f->remove;
171             } catch {
172 0     0   0 push @errors, "*E* unable to remove $f: $_";
173 3         179 };
174             }
175 1         70 return @errors;
176             }
177              
178             =head2 clean_data_cache
179              
180             Removes all the daily weather data files (*.dly) from the cache, but
181             leaves the cache directory. Returns a list of errors for any files
182             that couldn't be removed.
183              
184             =cut
185              
186 1     1 1 10245 method clean_data_cache () {
  1         2  
  1         2  
187             # delete the daily weather data files in the cache
188 1         6 my $re = qr{ \A \w+[.]dly \Z }xms;
189 1         5 my @files = path($_cachedir)->children( $re );
190 1         209 my @errors;
191 1         4 foreach my $f (@files) {
192             try {
193 1     1   69 $f->remove;
194             } catch {
195 0     0   0 push @errors, "*E* unable to remove $f: $_";
196 1         14 };
197             }
198 1         98 return @errors;
199             }
200              
201             =head2 clean_station_cache
202              
203             Removes the station list and station inventory files (ghcnd-*.txt)
204             from the cache, but leaves the cache directory. Returns a list of
205             errors for any files that couldn't be removed.
206              
207             =cut
208              
209 1     1 1 1051 method clean_station_cache () {
  1         3  
  1         3  
210             # delete the station list and inventory files in the cache
211 1         5 my $re = qr{ \A ghcnd-\w+[.]txt \Z }xms;
212 1         5 my @files = path($_cachedir)->children( $re );
213 1         200 my @errors;
214 1         7 foreach my $f (@files) {
215             try {
216 2     2   86 $f->remove;
217             } catch {
218 0     0   0 push @errors, "*E* unable to remove $f: $_";
219 2         87 };
220             }
221 1         70 return @errors;
222             }
223              
224             =head2 fetch ($uri, $refresh="yearly")
225              
226             Fetch the web page given by the URI $uri, returning its content
227             and caching it. If a cached entry for it exists, and is current
228             according to the refresh option, then the cached entry is returned.
229              
230             =cut
231              
232 54     54 1 4045 method fetch ($uri) {
  54         109  
  54         117  
  54         134  
233              
234 54         169 my $from_cache;
235             my $content;
236              
237 54 50       206 carp '*W* no cache directory specified therefore no caching of HTTP queries available'
238             if not $_cachedir;
239              
240 54 50 33     2340 carp '*W* cache location specified but doesn\'t exist, therefore no caching of HTTP queries available'
241             if $_cachedir and not -d $_cachedir;
242              
243 54 50 33     1032 if (not $_cachedir or not -d $_cachedir) {
244 0         0 ($from_cache, $content) = $self->_fetch_without_cache($uri);
245 0         0 return ($from_cache, $content);
246             }
247              
248 54 50       499 if ($_refresh eq 'always') {
    50          
    0          
249 0         0 ($from_cache, $content) = $self->_fetch_refresh_always($uri);
250             }
251             elsif ($_refresh eq 'never') {
252 54         366 ($from_cache, $content) = $self->_fetch_refresh_never($uri);
253             }
254             elsif ($_refresh eq 'yearly') {
255 0         0 my $cutoff_mtime = localtime->truncate( to => 'year' );
256 0         0 ($from_cache, $content) = $self->_fetch_refresh_n_days($uri, $cutoff_mtime);
257             } else {
258 0 0       0 croak unless $_refresh =~ m{ \A \d+ \Z }xms;
259 0         0 my $cutoff_mtime = localtime->truncate( to => 'day') - ( $_refresh * $ONE_DAY );
260 0         0 ($from_cache, $content) = $self->_fetch_refresh_n_days($uri, $cutoff_mtime);
261             }
262              
263 54         460 return ($from_cache, $content);
264             }
265              
266             =head2 load ($uri)
267              
268             Load a previously fetched and stored $uri from the file cache and
269             returns the content. Uses Path::Tiny->slurp_utf8, which will lock
270             the file during the operation and which uses a binmode of
271             :unix:encoding(UTF-8) for platform portability of the files.
272              
273             =cut
274              
275 55     55 1 3190 method load ($uri) {
  55         111  
  55         116  
  55         111  
276 55         303 my $file = $self->_path_to_key($uri);
277              
278 55 100 66     1272 if ( defined $file && -f $file ) {
279 54         360 return _read_file($file);
280             } else {
281 1         6 return;
282             }
283             }
284              
285             =head2 store ($uri, $content)
286              
287             Stores content obtained from a URI using fetch() into a file in the
288             cache. The filename is derived from the tail end of the URI.
289              
290             Uses Path::Tiny->spew_utf8, which writes data to the file atomically.
291             The file is written to a temporary file in the cache directory, then
292             renamed over the original.
293              
294             A binmode of :unix:encoding(UTF-8) (i.e. PerlIO::utf8_strict) is
295             used, unless Unicode::UTF8 0.58+ is installed. In that case, the content
296             will be encoded by Unicode::UTF8 and written using spew_raw.
297              
298             The idea is to store data in a platform-neutral fashion, so cached
299             files can be used for unit testing on multiple platforms.
300              
301             =cut
302              
303 1     1 1 9621 method store ($uri, $content) {
  1         3  
  1         2  
  1         3  
  1         2  
304              
305 1 50       26 croak '*E* cache directory doesn\'t exist: ' . $_cachedir
306             unless -d $_cachedir;
307              
308 1         7 my $store_file = $self->_path_to_key($uri);
309 1 50       3 return if not defined $store_file;
310              
311             # path($dir)->make_path( $dir, mode => $_dir_create_mode )
312             # if not -d $dir;
313              
314 1         5 _write_file( $store_file, $content );
315             }
316              
317             # method purge_cache($mtime) {
318             # delete daily data files older than $mtime
319             # }
320              
321             =head2 remove ($uri)
322              
323             Remove the cache file associated with this URI.
324              
325             =cut
326              
327 1     1 1 989 method remove ($uri) {
  1         3  
  1         3  
  1         3  
328 1 50       4 my $file = $self->_path_to_key($uri)
329             or return;
330 1         78 unlink $file;
331             }
332              
333              
334              
335             #---------------------------------------------------------------------
336             # Private methods
337             #---------------------------------------------------------------------
338              
339 54     54   197 method _fetch_refresh_never ($uri) {
  54         118  
  54         156  
  54         108  
340             # use the cache only
341 54         283 my $key = $self->_uri_to_key($uri);
342 54         256 my $content = $self->load($key);
343 54         1804818 return ($FROM_CACHE, $content);
344             }
345              
346 0     0   0 method _fetch_refresh_always ($uri) {
  0         0  
  0         0  
  0         0  
347             # check for a fresher copy on the server
348 0         0 my $key = $self->_uri_to_key($uri);
349 0         0 my $file = $self->_path_to_key($key);
350              
351 0         0 my $st = stat $file;
352              
353             # if we have a cached file, check to see if the page is newer
354 0 0       0 if ($st) {
355 0 0       0 my ($ctype, $doclen, $mtime, $exp, $svr) = head($uri)
356             or croak '*E* unable to fetch header for: ' . $uri;
357              
358 0 0       0 if ($mtime > $st->mtime) {
359             # page changed since it was cached
360 0         0 my $content = get($uri);
361 0 0       0 $self->store($key, $content) if $content;
362 0         0 return ($FROM_URI, $content);
363             } else {
364             # page is unchanged, so use the cached file
365 0         0 my $content = $self->load($key);
366 0         0 return ($FROM_CACHE, $content);
367             }
368             }
369              
370             # there's no cached file, so get the page from the URI and cache it
371 0         0 my $content = get($uri);
372 0 0       0 $self->store($key, $content) if $content;
373 0         0 return ($FROM_URI, $content);
374             }
375              
376 0     0   0 method _fetch_refresh_n_days ($uri, $cutoff_mtime) {
  0         0  
  0         0  
  0         0  
  0         0  
377             # check whether the cache or page is older than N days
378             # if the cache file is younger than N days ago, use it
379             # otherwise get the latest page from the server
380             # check the server if the file is older than this year
381              
382 0         0 my $key = $self->_uri_to_key($uri);
383 0         0 my $file = $self->_path_to_key($key);
384              
385 0         0 my $st = stat $file;
386              
387 0 0 0     0 if ($st and $st->mtime >= $cutoff_mtime) {
388             # the cached file we have is at or new than the cutoff, so we'll use it
389 0         0 my $content = $self->load($key);
390 0         0 return ($FROM_CACHE, $content);
391             }
392              
393             # get the mtime for the URI
394 0 0       0 my ($ctype, $doclen, $mtime, $exp, $svr) = head($uri)
395             or croak '*E* unable to fetch header for: ' . $uri;
396              
397             # our cached file is older than the cutoff, but if it's up to date
398             # with the web page then we can use it
399 0 0 0     0 if ($st and $st->mtime >= $mtime) {
400             # web page hasn't changed since it was cached. so we'll use it
401 0         0 my $content = $self->load($key);
402 0         0 return ($FROM_CACHE, $content);
403             }
404              
405             # there's no cached file, or the cached file is out of date, so
406             # we get the page from the URI and cache it
407 0         0 my $content = get($uri);
408 0 0       0 $self->store($key, $content) if $content;
409 0         0 return ($FROM_URI, $content);
410             }
411              
412 0     0   0 method _fetch_without_cache ($uri) {
  0         0  
  0         0  
  0         0  
413             # check for a fresher copy on the server
414 0         0 my $key = $self->_uri_to_key($uri);
415              
416 0         0 my $content = get($uri);
417 0         0 return ($FROM_URI, $content);
418             }
419              
420 117     117   5574 method _uri_to_key ($uri) {
  117         213  
  117         179  
  117         183  
421 117         836 my @parts = split m{ $FSLASH }xms, $uri;
422 117         321 my $key = $parts[-1]; # use the last part as the key
423              
424             # this transformation is for testing using CPAN pages and is not
425             # necessary for the NOAA GHCN pages we actually deal with
426 117         311 $key =~ s{ [:] }{}xmsg;
427              
428 117         440 return $key;
429             }
430              
431 60     60   2408 method _path_to_key ($uri) {
  60         141  
  60         127  
  60         161  
432 60 50       226 return if not defined $uri;
433              
434 60         210 my $key = $self->_uri_to_key( $uri );
435              
436 60         512 my $filepath = path($_cachedir)->child($key)->stringify;
437              
438 60         6949 return $filepath;
439             }
440              
441             ######################################################################
442             # Private subroutines
443             ######################################################################
444              
445 54     54   163 sub _read_file ( $file ) {
  54         157  
  54         110  
446 54         239 return path($file)->slurp_utf8;
447             }
448              
449 1     1   2 sub _write_file ( $file, $data ) {
  1         2  
  1         3  
  1         2  
450 1         15 return path($file)->spew_utf8( $data );
451             }
452              
453              
454             1;
455              
456             __END__