File Coverage

blib/lib/Date/RetentionPolicy.pm
Criterion Covered Total %
statement 64 117 54.7
branch 21 46 45.6
condition 16 23 69.5
subroutine 10 12 83.3
pod 2 3 66.6
total 113 201 56.2


line stmt bran cond sub pod time code
1             package Date::RetentionPolicy;
2             $Date::RetentionPolicy::VERSION = '0.01';
3 3     3   3343 use Moo;
  3         29733  
  3         13  
4 3     3   3821 use Scalar::Util 'looks_like_number';
  3         6  
  3         130  
5 3     3   2338 use DateTime;
  3         1283521  
  3         123  
6 3     3   1681 use DateTime::Format::Flexible;
  3         365781  
  3         35  
7              
8             # ABSTRACT: Prune a list of dates down to the ones you want to keep
9              
10              
11             has retain => ( is => 'rw', required => 1 );
12             has time_zone => ( is => 'rw', default => sub { 'floating' } );
13             has reach_factor => ( is => 'rw', default => sub { .5 } );
14             has reference_date => ( is => 'rw' );
15             has auto_sync => ( is => 'rw' );
16              
17             sub reference_date_or_default {
18 341     341 1 829 my $self= shift;
19             # Use override, else 'now' rounded up to next day boundary of timezone
20 341         1076 my $start= $self->reference_date;
21 341 50       1008 return $start->clone if ref $start;
22 341 50       1852 return $self->_coerce_date($start) if defined $start;
23 0         0 return DateTime->now(time_zone => $self->time_zone)
24             ->add(days => 1, seconds => -1)->truncate(to => 'day');
25 0         0 return $start;
26             }
27              
28              
29             sub prune {
30 341     341 0 487977 my ($self, $list)= @_;
31 341         1431 my $processed= $self->_sort_and_mark_retention($list);
32             # Divide the elements into two lists. Make a set of which indexes
33             # we're keeping, then iterate the original list to preserve the caller's
34             # list order.
35 341         976 my (@retain, @prune);
36 341         22948 my %keep= map +($_->[1] => 1), grep $_->[2], @$processed;
37 21765 100       43224 push @{ $keep{$_}? \@retain : \@prune }, $list->[$_]
38 341         2950 for 0..$#$list;
39 341         1963 @$list= @retain;
40 341         6533 return \@prune;
41             }
42              
43             sub _sort_and_mark_retention {
44 341     341   1007 my ($self, $list, $trace)= @_;
45             # Each list element needs to be a date object, (but preserve the original)
46             # and the list needs to be sorted in cronological order.
47 341         2726 my @sorted= sort { $a->[0] <=> $b->[0] }
  21425         32266  
48             # tuple of [ Epoch, ListIndex, KeepBoolean ].
49             # A hash would be more readable but there could be a lot of these.
50             map [ $self->_coerce_to_epoch($list->[$_]), $_, 0 ],
51             0..$#$list;
52             # Never prune things newer than the reference date
53 341         2575 my $ref_date= $self->reference_date_or_default;
54 341   66     2396 for (my $i= $#sorted; $i >= 0 && $sorted[$i][0] > $ref_date->epoch; --$i) {
55 4636         31457 $sorted[$i][2]= 1;
56             }
57             # Set the boolean to true for each element that a rule wants to keep
58             $self->_mark_for_retention($ref_date, $_, \@sorted, $trace)
59 341         2292 for @{ $self->retain };
  341         1966  
60 341         5912 return \@sorted;
61             }
62              
63             sub _mark_for_retention {
64 1019     1019   11982 my ($self, $reference_date, $rule, $list, $trace)= @_;
65 1019         2291 my ($interval, $history, $reach_factor)= @{$rule}{'interval','history','reach_factor'};
  1019         3443  
66 1019 50       4161 $reach_factor= $self->reach_factor unless defined $reach_factor;
67 1019         3841 my $next_date= $reference_date->clone->subtract(%$history)->add(%$interval);
68 1019         2105921 my $epoch= $next_date->epoch;
69 1019         8584 my $search_idx= 0;
70 1019         3500 my $next_epoch= $next_date->add(%$interval)->epoch;
71 1019         969377 my $radius= -($epoch - $next_epoch) * $reach_factor;
72 1019         2322 my $drift= 0; # only used for auto_sync
73 1019         1623 my $rule_key;
74            
75             # The epoch variables track the current date interval, and the _idx
76             # variables track our position in the list.
77 1019   66     2745 while ($epoch-abs($drift) <= $reference_date->epoch && $search_idx < @$list) {
78 24256         197099 my $best;
79 24256   100     99736 for (my $i= $search_idx; $i < @$list and $list->[$i][0] < $epoch+$drift+$radius; ++$i) {
80 102115 100 100     288804 if ($list->[$i][0] >= $epoch+$drift-$radius
      100        
81             and (!defined $best or abs($list->[$i][0] - ($epoch+$drift)) < abs($list->[$best][0] - ($epoch+$drift)))
82             ) {
83 35970         52045 $best= $i;
84             }
85             # update the start_idx for next interval iteration
86 102115 100       374452 $search_idx= $i+1 if $list->[$i][0] < $next_epoch-$radius*2;
87             }
88 24256 100       49643 if (defined $best) {
89 20160         35213 $list->[$best][2]= 1; # mark as a keeper
90             # If option enabled, drift toward the time we found, so that gap between next
91             # is closer to $interval
92 20160 100       66169 $drift= $list->[$best][0] - $epoch
93             if $self->auto_sync;
94             }
95 24256 50       48262 if ($trace) {
96 0 0       0 $rule_key= join ',', map "$_=$interval->{$_}", keys %$interval
97             unless defined $rule_key;
98 0 0       0 if (!$trace->{$rule_key}) {
99 0         0 $trace->{$rule_key}{idx}= scalar keys %$trace;
100 0         0 $trace->{$rule_key}{radius}= $radius;
101 0         0 $trace->{$rule_key}{name}= $rule_key;
102             }
103 0         0 push @{$trace->{$rule_key}{interval}}, { epoch => $epoch, best => $best, drift => $drift };
  0         0  
104             }
105 24256         34273 $epoch= $next_epoch;
106 24256         74411 $next_epoch= $next_date->add(%$interval)->epoch;
107            
108             # if auto_sync enabled, cause drift to decay back toward 0
109 24256 100       23132595 $drift= int($drift * 7/8)
110             if $drift;
111             }
112             }
113              
114              
115             sub visualize {
116 0     0 1 0 my ($self, $list)= @_;
117 0         0 my $trace= {};
118 0         0 my $processed= $self->_sort_and_mark_retention($list, $trace);
119 0         0 $processed->[$_][1]= $_ for 0..$#$processed; # change indexes to index within processed list
120 0         0 my @claimed;
121 0         0 my @things= @$processed;
122 0         0 my @columns;
123             my %rule_to_col;
124             # Convert each trace to a similar arrayref structure as the processed items, for sorting
125 0         0 for my $rule_trace (sort { $a->{idx} <=> $b->{idx} } values %$trace) {
  0         0  
126 0         0 push @columns, { name => $rule_trace->{name}, width => 20 };
127 0         0 $rule_to_col{$rule_trace->{name}}= $#columns;
128 0         0 for (@{ $rule_trace->{interval} }) {
  0         0  
129 0 0       0 push @{$claimed[$_->{best}]}, $rule_trace->{name} if defined $_->{best};
  0         0  
130             push @things,
131             [ $_->{epoch} + $_->{drift} + $rule_trace->{radius}, 'ival-newest', $rule_trace->{name} ],
132             [ $_->{epoch} + $_->{drift}, 'ival', $rule_trace->{name} ],
133 0         0 [ $_->{epoch} + $_->{drift} - $rule_trace->{radius}, 'ival-oldest', $rule_trace->{name} ];
134             }
135             }
136 0         0 push @columns, { name => 'timestamp', width => 20 };
137 0         0 @things= sort { $a->[0] <=> $b->[0] } @things;
  0         0  
138            
139             # Walk from oldest to newest, displaying timestamps alongside the epock interval points
140 0         0 my $cur_time= 0;
141 0         0 my @in_interval= ( 0 ) x @columns;
142 0         0 my @row= map $_->{name}, @columns;
143 0         0 my $format= join(' ', map '%-'.$_->{width}.'s', @columns)."\n";
144 0         0 my $out= '';
145             my $emit= sub {
146             # if in_interval, display a vertical bar as a graphic
147 0     0   0 for (0..$#in_interval) {
148 0 0 0     0 $row[$_] ||= '|' if $in_interval[$_];
149             }
150 0         0 $out .= sprintf $format, @row;
151 0         0 @row= ('') x @columns;
152 0         0 };
153 0         0 for (@things) {
154 0 0       0 $emit->() if $cur_time != $_->[0];
155 0         0 $cur_time= $_->[0];
156 0 0       0 if ($_->[1] eq 'ival') {
    0          
    0          
157 0         0 $row[ $rule_to_col{ $_->[2] } ]= $self->_coerce_date($_->[0]);
158             } elsif ($_->[1] eq 'ival-newest') {
159 0         0 $row[ $rule_to_col{ $_->[2] } ]= '---';
160 0         0 --$in_interval[ $rule_to_col{ $_->[2] } ];
161             } elsif ($_->[1] eq 'ival-oldest') {
162 0         0 $row[ $rule_to_col{ $_->[2] } ]= '---';
163 0         0 ++$in_interval[ $rule_to_col{ $_->[2] } ];
164             } else {
165 0 0       0 $row[-1]= $self->_coerce_date($_->[0]).($_->[2]? ' +':' x');
166 0 0       0 if ($claimed[$_->[1]]) {
167 0         0 $row[-1] .= ' '.join ', ', @{ $claimed[$_->[1]] };
  0         0  
168             }
169             }
170             }
171 0         0 $emit->();
172 0         0 return $out;
173             }
174              
175             sub _coerce_date {
176 1801     1801   3394 my ($self, $thing)= @_;
177 1801 100 33     10033 my $date= ref $thing && ref($thing)->can('set_time_zone')? $_->clone
    50          
178             : looks_like_number($thing)? DateTime->from_epoch(epoch => $thing)
179             : DateTime::Format::Flexible->parse_datetime($thing);
180 1801         10330693 $date->set_time_zone($self->time_zone);
181 1801         77681 return $date;
182             }
183              
184             sub _coerce_to_epoch {
185 21765     21765   50909 my ($self, $thing)= @_;
186 21765 100 66     88729 return $thing if !ref $thing && looks_like_number($thing);
187 1460         3395 return $self->_coerce_date($thing)->epoch;
188             }
189              
190             1;
191              
192             __END__
193              
194             =pod
195              
196             =encoding UTF-8
197              
198             =head1 NAME
199              
200             Date::RetentionPolicy - Prune a list of dates down to the ones you want to keep
201              
202             =head1 VERSION
203              
204             version 0.01
205              
206             =head1 SYNOPSIS
207              
208             my $rp= Date::RetentionPolicy->new(
209             retain => [
210             { interval => { hours => 6 }, history => { months => 3 } },
211             { interval => { days => 1 }, history => { months => 6 } },
212             { interval => { days => 7 }, history => { months => 9 } },
213             ]
214             );
215            
216             my $dates= [ '2018-01-01 03:23:00', '2018-01-01 09:45:00', ... ];
217             my $pruned= $rp->prune($dates);
218             for (@$pruned) {
219             # delete the backup dated $_
220             ...
221             }
222              
223             =head1 DESCRIPTION
224              
225             Often when making backups of a thing, you want to have more frequent snapshots
226             for recent dates, but don't need that frequency further back in time, and want
227             to delete some of the older ones to save space.
228              
229             The problem of deciding which snapshots to delete is non-trivial because
230             backups often don't complete on a timely schedule (despite being started on
231             a schedule) or have discontinuities from production mishaps, and it would be
232             bad if your script wiped out the only backup in an interval just because it
233             didn't look like one of the "main" timestamps. Also it would be bad if the
234             jitter from the time zone or time of day that you run the pruning process
235             caused the script to round differently and delete the backups it had
236             previously decided to keep.
237              
238             This module uses an algorithm where you first define the intervals which
239             should retain a backup, then assign the existing timestamps to those intervals
240             (possibly reaching across the interval boundary a bit in order to preserve
241             a nearby timestamp; see L<reach_factor>) thus making an intelligent decision
242             about which timestamps to keep.
243              
244             =head1 DATES
245              
246             This module currently depends on DateTime, but I'm happy to accept patches
247             to allow it to work with other Date classes.
248              
249             =head1 ATTRIBUTES
250              
251             =head2 retain
252              
253             An arrayref of specifications for what to preserve. Each element should be a
254             hashref containing C<history> and C<interval>. C<history> specifies how far
255             backward from L</reference_date> to apply the intervals, and C<interval>
256             specifies the time difference between the backups that need preserved.
257              
258             As an example, consider
259              
260             retain => [
261             { interval => { days => 1 }, history => { days => 20 } },
262             { interval => { hours => 1 }, history => { hours => 48 } },
263             ]
264              
265             This will attempt to preserve timestamps near the marks of L</reference_date>,
266             an hour before that, an hour before that, and so on for the past 48 hours.
267             It will also attempt to preserve L</reference_date>, a day before that, a day
268             before that, and so on for the past 20 days.
269              
270             There is another setting called L</reach_factor> that determines how far from
271             the desired timestamp the algorithm will look for something to preserve. The
272             default C<reach_factor> of C<0.5> means that it will scan from half an interval
273             back in time until half an interval forward in time looking for the closest
274             timestamp to preserve. In some cases, you may want a narrower or wider search
275             distance, and you can set C<reach_factor> accordingly. You can also supply it
276             as another hash key for a retain rule for per-rule customization.
277              
278             retain => [
279             { interval => { days => 1 }, history => { days => 20 }, reach_factor => .75 }
280             ]
281              
282             =head2 time_zone
283              
284             When date strings are involved, parse them as this time zone before converting
285             to an epoch value used in the calculations. The default is C<'floating'>.
286              
287             =head2 reach_factor
288              
289             The multiplier for how far to look in each direction from an interval point.
290             See discussion in L</retain>.
291              
292             =head2 reference_date
293              
294             The end-point from which all intervals will be calculated. There is no
295             default, to allow L</reference_date_or_default> to always pick up the current
296             time when called.
297              
298             =head2 reference_date_or_default
299              
300             Read-only. Return (a clone of) L</reference_date>, or if it isn't set, return
301             the current date in the designated L</time_zone> rounded up to the next day
302             boundary.
303              
304             =head2 auto_sync
305              
306             While walking backward through time intervals looking for backups, adjust the
307             interval endpoint to be closer to whatever match it found. This might allow
308             the algorithm to essentially adjust the C<reference_date> to match whatever
309             schedule your backups are running on. This is not enabled by default.
310              
311             =head1 METHODS
312              
313             =head1 prune
314              
315             my $pruned_arrayref= $self->prune( \@times );
316              
317             C<@times> may be an array of epoch numbers, DateTime objects, or date strings
318             in any format recognized by L<DateTime::Format::Flexible>. Epochs are
319             currently the most efficient type of argument since that's what the algorithm
320             operates on.
321              
322             =head2 visualize
323              
324             print $rp->visualize( \@list );
325              
326             This method takes a list of timestamps, sorts and marks them for retention,
327             and then returns printable text showing the retention intervals and which
328             increment it decided to keep. The text is simple ascii-art, and requires
329             a monospace font to display correctly.
330              
331             =head1 AUTHOR
332              
333             Michael Conrad <mconrad@intellitree.com>
334              
335             =head1 COPYRIGHT AND LICENSE
336              
337             This software is copyright (c) 2018 by IntelliTree Solutions llc.
338              
339             This is free software; you can redistribute it and/or modify it under
340             the same terms as the Perl 5 programming language system itself.
341              
342             =cut