File Coverage

blib/lib/Math/Round/Fair.pm

Criterion	Covered	Total	%
statement	114	124	91.9
branch	22	30	73.3
condition	4	8	50.0
subroutine	18	19	94.7
pod	2	3	66.6
total	160	184	86.9

line	stmt	bran	cond	sub	pod	time	code
1							package Math::Round::Fair;
2	6			6		123381	use warnings;
	6					12
	6					213
3	6			6		29	use strict;
	6					10
	6					186
4	6			6		128	use 5.005000;
	6					25
	6					320
5	6			6		4954	use Devel::Assert;
	6					69948
	6					44
6	6			6		575	use Carp;
	6					12
	6					513
7	6			6		31	use base qw/Exporter/;
	6					13
	6					509
8	6			6		29	use List::Util qw/shuffle sum min/;
	6					10
	6					661
9	6			6		5852	use POSIX qw/floor ceil DBL_EPSILON/;
	6					49074
	6					42
10
11							our $VERSION = '0.03';
12
13							our @EXPORT_OK = qw/round_fair round_adjacent/;
14
15							BEGIN {
16	6			6		1092	my $debug;
17	13			13	0	902	sub DEBUG { $debug }
18
19	6		100			45	$debug = $ENV{MATH_ROUND_FAIR_DEBUG} \|\| 0;
20	6	50		6		7308	use Devel::Assert DEBUG() ? ('-all -verbose') : ();
	6					13
	6					24
21
22							# used in assertions
23	6	100		5		16	eval q{use Perl6::Junction 'none'} if DEBUG();
	5					3375
	5					58413
	5					1940
24							}
25
26
27							=head1 NAME
28
29							Math::Round::Fair - distribute rounding errors fairly
30
31							=head1 SYNOPSIS
32
33							use Math::Round::Fair 'round_fair', 'round_adjacent';
34
35							my $cents = 7;
36							my @weights = (1, 2, 3, 2, 1);
37							my @allocation = round_fair($cents, @weights);
38
39							print "@allocation\n";
40
41							# output will be one of the following:
42							# 0 1 3 2 1
43							# 0 2 2 2 1
44							# 0 2 3 1 1
45							# 0 2 3 2 0
46							# 1 1 2 2 1
47							# 1 1 3 1 1
48							# 1 1 3 2 0
49							# 1 2 2 1 1
50							# 1 2 2 2 0
51
52							my @total;
53							for ( 1..900 ) {
54							@allocation = round_fair($cents, @weights);
55							@total[$_] += @allocation[$_] for 0..$#allocation;
56							}
57							print "@total\n";
58
59							# output will be near 700 1400 2100 1400 700, e.g.:
60							# 698 1411 2096 1418 677
61
62
63							my @rounded = round_adjacent(0.95, 0.65, 0.41, 0.99);
64							# @rounded will be one of the following:
65							# 59% of the time: 1, 1, 0, 1
66							# 35% of the time: 1, 0, 1, 1
67							# 5% of the time: 0, 1, 1, 1
68							# 1% of the time: 1, 1, 1, 0
69
70
71							=head1 DESCRIPTION
72
73							This module provides two exportable functions, C, which
74							allocates an integer value, fairly distributing rounding errors, and
75							C, which takes a list of real numbers and rounds them up, or
76							down, to an adjacent integer, fairly. Both functions return a list of fairly
77							rounded integer values.
78
79							C and C round up, or down, randomly, where the
80							probability of rounding up is equal to the fraction to round. For example, 0.5
81							will round to 1.0 with a probability of 0.5. 0.3 will round to 1.0 3 out of 10
82							times and to zero 7 out of 10 times, on average.
83
84							Consider the problem of distributing one indivisible item, for example a penny,
85							across three evenly weighted accounts, A, B, and C.
86
87							Using a naive approach, none of the accounts will receive an allocation since
88							the allocated portion to each is 1/3 and 1/3 rounds to zero. We are left with
89							1 unallocated item.
90
91							Another approach is to adjust the basis at each step. We start with 1 item to
92							allocate to 3 accounts. 1/3 rounds to 0, so account A receives no allocation,
93							and we drop it from consideration. Now, we have 2 accounts and one item to
94							allocate. 1/2 rounds to 1, so we allocate 1 item to account B. Account C
95							gets no allocation since there is nothing left to allocate.
96
97							But what happens if we allocate one item to the same three accounts 10,000
98							times? Ideally, two accounts should end up with 3,333 items and one should end
99							up with 3,334 items.
100
101							Using the naive approach, all three accounts receive no allocation since at
102							each round the allocation is 1/3 which rounds to zero. Using the second method,
103							account A and account C will receive no allocation, and account B will receive
104							a total allocation of 10,000 items. Account B always receives the benefit of
105							the rounding errors using the second method.
106
107							The algorithm employed by this module uses randomness to ensure a fair distribution of
108							rounding errors. In our example problem, we start with 1 item to allocate. We
109							calculate account A's share, 1/3. Since it is less than one item, we give it a
110							1/3 chance of rounding up (and, therefore, a 2/3 chance of rounding down). It
111							wins the allocation 1/3 of the time. 2/3 of the time we continue to B. We
112							calculate B's allocation as 1/2 (since there are only 2 accounts remaining and
113							one item to allocate). B rounds up 1/2 of 2/3 (or 1/3) of the time and down
114							1/2 of 2/3 (or 1/3) of the time. If neither A nor B rounds up (which occurs
115							2/3 * 1/2, or 1/3 of the time), C's allocation is calculated as 1/1 since we
116							have one item to allocate and only one account to allocate it to. So, 1/3 of
117							the time C receives the benefit of the rounding error. We never end up with
118							any unallocated items.
119
120							This algorithm works for any number of weighted allocations.
121
122							=over 4
123
124							=item round_fair($value, @weights)
125
126							Returns a list of integer values that sum to C<$value> where each return value
127							is a portion of C<$value> allocated by the respective weights in C<@weights>.
128							The number of return values is equal to the number of elements in C<@weights>
129
130							C<$value> must be an integer.
131
132							=cut
133
134							sub round_fair {
135	52			52	1	268	my $value = shift;
136
137	52	50	33			221	croak "Value to be allocated must be an integer >= 0" unless int($value) == $value && $value >= 0;
138
139	52	50				106	return ($value) if @_ == 1;
140	52	50				92	return (0) x @_ if $value == 0;
141
142	52					50	my $basis = 0;
143	52					76	for my $w ( @_ ) {
144	214	50				306	croak "Weights must be > 0" unless $w > 0;
145	214					307	$basis += $w;
146							}
147
148	52					68	my $sum = 0;
149	52					79	my @in = map { my $r = $value * $_ / $basis; $sum += $r; $r } @_;
	214					268
	214					202
	214					322
150
151							# First, create the extra entry for the total, so that the sum of
152							# the new array is zero.
153	52					75	push @in, -$sum;
154
155	52					104	my $out = _round_adjacent_arrayref(\@in);
156	52					66	pop @$out; # Discard the entry for the total
157
158	52					160	return @$out;
159							}
160
161							=item round_adjacent(@input_values)
162
163							Returns a list of integer values, each of which is numerically
164							adjacent to the corresponding element of @input_values, and whose total is
165							numerically adjacent to the total of @input_values.
166
167							The expected value of each output value is equal to the corresponding element
168							of @input_values (within a small error margin due to the limited machine
169							precision).
170
171							=cut
172
173							sub round_adjacent {
174	11002	100		11002	1	549496	return () unless @_; # identity
175
176							# First, create the extra entry for the total, so that the sum of
177							# the new array is zero.
178	11001					29270	push @_, -sum(@_);
179
180							# use a reference to eliminate an unnecessary copy
181	11001					20226	my $out = _round_adjacent_arrayref(\@_);
182
183	11001					13537	pop @$out; # Discard the entry for the total
184	11001					43021	return @$out;
185							}
186
187							sub _round_adjacent_arrayref {
188	11053			11053		12493	my $in = shift;
189
190							# Next, shuffle the order, so that the input order has no effect
191							# on the randomness characteristics.
192	6			6		5433	my @order = shuffle($[ .. $#{$in});
	6					2884
	6					493
	11053					11878
	11053					47166
193	11053					59562	@$in = map $in->[$_], @order;
194
195	11053					22868	my $out = _round_adjacent_core($in);
196
197	11053					10678	assert(sum(@$out) == 0);
198
199							# put the output back into original order
200	11053					9893	my @r;
201	11053					84125	$r[$order[$_]] = $out->[$_] for $[ .. $#order;
202
203	11053					35240	return \@r;
204							}
205
206							# Like _round_adjacent_arrayref, except that the inputs must sum to zero, and the
207							# input order may affect the variance and correlations, etc.
208							sub _round_adjacent_core {
209	11053			11053		11717	my $in = shift;
210
211	11053					9501	assert(scalar @$in); # @$in must not be empty
212
213	11053					15382	my $eps1 = 4.0 * DBL_EPSILON() * (1 + @$in);
214	11053					11139	my $eps = $eps1;
215	11053					13297	my @fp = map { my $ip = floor($_); $_ - $ip } @$in;
	70270					95160
	70270					118278
216
217	11053					12915	assert(none(@fp) < 0.0);
218
219							# TBD: Maybe accuracy or fairness can be improved by
220							# re-adjusting after every iteration. This would slow it
221							# down significantly, though.
222	11053					17885	_adjust_input(\@fp);
223
224	11053					13313	my @out;
225	11053					10873	INPUT: while() {
	11053					10188
226	70270					77023	$eps += $eps1;
	11053					22648
227
228	70270					63601	assert(_check_invariants($eps, $in, \@fp));
229
230	11053			11053		12400	# Calculate the next output. Discard the next input in the
231							# process.
232	70270					79956	my $p0 = shift @fp; # Probability of having to overpay
233	70270	100				132633	my $r0 = rand()<$p0 ? 1 : 0; # 1 if selected to overpay; else 0
234	70270					169667	push @out, floor(shift @$in) + $r0;
	11053					19906
235	11053	100				37553
236	70270	100				130914	last unless @fp;
	4016					7229
237
238	4016	50	33			18880	# Now adjust the remaining fractional parts.
239
240							# $slack[i] = min( $p0 * $fp[i], (1-$p0) * (1-$fp[i]) ).
241	59217					61327	my @slack;
	4016					4411
242	59217	100				57856	my $tslack = 0.0;
	4016					6827
243	59217					60727	do {
	2616					14147
244							@slack = map {
245	59217					67364	if ( 1 ) {
	502371					445559
	1400					2080
246	502371					927402	my $slack = min $p0 * $_, (1 - $_) * (1.0 - $p0);
	1400					13129
247	502371					520563	$tslack += $slack;
248	502371					726121	$slack;
249							}
250							else {
251							# This is fewer FLOPS, but the perf benefit
252	0			0			# is only 1% on a modern system, and it leads
253							# to greater numerical errors for some reason.
254	0	0					my $add = $p0 + $_;
	0
255	0						my $mult = $p0 * $_;
256	0						$add > 1.0 ? 1.0 - $add + $mult : $mult
257	0						}
258	0						} @fp;
259							};
260
261	0						# See bottom of file for proof of this property:
262	59217					69392	assert($tslack + $eps >= $p0 * (1.0 - $p0));
	0
263
264	0						# wrapped in assert to make it a noop when DEBUG() == 0
265	59217					51205	assert(do { warn "TSLACK = $tslack\n" if DEBUG() > 1; 1 });
266
267	59217	100				117893	if ( $tslack > $eps1 ) {
268	37168					51714	$eps += 128.0 * $eps1 * $eps / $tslack;
269							# NOTE: The expected value of gain is
270							# $p0 * ($p0 - 1.0) /$tslack +
271							# (1.0 - $p0) * $p0 / $tslack = 0
272	37168					33430	my $gain = do {
273	37168	100				57291	if ( $r0 ) {
274							# Last guy overpaid, so the probabilities for
275							# subsequent payers drop.
276	17549					23108	($p0 - 1.0) / $tslack;
277							}
278							else {
279							# Last guy underpaid, so the probabilities for
280							# subsequent payers rise.
281	19619					23664	$p0 / $tslack;
282							}
283							};
284
285							# NOTE: The change in the sum of @fp due to this step
286							# is $tslack * $gain, which is either $p0 or ($p0 - 1).
287							# Either way, the sum remains an integer, because it
288							# was reduced by $p0 when we shifted off the first
289							# element early in the INPUT loop iteration.
290							# Also note that each element of @fp stays in the range
291							# [0,1] because if $r0, then slack($_, $p0) * -$gain <=
292							# $p0 * $_ * (1.0 - $p0) / ($p0 * (1.0 - $p0)) ==
293							# $_, and otherwise slack($_, $p0) * $gain <=
294							# (1 - $p0) * (1 - $_) * $p0 / ($p0 * (1.0 - $p0)) ==
295							# 1 - $_.
296							# We modify in place here, for performance.
297	37168					225908	$_ += shift(@slack) * $gain for @fp;
298							}
299							}
300							assert(@$in == 0);
301							return \@out;
302							}
303
304							sub _adjust_input {
305							my $p = shift;
306
307							# Adjust @$p to account for numerical errors due to small
308							# difference of large numbers when the integer parts are big.
309							my $sum = sum @$p;
310							if ( $sum != floor($sum) ) {
311							my $target = floor($sum + 0.5);
312
313							die "Total loss of precision"
314							unless abs($sum - $target) < 0.1 && $sum + 0.05 != $sum;
315
316							my $adj = $target / $sum;
317							if ( $adj <= 1.0 ) {
318							$_ *= $adj for @$p;
319							} else {
320							$adj = (@$p - $target) / (@$p - $sum);
321							$_ = 1.0 - (1.0-$_) * $adj for @$p;
322							}
323							}
324							}
325
326							sub _check_invariants {
327							my ( $eps, $v, $fp ) = @_;
328
329							if ( DEBUG() > 1 ) {
330							warn sprintf "%d %f\n", floor($_), $_ for @$fp;
331							}
332
333							assert(@$v && @$v == @$fp);
334
335							for ( @$fp ) {
336							assert($_ >= -$eps);
337							assert($_ <= 1.0 + $eps);
338							}
339
340							my $sum = sum @$fp;
341							assert(abs($sum - floor($sum + 0.5)) < $eps * (1 + $sum));
342
343							1;
344							}
345
346							1;
347
348							__END__