File Coverage

blib/lib/Statistics/Test/Sequence.pm

Criterion	Covered	Total	%
statement	94	97	96.9
branch	15	18	83.3
condition	7	14	50.0
subroutine	10	10	100.0
pod	3	3	100.0
total	129	142	90.8

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::Test::Sequence;
2
3	3			3		72710	use 5.006;
	3					12
	3					111
4	3			3		17	use strict;
	3					4
	3					126
5	3			3		16	use warnings;
	3					5
	3					143
6
7							our $VERSION = '0.01';
8
9	3			3		16	use Carp qw/croak/;
	3					3
	3					278
10	3			3		2685	use Params::Util qw/_POSINT _ARRAY _CODE/;
	3					17814
	3					294
11	3			3		4639	use Math::BigFloat;
	3					80801
	3					20
12	3			3		163968	use Memoize;
	3					7273
	3					2781
13
14							=head1 NAME
15
16							Statistics::Test::Sequence - Sequence correlation test for random numbers
17
18							=head1 SYNOPSIS
19
20							use Statistics::Test::Sequence;
21							my $tester = Statistics::Test::Sequence->new();
22							$tester->set_data( [map {rand()} 1..1000000] );
23
24							my ($metric, $actual_freq, $expected_freq) = $tester->test();
25							use Data::Dumper;
26							print "$metric\n";
27							print "Frequencies:\n";
28							print Dumper $actual_freq;
29							print "Expected frequencies:\n";
30							print Dumper $expected_freq;
31
32							=head1 DESCRIPTION
33
34							This module implements a sequence correlation test for random number
35							generators. It shows pairwise correlation between subsequent
36							random numbers.
37
38							The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.)
39
40							=over 2
41
42							=item *
43
44							Given C random numbers C.
45
46							=item *
47
48							For all C, compare C with C. If C is greater
49							then C, assign a 0-Bit to the number. Otherwise, assign a
50							1-Bit.
51
52							=item *
53
54							Find all sequences of equal Bits. For every sequence, increment
55							a counter for the length C of that sequence. (Regardless of whether it's
56							a sequence of 1's or 0's.)
57
58							=item *
59
60							For uncorrelated random numbers, the number of sequences C
61							of length C in the set of C random numbers is expected to be:
62
63							N(k) = 2((k^2+3k+1)N - (k^3+3k^2-k-4)) / (k+3)!
64
65							=back
66
67							=head1 METHODS
68
69							=cut
70
71							=head2 new
72
73							Creates a new random number tester.
74
75							=cut
76
77							sub new {
78	1			1	1	16	my $proto = shift;
79	1		33			8	my $class = ref($proto)\|\|$proto;
80
81	1					3	my $self = {
82							data => undef,
83							};
84
85	1					4	bless $self => $class;
86
87	1					7	return $self;
88							}
89
90							=head2 set_data
91
92							Sets the random numbers to operate on. First argument
93							must be either an array reference to an array of random
94							numbers or a code reference.
95
96							If the first argument is a code reference, the second
97							argument must be an integer C. The code reference is
98							called C-times and its return values are used as
99							random numbers.
100
101							The code reference semantics are particularily useful if
102							you do not want to store all random numbers in memory at
103							the same time. You can write a subroutine that, for example,
104							generates and returns batches of 100 random numbers so no
105							more than 101 of these numbers will be in memory at the
106							same time. Note that if you return 100 numbers at once and
107							pass in C, you will have a sequence of 5000 random
108							numbers.
109
110							=cut
111
112							sub set_data {
113	2			2	1	5289	my $self = shift;
114	2					4	my $data = shift;
115	2	100				18	if (_ARRAY($data)) {
		50
116	1					8	$self->{data} = $data;
117	1					3	return 1;
118							}
119							elsif (_CODE($data)) {
120	1					4	$self->{data} = $data;
121	1					238	my $n = shift;
122	1	50				46	if (not _POSINT($n)) {
123	0					0	croak("'set_data' needs an integer as second argument if the first argument is a code reference.");
124							}
125	1					16	$self->{n} = $n;
126	1					4	return 1;
127							}
128							else {
129	0					0	croak("Invalid arguments to 'set_data'.");
130							}
131							}
132
133							=head2 test
134
135							Runs the sequence test on the data that was previously set using
136							C.
137
138							Returns three items: The first is the root mean square of the bin
139							residuals divided by the number of random numbers. It I be
140							used as a measure for the quality of the random number generator
141							and should be as close to zero as possible. A better metric is to
142							compare the following two return values.
143
144							The second return value is a reference to the array of frequencies.
145							An example is in order here. Generating one million random numbers,
146							I get:
147
148							[0, 416765, 181078, 56318, 11486, 1056, 150]
149
150							This means there were no sequences of length 0 (obvious), 416765
151							sequences of length 1, etc. There were no sequences of length 7 or
152							greater. This example is a bad random number generator! (It's a
153							linear congruent generator with C<(a*x_i+c)%m> and C,
154							C, C, and C).
155
156							The third return value is similar in nature to the second in that it
157							is a reference to an array containing sequence length frequencies.
158							This one, however, contains the frequencies that would be expected for
159							the given number of random numbers, were they uncorrelated.
160							The number of bins has the maximum
161							length of an occurring sequence as an upper limit. In the given example,
162							you would get: (Dumped with Data::Dumper)
163
164							$VAR1 = [
165							'0',
166							'416666.75',
167							'183333.1',
168							'52777.64722222222222222222222222222222222',
169							'11507.89523809523809523809523809523809524',
170							'2033.72068452380952380952380952380952381',
171							'303.1287808641975308641975308641975308642',
172							# ...
173							];
174
175							Note that where I put in a C<# ...>, you would really see a couple
176							more lines of numbers until the numbers go below an expected
177							frequency of C<0.1>.
178							For C and C, you get about
179							39 sequences, C is expected to be found 4-5 times, etc.
180
181							=cut
182
183							sub test {
184	2			2	1	1628	my $self = shift;
185	2					7	my $data = $self->{data};
186
187	2	50				8	if (not defined $data) {
188	0					0	croak("Set data using 'set_data' first.");
189							}
190
191							# bin counters
192	2					2	my @bins;
193							# current sequence type (> or <)
194	2					3	my $current = undef;
195							# current sequence length
196	2					4	my $length = 0;
197							# total number of random numbers
198	2					3	my $numbers;
199
200	2	100				7	if (_ARRAY($data)) {
201	1		50			8	$current = ($data->[0] <=> $data->[1]) \|\| 1;
202	1					3	$length++;
203	1					2	$numbers = @$data;
204
205	1					4	foreach my $i (1 .. $#$data-1) {
206	9998		50			24182	my $cmp = ($data->[$i] <=> $data->[$i+1]) \|\| 1;
207	9998	100				12521	if ($current == $cmp) {
208	3275					3785	$length++;
209							}
210							else {
211	6723					6036	$current = $cmp;
212	6723					5653	$bins[$length]++;
213	6723					6978	$length = 1;
214							}
215							}
216	1					6	$bins[$length]++;
217							}
218							else { # CODE
219	1					2	my @cache;
220	1					4	my $calls = $self->{n};
221	1					2	my $first_run = 1;
222	1					4	foreach (1..$calls) {
223							# get new data
224	100					254	push @cache, $data->();
225							# first run => initialize with first comparison
226	100	100	66			3692	if ($first_run and @cache > 1) {
227	1		50			4	$current = ($cache[0] <=> $cache[1]) \|\| 1;
228	1					2	shift @cache;
229	1					1	$length++; # == 1
230	1					2	$numbers++; # == 1
231	1					1	$first_run = 0;
232							}
233	100					205	while (@cache > 1) {
234	9998					9812	$numbers++;
235	9998					9960	my $this = shift @cache;
236	9998		50			19492	my $cmp = ($this <=> $cache[0]) \|\| 1;
237	9998	100				13538	if ($current == $cmp) {
238	3273					6710	$length++;
239							}
240							else {
241	6725					6112	$current = $cmp;
242	6725					6059	$bins[$length]++;
243	6725					13532	$length = 1;
244							}
245							}
246							}
247	1					4	$bins[$length]++;
248							}
249
250	2					10	my @expected = (0); # 0-bin is empty
251	2					6	foreach my $bin (1..$#bins) {
252	13					29162	$expected[$bin] = expected_frequency($bin, $numbers-1);
253							}
254	2					5270	my $last_bin = $#bins;
255	2					11	while ($expected[$last_bin] > 0.1) {
256	3					8307	$last_bin++;
257	3					85	$expected[$last_bin] = expected_frequency($last_bin, $numbers-1);
258							}
259
260	2					5831	foreach my $bin (0..$last_bin) {
261	18	100				40	$bins[$bin] = 0 if not defined $bins[$bin];
262							}
263
264
265	2					8	my @diff = map { abs($bins[$_] - $expected[$_]) } 0..$#bins;
	18					5062
266
267	2					412	my $residual = 0;
268	2					13	$residual += $_**2 for @diff;
269	2					10229	$residual = sqrt($residual);
270	2					11661	$residual = "$residual"; # de-bigfloatize
271
272	2					126	@expected = map {"$_"} @expected; # de-bigfloatize
	18					711
273
274							return(
275	2					139	$residual / ($numbers-1),
276							\@bins,
277							\@expected,
278							);
279							}
280
281							=head1 SUBROUTINES
282
283							=head2 expected_frequency
284
285							Returns the expected frequency of the sequence length C
286							in a set of C random numbers assuming uncorrelated random
287							numbers.
288
289							Returns this as a L.
290
291							Expects C and C as arguments.
292
293							This subroutine is memoized. (See L.)
294
295							=cut
296
297							memoize('expected_frequency');
298							sub expected_frequency {
299							my $k = Math::BigFloat->new(shift);
300							my $n = Math::BigFloat->new(shift);
301							return(
302							2 * ( ($k*2 + 3$k + 1)$n - ($k3 + 3$k**2 - $k - 4) )
303							/ faculty($k+3)
304							);
305							}
306
307							=head2 faculty
308
309							Computes the factulty of the first argument recursively as a
310							L. This subroutine is memoized. (See L.)
311
312							=cut
313
314							memoize('faculty');
315							sub faculty {
316							my $n = shift;
317							return Math::BigFloat->bone() if $n <= 1;
318							return $n * faculty($n-1);
319							}
320
321							1;
322							__END__