line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Statistics::Test::Sequence; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
72710
|
use 5.006; |
|
3
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
111
|
|
4
|
3
|
|
|
3
|
|
17
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
126
|
|
5
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
143
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
8
|
|
|
|
|
|
|
|
9
|
3
|
|
|
3
|
|
16
|
use Carp qw/croak/; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
278
|
|
10
|
3
|
|
|
3
|
|
2685
|
use Params::Util qw/_POSINT _ARRAY _CODE/; |
|
3
|
|
|
|
|
17814
|
|
|
3
|
|
|
|
|
294
|
|
11
|
3
|
|
|
3
|
|
4639
|
use Math::BigFloat; |
|
3
|
|
|
|
|
80801
|
|
|
3
|
|
|
|
|
20
|
|
12
|
3
|
|
|
3
|
|
163968
|
use Memoize; |
|
3
|
|
|
|
|
7273
|
|
|
3
|
|
|
|
|
2781
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Statistics::Test::Sequence - Sequence correlation test for random numbers |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Statistics::Test::Sequence; |
21
|
|
|
|
|
|
|
my $tester = Statistics::Test::Sequence->new(); |
22
|
|
|
|
|
|
|
$tester->set_data( [map {rand()} 1..1000000] ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my ($metric, $actual_freq, $expected_freq) = $tester->test(); |
25
|
|
|
|
|
|
|
use Data::Dumper; |
26
|
|
|
|
|
|
|
print "$metric\n"; |
27
|
|
|
|
|
|
|
print "Frequencies:\n"; |
28
|
|
|
|
|
|
|
print Dumper $actual_freq; |
29
|
|
|
|
|
|
|
print "Expected frequencies:\n"; |
30
|
|
|
|
|
|
|
print Dumper $expected_freq; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This module implements a sequence correlation test for random number |
35
|
|
|
|
|
|
|
generators. It shows pairwise correlation between subsequent |
36
|
|
|
|
|
|
|
random numbers. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.) |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=over 2 |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item * |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Given C random numbers C. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=item * |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
For all C, compare C with C. If C is greater |
49
|
|
|
|
|
|
|
then C, assign a 0-Bit to the number. Otherwise, assign a |
50
|
|
|
|
|
|
|
1-Bit. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item * |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Find all sequences of equal Bits. For every sequence, increment |
55
|
|
|
|
|
|
|
a counter for the length C of that sequence. (Regardless of whether it's |
56
|
|
|
|
|
|
|
a sequence of 1's or 0's.) |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=item * |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
For uncorrelated random numbers, the number of sequences C |
61
|
|
|
|
|
|
|
of length C in the set of C random numbers is expected to be: |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
N(k) = 2*((k^2+3*k+1)*N - (k^3+3*k^2-k-4)) / (k+3)! |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=back |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 METHODS |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 new |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Creates a new random number tester. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=cut |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub new { |
78
|
1
|
|
|
1
|
1
|
16
|
my $proto = shift; |
79
|
1
|
|
33
|
|
|
8
|
my $class = ref($proto)||$proto; |
80
|
|
|
|
|
|
|
|
81
|
1
|
|
|
|
|
3
|
my $self = { |
82
|
|
|
|
|
|
|
data => undef, |
83
|
|
|
|
|
|
|
}; |
84
|
|
|
|
|
|
|
|
85
|
1
|
|
|
|
|
4
|
bless $self => $class; |
86
|
|
|
|
|
|
|
|
87
|
1
|
|
|
|
|
7
|
return $self; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head2 set_data |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Sets the random numbers to operate on. First argument |
93
|
|
|
|
|
|
|
must be either an array reference to an array of random |
94
|
|
|
|
|
|
|
numbers or a code reference. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
If the first argument is a code reference, the second |
97
|
|
|
|
|
|
|
argument must be an integer C. The code reference is |
98
|
|
|
|
|
|
|
called C-times and its return values are used as |
99
|
|
|
|
|
|
|
random numbers. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
The code reference semantics are particularily useful if |
102
|
|
|
|
|
|
|
you do not want to store all random numbers in memory at |
103
|
|
|
|
|
|
|
the same time. You can write a subroutine that, for example, |
104
|
|
|
|
|
|
|
generates and returns batches of 100 random numbers so no |
105
|
|
|
|
|
|
|
more than 101 of these numbers will be in memory at the |
106
|
|
|
|
|
|
|
same time. Note that if you return 100 numbers at once and |
107
|
|
|
|
|
|
|
pass in C, you will have a sequence of 5000 random |
108
|
|
|
|
|
|
|
numbers. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=cut |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub set_data { |
113
|
2
|
|
|
2
|
1
|
5289
|
my $self = shift; |
114
|
2
|
|
|
|
|
4
|
my $data = shift; |
115
|
2
|
100
|
|
|
|
18
|
if (_ARRAY($data)) { |
|
|
50
|
|
|
|
|
|
116
|
1
|
|
|
|
|
8
|
$self->{data} = $data; |
117
|
1
|
|
|
|
|
3
|
return 1; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
elsif (_CODE($data)) { |
120
|
1
|
|
|
|
|
4
|
$self->{data} = $data; |
121
|
1
|
|
|
|
|
238
|
my $n = shift; |
122
|
1
|
50
|
|
|
|
46
|
if (not _POSINT($n)) { |
123
|
0
|
|
|
|
|
0
|
croak("'set_data' needs an integer as second argument if the first argument is a code reference."); |
124
|
|
|
|
|
|
|
} |
125
|
1
|
|
|
|
|
16
|
$self->{n} = $n; |
126
|
1
|
|
|
|
|
4
|
return 1; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
else { |
129
|
0
|
|
|
|
|
0
|
croak("Invalid arguments to 'set_data'."); |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head2 test |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Runs the sequence test on the data that was previously set using |
136
|
|
|
|
|
|
|
C. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Returns three items: The first is the root mean square of the bin |
139
|
|
|
|
|
|
|
residuals divided by the number of random numbers. It I be |
140
|
|
|
|
|
|
|
used as a measure for the quality of the random number generator |
141
|
|
|
|
|
|
|
and should be as close to zero as possible. A better metric is to |
142
|
|
|
|
|
|
|
compare the following two return values. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
The second return value is a reference to the array of frequencies. |
145
|
|
|
|
|
|
|
An example is in order here. Generating one million random numbers, |
146
|
|
|
|
|
|
|
I get: |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
[0, 416765, 181078, 56318, 11486, 1056, 150] |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
This means there were no sequences of length 0 (obvious), 416765 |
151
|
|
|
|
|
|
|
sequences of length 1, etc. There were no sequences of length 7 or |
152
|
|
|
|
|
|
|
greater. This example is a bad random number generator! (It's a |
153
|
|
|
|
|
|
|
linear congruent generator with C<(a*x_i+c)%m> and C, |
154
|
|
|
|
|
|
|
C, C, and C). |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The third return value is similar in nature to the second in that it |
157
|
|
|
|
|
|
|
is a reference to an array containing sequence length frequencies. |
158
|
|
|
|
|
|
|
This one, however, contains the frequencies that would be expected for |
159
|
|
|
|
|
|
|
the given number of random numbers, were they uncorrelated. |
160
|
|
|
|
|
|
|
The number of bins has the maximum |
161
|
|
|
|
|
|
|
length of an occurring sequence as an upper limit. In the given example, |
162
|
|
|
|
|
|
|
you would get: (Dumped with Data::Dumper) |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
$VAR1 = [ |
165
|
|
|
|
|
|
|
'0', |
166
|
|
|
|
|
|
|
'416666.75', |
167
|
|
|
|
|
|
|
'183333.1', |
168
|
|
|
|
|
|
|
'52777.64722222222222222222222222222222222', |
169
|
|
|
|
|
|
|
'11507.89523809523809523809523809523809524', |
170
|
|
|
|
|
|
|
'2033.72068452380952380952380952380952381', |
171
|
|
|
|
|
|
|
'303.1287808641975308641975308641975308642', |
172
|
|
|
|
|
|
|
# ... |
173
|
|
|
|
|
|
|
]; |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Note that where I put in a C<# ...>, you would really see a couple |
176
|
|
|
|
|
|
|
more lines of numbers until the numbers go below an expected |
177
|
|
|
|
|
|
|
frequency of C<0.1>. |
178
|
|
|
|
|
|
|
For C and C, you get about |
179
|
|
|
|
|
|
|
39 sequences, C is expected to be found 4-5 times, etc. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub test { |
184
|
2
|
|
|
2
|
1
|
1628
|
my $self = shift; |
185
|
2
|
|
|
|
|
7
|
my $data = $self->{data}; |
186
|
|
|
|
|
|
|
|
187
|
2
|
50
|
|
|
|
8
|
if (not defined $data) { |
188
|
0
|
|
|
|
|
0
|
croak("Set data using 'set_data' first."); |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# bin counters |
192
|
2
|
|
|
|
|
2
|
my @bins; |
193
|
|
|
|
|
|
|
# current sequence type (> or <) |
194
|
2
|
|
|
|
|
3
|
my $current = undef; |
195
|
|
|
|
|
|
|
# current sequence length |
196
|
2
|
|
|
|
|
4
|
my $length = 0; |
197
|
|
|
|
|
|
|
# total number of random numbers |
198
|
2
|
|
|
|
|
3
|
my $numbers; |
199
|
|
|
|
|
|
|
|
200
|
2
|
100
|
|
|
|
7
|
if (_ARRAY($data)) { |
201
|
1
|
|
50
|
|
|
8
|
$current = ($data->[0] <=> $data->[1]) || 1; |
202
|
1
|
|
|
|
|
3
|
$length++; |
203
|
1
|
|
|
|
|
2
|
$numbers = @$data; |
204
|
|
|
|
|
|
|
|
205
|
1
|
|
|
|
|
4
|
foreach my $i (1 .. $#$data-1) { |
206
|
9998
|
|
50
|
|
|
24182
|
my $cmp = ($data->[$i] <=> $data->[$i+1]) || 1; |
207
|
9998
|
100
|
|
|
|
12521
|
if ($current == $cmp) { |
208
|
3275
|
|
|
|
|
3785
|
$length++; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
else { |
211
|
6723
|
|
|
|
|
6036
|
$current = $cmp; |
212
|
6723
|
|
|
|
|
5653
|
$bins[$length]++; |
213
|
6723
|
|
|
|
|
6978
|
$length = 1; |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
} |
216
|
1
|
|
|
|
|
6
|
$bins[$length]++; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
else { # CODE |
219
|
1
|
|
|
|
|
2
|
my @cache; |
220
|
1
|
|
|
|
|
4
|
my $calls = $self->{n}; |
221
|
1
|
|
|
|
|
2
|
my $first_run = 1; |
222
|
1
|
|
|
|
|
4
|
foreach (1..$calls) { |
223
|
|
|
|
|
|
|
# get new data |
224
|
100
|
|
|
|
|
254
|
push @cache, $data->(); |
225
|
|
|
|
|
|
|
# first run => initialize with first comparison |
226
|
100
|
100
|
66
|
|
|
3692
|
if ($first_run and @cache > 1) { |
227
|
1
|
|
50
|
|
|
4
|
$current = ($cache[0] <=> $cache[1]) || 1; |
228
|
1
|
|
|
|
|
2
|
shift @cache; |
229
|
1
|
|
|
|
|
1
|
$length++; # == 1 |
230
|
1
|
|
|
|
|
2
|
$numbers++; # == 1 |
231
|
1
|
|
|
|
|
1
|
$first_run = 0; |
232
|
|
|
|
|
|
|
} |
233
|
100
|
|
|
|
|
205
|
while (@cache > 1) { |
234
|
9998
|
|
|
|
|
9812
|
$numbers++; |
235
|
9998
|
|
|
|
|
9960
|
my $this = shift @cache; |
236
|
9998
|
|
50
|
|
|
19492
|
my $cmp = ($this <=> $cache[0]) || 1; |
237
|
9998
|
100
|
|
|
|
13538
|
if ($current == $cmp) { |
238
|
3273
|
|
|
|
|
6710
|
$length++; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
else { |
241
|
6725
|
|
|
|
|
6112
|
$current = $cmp; |
242
|
6725
|
|
|
|
|
6059
|
$bins[$length]++; |
243
|
6725
|
|
|
|
|
13532
|
$length = 1; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
} |
247
|
1
|
|
|
|
|
4
|
$bins[$length]++; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
2
|
|
|
|
|
10
|
my @expected = (0); # 0-bin is empty |
251
|
2
|
|
|
|
|
6
|
foreach my $bin (1..$#bins) { |
252
|
13
|
|
|
|
|
29162
|
$expected[$bin] = expected_frequency($bin, $numbers-1); |
253
|
|
|
|
|
|
|
} |
254
|
2
|
|
|
|
|
5270
|
my $last_bin = $#bins; |
255
|
2
|
|
|
|
|
11
|
while ($expected[$last_bin] > 0.1) { |
256
|
3
|
|
|
|
|
8307
|
$last_bin++; |
257
|
3
|
|
|
|
|
85
|
$expected[$last_bin] = expected_frequency($last_bin, $numbers-1); |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
2
|
|
|
|
|
5831
|
foreach my $bin (0..$last_bin) { |
261
|
18
|
100
|
|
|
|
40
|
$bins[$bin] = 0 if not defined $bins[$bin]; |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
|
265
|
2
|
|
|
|
|
8
|
my @diff = map { abs($bins[$_] - $expected[$_]) } 0..$#bins; |
|
18
|
|
|
|
|
5062
|
|
266
|
|
|
|
|
|
|
|
267
|
2
|
|
|
|
|
412
|
my $residual = 0; |
268
|
2
|
|
|
|
|
13
|
$residual += $_**2 for @diff; |
269
|
2
|
|
|
|
|
10229
|
$residual = sqrt($residual); |
270
|
2
|
|
|
|
|
11661
|
$residual = "$residual"; # de-bigfloatize |
271
|
|
|
|
|
|
|
|
272
|
2
|
|
|
|
|
126
|
@expected = map {"$_"} @expected; # de-bigfloatize |
|
18
|
|
|
|
|
711
|
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
return( |
275
|
2
|
|
|
|
|
139
|
$residual / ($numbers-1), |
276
|
|
|
|
|
|
|
\@bins, |
277
|
|
|
|
|
|
|
\@expected, |
278
|
|
|
|
|
|
|
); |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 SUBROUTINES |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=head2 expected_frequency |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Returns the expected frequency of the sequence length C |
286
|
|
|
|
|
|
|
in a set of C random numbers assuming uncorrelated random |
287
|
|
|
|
|
|
|
numbers. |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
Returns this as a L. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
Expects C and C as arguments. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
This subroutine is memoized. (See L.) |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=cut |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
memoize('expected_frequency'); |
298
|
|
|
|
|
|
|
sub expected_frequency { |
299
|
|
|
|
|
|
|
my $k = Math::BigFloat->new(shift); |
300
|
|
|
|
|
|
|
my $n = Math::BigFloat->new(shift); |
301
|
|
|
|
|
|
|
return( |
302
|
|
|
|
|
|
|
2 * ( ($k**2 + 3*$k + 1)*$n - ($k**3 + 3*$k**2 - $k - 4) ) |
303
|
|
|
|
|
|
|
/ faculty($k+3) |
304
|
|
|
|
|
|
|
); |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 faculty |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Computes the factulty of the first argument recursively as a |
310
|
|
|
|
|
|
|
L. This subroutine is memoized. (See L.) |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=cut |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
memoize('faculty'); |
315
|
|
|
|
|
|
|
sub faculty { |
316
|
|
|
|
|
|
|
my $n = shift; |
317
|
|
|
|
|
|
|
return Math::BigFloat->bone() if $n <= 1; |
318
|
|
|
|
|
|
|
return $n * faculty($n-1); |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
1; |
322
|
|
|
|
|
|
|
__END__ |