File Coverage

blib/lib/Statistics/Test/Sequence.pm
Criterion Covered Total %
statement 94 97 96.9
branch 15 18 83.3
condition 7 14 50.0
subroutine 10 10 100.0
pod 3 3 100.0
total 129 142 90.8


line stmt bran cond sub pod time code
1             package Statistics::Test::Sequence;
2              
3 3     3   72710 use 5.006;
  3         12  
  3         111  
4 3     3   17 use strict;
  3         4  
  3         126  
5 3     3   16 use warnings;
  3         5  
  3         143  
6              
7             our $VERSION = '0.01';
8              
9 3     3   16 use Carp qw/croak/;
  3         3  
  3         278  
10 3     3   2685 use Params::Util qw/_POSINT _ARRAY _CODE/;
  3         17814  
  3         294  
11 3     3   4639 use Math::BigFloat;
  3         80801  
  3         20  
12 3     3   163968 use Memoize;
  3         7273  
  3         2781  
13              
14             =head1 NAME
15              
16             Statistics::Test::Sequence - Sequence correlation test for random numbers
17              
18             =head1 SYNOPSIS
19              
20             use Statistics::Test::Sequence;
21             my $tester = Statistics::Test::Sequence->new();
22             $tester->set_data( [map {rand()} 1..1000000] );
23            
24             my ($metric, $actual_freq, $expected_freq) = $tester->test();
25             use Data::Dumper;
26             print "$metric\n";
27             print "Frequencies:\n";
28             print Dumper $actual_freq;
29             print "Expected frequencies:\n";
30             print Dumper $expected_freq;
31              
32             =head1 DESCRIPTION
33              
34             This module implements a sequence correlation test for random number
35             generators. It shows pairwise correlation between subsequent
36             random numbers.
37              
38             The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.)
39              
40             =over 2
41              
42             =item *
43              
44             Given C random numbers C.
45              
46             =item *
47              
48             For all C, compare C with C. If C is greater
49             then C, assign a 0-Bit to the number. Otherwise, assign a
50             1-Bit.
51              
52             =item *
53              
54             Find all sequences of equal Bits. For every sequence, increment
55             a counter for the length C of that sequence. (Regardless of whether it's
56             a sequence of 1's or 0's.)
57              
58             =item *
59              
60             For uncorrelated random numbers, the number of sequences C
61             of length C in the set of C random numbers is expected to be:
62              
63             N(k) = 2*((k^2+3*k+1)*N - (k^3+3*k^2-k-4)) / (k+3)!
64              
65             =back
66              
67             =head1 METHODS
68              
69             =cut
70              
71             =head2 new
72              
73             Creates a new random number tester.
74              
75             =cut
76              
77             sub new {
78 1     1 1 16 my $proto = shift;
79 1   33     8 my $class = ref($proto)||$proto;
80              
81 1         3 my $self = {
82             data => undef,
83             };
84              
85 1         4 bless $self => $class;
86              
87 1         7 return $self;
88             }
89              
90             =head2 set_data
91              
92             Sets the random numbers to operate on. First argument
93             must be either an array reference to an array of random
94             numbers or a code reference.
95              
96             If the first argument is a code reference, the second
97             argument must be an integer C. The code reference is
98             called C-times and its return values are used as
99             random numbers.
100              
101             The code reference semantics are particularily useful if
102             you do not want to store all random numbers in memory at
103             the same time. You can write a subroutine that, for example,
104             generates and returns batches of 100 random numbers so no
105             more than 101 of these numbers will be in memory at the
106             same time. Note that if you return 100 numbers at once and
107             pass in C, you will have a sequence of 5000 random
108             numbers.
109              
110             =cut
111              
112             sub set_data {
113 2     2 1 5289 my $self = shift;
114 2         4 my $data = shift;
115 2 100       18 if (_ARRAY($data)) {
    50          
116 1         8 $self->{data} = $data;
117 1         3 return 1;
118             }
119             elsif (_CODE($data)) {
120 1         4 $self->{data} = $data;
121 1         238 my $n = shift;
122 1 50       46 if (not _POSINT($n)) {
123 0         0 croak("'set_data' needs an integer as second argument if the first argument is a code reference.");
124             }
125 1         16 $self->{n} = $n;
126 1         4 return 1;
127             }
128             else {
129 0         0 croak("Invalid arguments to 'set_data'.");
130             }
131             }
132              
133             =head2 test
134              
135             Runs the sequence test on the data that was previously set using
136             C.
137              
138             Returns three items: The first is the root mean square of the bin
139             residuals divided by the number of random numbers. It I be
140             used as a measure for the quality of the random number generator
141             and should be as close to zero as possible. A better metric is to
142             compare the following two return values.
143              
144             The second return value is a reference to the array of frequencies.
145             An example is in order here. Generating one million random numbers,
146             I get:
147              
148             [0, 416765, 181078, 56318, 11486, 1056, 150]
149              
150             This means there were no sequences of length 0 (obvious), 416765
151             sequences of length 1, etc. There were no sequences of length 7 or
152             greater. This example is a bad random number generator! (It's a
153             linear congruent generator with C<(a*x_i+c)%m> and C,
154             C, C, and C).
155              
156             The third return value is similar in nature to the second in that it
157             is a reference to an array containing sequence length frequencies.
158             This one, however, contains the frequencies that would be expected for
159             the given number of random numbers, were they uncorrelated.
160             The number of bins has the maximum
161             length of an occurring sequence as an upper limit. In the given example,
162             you would get: (Dumped with Data::Dumper)
163              
164             $VAR1 = [
165             '0',
166             '416666.75',
167             '183333.1',
168             '52777.64722222222222222222222222222222222',
169             '11507.89523809523809523809523809523809524',
170             '2033.72068452380952380952380952380952381',
171             '303.1287808641975308641975308641975308642',
172             # ...
173             ];
174              
175             Note that where I put in a C<# ...>, you would really see a couple
176             more lines of numbers until the numbers go below an expected
177             frequency of C<0.1>.
178             For C and C, you get about
179             39 sequences, C is expected to be found 4-5 times, etc.
180              
181             =cut
182              
183             sub test {
184 2     2 1 1628 my $self = shift;
185 2         7 my $data = $self->{data};
186              
187 2 50       8 if (not defined $data) {
188 0         0 croak("Set data using 'set_data' first.");
189             }
190              
191             # bin counters
192 2         2 my @bins;
193             # current sequence type (> or <)
194 2         3 my $current = undef;
195             # current sequence length
196 2         4 my $length = 0;
197             # total number of random numbers
198 2         3 my $numbers;
199              
200 2 100       7 if (_ARRAY($data)) {
201 1   50     8 $current = ($data->[0] <=> $data->[1]) || 1;
202 1         3 $length++;
203 1         2 $numbers = @$data;
204              
205 1         4 foreach my $i (1 .. $#$data-1) {
206 9998   50     24182 my $cmp = ($data->[$i] <=> $data->[$i+1]) || 1;
207 9998 100       12521 if ($current == $cmp) {
208 3275         3785 $length++;
209             }
210             else {
211 6723         6036 $current = $cmp;
212 6723         5653 $bins[$length]++;
213 6723         6978 $length = 1;
214             }
215             }
216 1         6 $bins[$length]++;
217             }
218             else { # CODE
219 1         2 my @cache;
220 1         4 my $calls = $self->{n};
221 1         2 my $first_run = 1;
222 1         4 foreach (1..$calls) {
223             # get new data
224 100         254 push @cache, $data->();
225             # first run => initialize with first comparison
226 100 100 66     3692 if ($first_run and @cache > 1) {
227 1   50     4 $current = ($cache[0] <=> $cache[1]) || 1;
228 1         2 shift @cache;
229 1         1 $length++; # == 1
230 1         2 $numbers++; # == 1
231 1         1 $first_run = 0;
232             }
233 100         205 while (@cache > 1) {
234 9998         9812 $numbers++;
235 9998         9960 my $this = shift @cache;
236 9998   50     19492 my $cmp = ($this <=> $cache[0]) || 1;
237 9998 100       13538 if ($current == $cmp) {
238 3273         6710 $length++;
239             }
240             else {
241 6725         6112 $current = $cmp;
242 6725         6059 $bins[$length]++;
243 6725         13532 $length = 1;
244             }
245             }
246             }
247 1         4 $bins[$length]++;
248             }
249              
250 2         10 my @expected = (0); # 0-bin is empty
251 2         6 foreach my $bin (1..$#bins) {
252 13         29162 $expected[$bin] = expected_frequency($bin, $numbers-1);
253             }
254 2         5270 my $last_bin = $#bins;
255 2         11 while ($expected[$last_bin] > 0.1) {
256 3         8307 $last_bin++;
257 3         85 $expected[$last_bin] = expected_frequency($last_bin, $numbers-1);
258             }
259              
260 2         5831 foreach my $bin (0..$last_bin) {
261 18 100       40 $bins[$bin] = 0 if not defined $bins[$bin];
262             }
263              
264              
265 2         8 my @diff = map { abs($bins[$_] - $expected[$_]) } 0..$#bins;
  18         5062  
266              
267 2         412 my $residual = 0;
268 2         13 $residual += $_**2 for @diff;
269 2         10229 $residual = sqrt($residual);
270 2         11661 $residual = "$residual"; # de-bigfloatize
271              
272 2         126 @expected = map {"$_"} @expected; # de-bigfloatize
  18         711  
273              
274             return(
275 2         139 $residual / ($numbers-1),
276             \@bins,
277             \@expected,
278             );
279             }
280              
281             =head1 SUBROUTINES
282              
283             =head2 expected_frequency
284              
285             Returns the expected frequency of the sequence length C
286             in a set of C random numbers assuming uncorrelated random
287             numbers.
288              
289             Returns this as a L.
290              
291             Expects C and C as arguments.
292              
293             This subroutine is memoized. (See L.)
294              
295             =cut
296              
297             memoize('expected_frequency');
298             sub expected_frequency {
299             my $k = Math::BigFloat->new(shift);
300             my $n = Math::BigFloat->new(shift);
301             return(
302             2 * ( ($k**2 + 3*$k + 1)*$n - ($k**3 + 3*$k**2 - $k - 4) )
303             / faculty($k+3)
304             );
305             }
306              
307             =head2 faculty
308              
309             Computes the factulty of the first argument recursively as a
310             L. This subroutine is memoized. (See L.)
311              
312             =cut
313              
314             memoize('faculty');
315             sub faculty {
316             my $n = shift;
317             return Math::BigFloat->bone() if $n <= 1;
318             return $n * faculty($n-1);
319             }
320              
321             1;
322             __END__