File Coverage

blib/lib/Statistics/Sequences/Runs.pm
Criterion Covered Total %
statement 171 189 90.4
branch 66 78 84.6
condition 9 16 56.2
subroutine 33 36 91.6
pod 18 18 100.0
total 297 337 88.1


line stmt bran cond sub pod time code
1             package Statistics::Sequences::Runs;
2 6     6   312006 use 5.008008;
  6         15  
3 6     6   19 use strict;
  6         6  
  6         115  
4 6     6   20 use warnings FATAL => 'all';
  6         16  
  6         214  
5 6     6   19 use Carp qw(carp croak);
  6         7  
  6         307  
6 6     6   20 use base qw(Statistics::Sequences);
  6         5  
  6         2612  
7 6     6   152904 use List::AllUtils qw(all mesh sum uniq);
  6         9  
  6         281  
8 6     6   21 use Number::Misc qw(is_even is_numeric);
  6         7  
  6         214  
9 6     6   2505 use Statistics::Zed 0.10;
  6         37208  
  6         158  
10 6     6   27 use String::Numeric qw(is_int);
  6         8  
  6         10166  
11             $Statistics::Sequences::Runs::VERSION = '0.22';
12            
13             =pod
14            
15             =head1 NAME
16            
17             Statistics::Sequences::Runs - The Runs Test: Wald-Wolfowitz runs test descriptives, deviation and combinatorics
18            
19             =head1 VERSION
20            
21             This is documentation for B of Statistics::Sequences::Runs.
22            
23             =head1 SYNOPSIS
24            
25             use strict;
26             use Statistics::Sequences::Runs 0.22;
27             my $runs = Statistics::Sequences::Runs->new();
28            
29             # Data are a sequence of dichotomous strings:
30             my @data = (qw/1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1/);
31             my $val;
32            
33             # - Pre-load data to use for all methods:
34             $runs->load(\@data);
35             $val = $runs->observed();
36             $val = $runs->expected();
37            
38             # - or give data as "data => $aref" to each method:
39             $val = $runs->observed(data => AREF);
40            
41             # - or give frequencies of the 2 "states" in a sequence:
42             $val = $runs->expected(freqs => [11, 9]); # works with other methods except observed()
43            
44             # Deviation ratio:
45             $val = $runs->z_value(ccorr => 1);
46            
47             # Probability of deviation from expectation:
48             my ($z, $p) = $runs->z_value(ccorr => 1, tails => 1); # dev. ratio with p-value
49             $val = $runs->p_value(tails => 1); # normal dist. p-value itself
50             $val = $runs->p_value(exact => 1, tails => 1); # p-value by combinatorics
51            
52             # Keyed list of descriptives etc.:
53             my $href = $runs->stats_hash(values => [qw/observed p_value/], exact => 1);
54            
55             # Print descriptives etc. in the same way:
56             $runs->dump(
57             values => [qw/observed expected p_value/],
58             exact => 1,
59             flag => 1,
60             precision_s => 3,
61             precision_p => 7
62             );
63             # prints: observed = 11.000, expected = 10.900, p_value = 0.5700167
64            
65             =head1 DESCRIPTION
66            
67             The module returns statistical information re Wald-type runs across a sequence of dichotmous events on one or more consecutive trials. For example, given an accuracy-based sequence composed of matches (H) and misses (M) like (H, H, M, H, M, M, M, M, H), there are 5 runs: 3 for Hs, 2 for Ms. This observed number of runs can be compared with the number expected to occur by chance over the number of trials, relative to the expected variance. More runs than expected ("negative serial dependence") can denote irregularity, instability, mixing up of alternatives. Fewer runs than expected ("positive serial dependence") can denote cohesion, insulation, isolation of alternatives. Both can indicate sequential dependency: either negative (an alternation bias), or positive (a repetition bias).
68            
69             The distribution of runs is asymptotically normal, and a deviation-based test of extra-chance occurrence when at least one alternative has more than 20 occurrences (Siegal rule), or both event occurrences exceed 10 (Kelly, 1982), is conventionally considered reliable; otherwise, the module provides an "exact test" based on combinatorics.
70            
71             For non-dichotomous, continuous or multinomial data, see L for potentially transforming them for runs descriptives/tests.
72            
73             =head1 SUBROUTINES/METHODS
74            
75             =head2 Data-handling
76            
77             =head3 new
78            
79             $runs = Statistics::Sequences::Runs->new();
80            
81             Returns a new Runs object. Expects/accepts no arguments but the classname.
82            
83             =head3 load
84            
85             $runs->load(ARRAY);
86             $runs->load(AREF);
87             $runs->load(foodat => AREF); # named whatever
88            
89             Loads a sequence anonymously or by name - see L in the Statistics::Data manpage for details on the various ways data can be loaded, updated and then retrieved. Every load unloads all previous loads and any updates to them.
90            
91             Alternatively, skip this action; data don't always have to be loaded to use the stats methods here. The sequence can be provided with each method call, as shown below, or by simply giving the observed counts of runs (apart, of course, for calculating these counts, when a specific sequence is needed).
92            
93             =head3 add, access, unload
94            
95             See L for these additional operations on data that have been loaded.
96            
97             =head2 Descriptives
98            
99             =head3 observed
100            
101             $v = $runs->observed(); # use the data loaded anonymously
102             $v = $runs->observed(name => 'foodat'); # ... or the name given on loading
103             $v = $runs->observed(data => AREF); # ... or just give the data now
104            
105             Returns the total observed number of runs in the loaded or given data. For example,
106            
107             $v = $runs->observed(data => [qw/H H H T T H H/]);
108            
109             returns 3 (for the runs 'HHH', 'TT' and 'HH').
110            
111             =cut
112            
113             sub observed {
114 58     58 1 5416 my ( $self, @args ) = @_;
115 58 100       108 my $args = ref $args[0] ? $args[0] : {@args};
116 58 100       111 return $args->{'observed'} if defined $args->{'observed'};
117 49         95 my $data = _get_data( $self, $args );
118 49         1074 my $observed = 0;
119 49         39 for ( 0 .. scalar @{$data} - 1 ) {
  49         89  
120 1250 100 100     2974 if ( $_ == 0 or $data->[$_] ne $data->[ $_ - 1 ] ) {
121 550         400 $observed++;
122             }
123             }
124 49         96 return $observed;
125             }
126            
127             =head3 observed_per_state
128            
129             @freq = $runs->observed_per_state(data => AREF);
130             $href = $runs->observed_per_state(data => AREF);
131            
132             Returns the number of runs per state - as a two-dimensional array where the first element gives the count for the first state in the data, and so for the second. A hashref is returned if not called in list context, the frequencies keyed by state. For example:
133            
134             @ari = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns (2, 1)
135             $ref = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns { H => 2, T => 1}
136            
137             Exceptions: If there was only one state in the loaded/given sequence (e.g., data => [qw/H H H/]), there is only one run and so the returned array will be one-dimensional, i.e., (1), and the returned hashref has only a single key (for this example: { H => 1 }). If there are no states, with an empty array loaded/given for the sequence, then the same applies, except the returned array is (0) and the returned hashref has the empty string as its single key ( q{} => 0 ).
138            
139             =cut
140            
141             sub observed_per_state {
142 6     6 1 2123 my ( $self, @args ) = @_;
143 6 50       17 my $args = ref $args[0] ? $args[0] : {@args};
144 6         10 my $data = _get_data( $self, $args );
145 6         143 my @states = uniq @{$data};
  6         31  
146 6         9 my @freqs = ();
147 6 100       5 if ( scalar @{$data} ) {
  6         37  
148 5 100       10 if ( scalar @states > 1 ) {
149 3 50       8 @freqs = $data->[0] eq $states[0] ? ( 1, 0 ) : ( 0, 1 );
150             }
151             else {
152 2         3 @freqs = (1);
153             }
154 5         4 for ( 1 .. scalar @{$data} - 1 ) {
  5         12  
155 64 100       84 if ( $data->[$_] ne $data->[ $_ - 1 ] ) {
156 11 100       13 if ( $data->[$_] eq $states[0] ) {
157 5         5 $freqs[0]++;
158             }
159             else {
160 6         6 $freqs[1]++;
161             }
162             }
163             }
164             }
165             else {
166 1         2 @states = (q{});
167 1         2 @freqs = (0);
168             }
169 6 100       32 return wantarray ? @freqs : { mesh @states, @freqs };
170             }
171            
172             =head3 expected
173            
174             $v = $runs->expected(); # or specify loaded data by "name", or give as "data"
175             $v = $runs->expected(data => AREF); # use these data
176             $v = $runs->expected(freqs => [POS_INT, POS_INT]); # no actual data; calculate from these two Ns
177            
178             Returns the expected number of runs across the loaded data. Expectation is given as follows:
179            
180             =for html

  E[R] = ( (2n1n2) / (n1 + n2) ) + 1

181            
182             where I(I is the number of observations of each element in the data.
183            
184             =cut
185            
186             sub expected {
187 36     36 1 1950 my ( $self, @args ) = @_;
188 36         74 my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
189 36         28 my $val;
190 36 100       53 if ($sum) {
191 33         55 $val = ( ( 2 * $n1 * $n2 ) / $sum ) + 1;
192             }
193 36         68 return $val;
194             }
195            
196             =head3 variance
197            
198             $v = $runs->variance(); # use data already loaded - anonymously; or specify its "name"
199             $v = $runs->variance(data => AREF); # use these data
200             $v = $runs->variance(freqs => [POS_INT, POS_INT]); # use these counts - not any particular sequence of data
201            
202             Returns the variance in the number of runs for the given data.
203            
204             =for html

  V[R] = ( (2n1n2)([2n1n2] – [n1 + n2]) ) / ( ((n1 + n2)2)((n1 + n2) – 1) )

205            
206             defined as above for L.
207            
208             The data to test can already have been Led, or you send it directly as a flat referenced array keyed as B.
209            
210             =cut
211            
212             sub variance {
213 35     35 1 2316 my ( $self, @args ) = @_;
214 35         50 my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
215 35         30 my $val;
216 35 100       45 if ($sum) {
217 32 100       48 if ( $sum < 2 ) {
218 4         4 $val = 0;
219             }
220             else {
221 28         78 $val =
222             ( ( 2 * $n1 * $n2 * ( ( 2 * $n1 * $n2 ) - $sum ) ) /
223             ( ( $sum**2 ) * ( $sum - 1 ) ) );
224             }
225             }
226 35         127 return $val;
227             }
228            
229             =head3 observed_deviation
230            
231             $v = $runs->obsdev(); # use data already loaded - anonymously; or specify its "name"
232             $v = $runs->obsdev(data => AREF); # use these data
233            
234             Returns the deviation of (difference between) observed and expected runs for the loaded/given sequence (I - I).
235            
236             I: obsdev
237            
238             =cut
239            
240             sub observed_deviation {
241 2     2 1 545 my ( $self, @args ) = @_;
242 2         10 return $self->observed(@args) - $self->expected(@args);
243             }
244             *obsdev = \&observed_deviation;
245            
246             =head3 standard_deviation
247            
248             $v = $runs->stdev(); # use data already loaded - anonymously; or specify its "name"
249             $v = $runs->stdev(data => AREF);
250             $v = $runs->stdev(freqs => [POS_INT, POS_INT]); # don't use actual data; calculate from these two Ns
251            
252             Returns square-root of the variance.
253            
254             I: stdev, stddev
255            
256             =cut
257            
258             sub standard_deviation {
259 2     2 1 547 my ( $self, @args ) = @_;
260 2         5 return sqrt $self->variance(@args);
261             }
262             *stdev = \&standard_deviation;
263             *stddev = \&standard_deviation;
264            
265             =head3 skewness
266            
267             $v = $runs->skewness(); # use data already loaded - anonymously; or specify its "name"
268             $v = $runs->skewness(data => AREF); # use these data
269            
270             Returns run skewness as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence.
271            
272             =cut
273            
274             sub skewness {
275 0     0 1 0 my ( $self, @args ) = @_;
276 0         0 my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
277 0         0 my $k3 = 0;
278 0 0 0     0 if ( $sum && $n1 != $n2 ) {
279 0         0 $k3 =
280             ( ( 2 * $n1 * $n2 ) / $sum**3 ) *
281             ( ( ( 16 * $n1**2 * $n2**2 ) / $sum**2 ) -
282             ( ( 4 * $n1 * $n2 * ( $sum + 3 ) ) / $sum ) +
283             3 * $sum );
284             }
285 0         0 return $k3;
286             }
287            
288             =head3 kurtosis
289            
290             $v = $runs->kurtosis(); # use data already loaded - anonymously; or specify its "name"
291             $v = $runs->kurtosis(data => AREF); # use these data
292            
293             Returns run kurtosis as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence.
294            
295             =cut
296            
297             sub kurtosis {
298 0     0 1 0 my ( $self, @args ) = @_;
299 0         0 my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
300 0         0 my $k4;
301 0 0       0 if ( defined $sum ) {
302 0         0 $k4 = ( ( 2 * $n1 * $n2 ) / $sum**4 ) * (
303             (
304             ( 48 * ( 5 * $sum - 6 ) * $n1**3 * $n2**3 ) /
305             ( $sum**2 * $sum**2 )
306             ) - (
307             ( 48 * ( 2 * $sum**2 + 3 * $sum - 6 ) * $n1**2 * $n2**2 ) /
308             ( $sum**2 * $sum )
309             ) + (
310             (
311             2 * ( 4 * $sum**3 + 45 * $sum**2 - 37 * $sum - 18 ) *
312             $n1 * $n2
313             ) / $sum**2
314             ) - ( 7 * $sum**2 + 13 * $sum - 6 )
315             );
316             }
317 0         0 return $k4;
318             }
319            
320             =head2 Distribution and tests
321            
322             =head3 pmf
323            
324             $p = $runs->pmf(data => AREF); # or no args to use last pre-loaded data
325             $p = $runs->pmf(observed => POS_INT, freqs => [POS_INT, POS_INT]);
326            
327             Implements the runs probability mass function, returning the probability for a particular number of runs given so many dichotomous events (e.g., as in Swed & Eisenhart, 1943, p. 66); i.e., for I' the observed number of runs, I

{I = I'}. The required function parameters are the observed number of runs, and the frequencies (counts) of each state in the sequence, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here or as pre-loaded). For derivation, see its public internal methods L and L, which make use of the choose() method from Orwant et al. (1999).

328            
329             =cut
330            
331             sub pmf {
332 2     2 1 360 my ( $self, @args ) = @_;
333 2         6 my ( $n1, $n2 ) = $self->bi_frequency(@args);
334 2         4 return _pmf_num( $self->observed(@args), $n1, $n2 ) /
335             _pmf_denom( $n1, $n2 );
336             }
337            
338             =head3 cdf
339            
340             $p = $runs->cdf(data => AREF); # or no args to use last pre-loaded data
341             $p = $runs->cdf(observed => POS_INT, freqs => [POS_INT, POS_INT]);
342            
343             Implements the cumulative distribution function for runs, returning the probability of obtaining the observed number of runs or less down to the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; i.e., for I' the observed number of runs, I

{I <= I'}. The summation is over the probability mass function L. The function parameters are the observed number of runs, and the frequencies (counts) of the two events, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here or as pre-loaded).

344            
345             =cut
346            
347             sub cdf {
348 5     5 1 545 my ( $self, @args ) = @_;
349 5         9 my ( $n1, $n2 ) = $self->bi_frequency(@args);
350 5         8 my $u = $self->observed(@args);
351 5         6 my $sum = 0;
352 5         7 for ( 2 .. $u ) {
353 39         41 $sum += _pmf_num( $_, $n1, $n2 );
354             }
355 5         7 return $sum / _pmf_denom( $n1, $n2 );
356             }
357            
358             =head3 cdfi
359            
360             $p = $runs->cdfi(data => AREF); # or no args for last pre-loaded data
361             $p = $runs->cdfi(observed => POS_INT, freqs => [POS_INT, POS_INT]);
362            
363             Implements the (inverse) cumulative distribution function for runs, returning the probability of obtaining more than the observed number of runs up from the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; ; i.e., for I' the observed number of runs, I

= 1 - I

{I <= I' - 1}. The summation is over the probability mass function L. The function parameters are the observed number of runs, and the frequencies (counts) of the two events, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here as B or as pre-loaded).

364            
365             =cut
366            
367             sub cdfi {
368 8     8 1 383 my ( $self, @args ) = @_;
369 8         13 my ( $n1, $n2 ) = $self->bi_frequency(@args);
370 8         15 my $u = $self->observed(@args);
371 8         7 my $sum = 0;
372 8         14 for ( 2 .. $u - 1 ) {
373 147         138 $sum += _pmf_num( $_, $n1, $n2 );
374             }
375 8         27 return 1 - $sum / _pmf_denom( $n1, $n2 );
376             }
377            
378             =head3 z_value
379            
380             $v = $runs->z_value(ccorr => BOOL); # use data already loaded - anonymously; or specify its "name"
381             $v = $runs->z_value(data => AREF, ccorr => BOOL);
382             ($zvalue, $pvalue) = $runs->z_value(data => AREF, ccorr => BOOL, tails => 1|2); # wanting an array, get p-value too
383            
384             Returns the normal deviate from a test of runcount deviation, taking the runcount expected from that observed and dividing by the root variance, by default with a continuity correction to expectation. Called wanting an array, returns the I-value with its I

-value for the B (1 or 2) given. The returned value is an empty string if the variance is undefined, empty or equals 0 (as when there is only one state in the sequence).

385            
386             The data to test can already have been Led, or sent directly as an aref keyed as B.
387            
388             Other options are B (for the z_value) and B (for the p_value).
389            
390             I: zscore, zvalue
391            
392             =cut
393            
394             sub z_value {
395 8     8 1 1895 my ( $self, @args ) = @_;
396 8 100       22 my $args = ref $args[0] ? $args[0] : {@args};
397 8         17 my $observed = $self->observed($args);
398 8 100       17 return q{} if $observed == 0;
399 7         52 my $zed = Statistics::Zed->new();
400             return $zed->z_value(
401             observed => $observed,
402             expected => $self->expected($args),
403             variance => $self->variance($args),
404             ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ),
405             tails => ( $args->{'tails'} || 2 ),
406             precision_s => $args->{'precision_s'},
407 7 100 50     128 precision_p => $args->{'precision_p'},
408             );
409             }
410             *zvalue = \&z_value;
411             *zscore = \&z_value;
412            
413             =head3 p_value
414            
415             $p = $runs->p_value(); # using loaded data and default args
416             $p = $runs->p_value(ccorr => BOOL, tails => 1|2); # normal-approx. for last-loaded data
417             $p = $runs->p_value(exact => BOOL); # calc combinatorially for observed >= or < than expectation
418             $p = $runs->p_value(data => AREF, exact => BOOL); # given data
419             $p = $runs->p_value(observed => POS_INT, freqs => [POS_INT, POS_INT]); # no data sequence, specify known params
420            
421             Returns the probability of getting the observed number of runs or a smaller number given the number of each of the two events. By default, a large sample is assumed, and the probability is obtained from the normalized deviation, as given by the L method.
422            
423             If the option B is defined and not zero, then the probability is worked out combinatorially, as per Swed & Eisenhart (1943), Eq. 1, p. 66 (and also Siegal, 1956, Eqs. 6.12a and 6.12b, p. 138). This is only implemented as a one-tailed test; the B option has no effect. This tests the hypotheses that there are either too many or too few runs relative to chance expectation; which of these hypotheses is tested is based on the expected value returned by the L method, using L if there are more runs than expected, or L if there are fewer runs than expected; use these functions themselves to specify the hypothesis to be tested.
424            
425             If there is only one state/event in the sequence, then the variance from the expected value of 1 is 0, and this method returns 1 (however long this single event sequence is, the observed number of runs cannot differ from the expected number of runs). If the sequence is empty, an empty string is returned.
426            
427             Output from these tests has been checked against the tables and examples in Swed & Eisenhart (given to 7 decimal places), and found to agree.
428            
429             The option B gives the returned I

-value to so many decimal places.

430            
431             I: pvalue
432            
433             =cut
434            
435             sub p_value {
436 19     19 1 4859 my ( $self, @args ) = @_;
437 19 100       59 my $args = ref $args[0] ? $args[0] : {@args};
438 19         34 my $vals = {
439             observed => $self->observed($args),
440             expected => $self->expected($args),
441             variance => $self->variance($args),
442             };
443 19 100       48 return $args->{'exact'}
444             ? _p_exact( $self, $args, $vals )
445             : _p_norm( $self, $args, $vals );
446             }
447             *pvalue = \&p_value;
448            
449             =head3 ztest_ok
450            
451             $bool = $runs->ztest_ok(); # use data already loaded - anonymously; or specify its "name"
452             $bool = $runs->ztest_ok(data => AREF);
453            
454             Returns true for the loaded sequence if its constituent sample numbers are sufficient for their expected runs to be normally approximated - using Siegal's (1956, p. 140) rule - ok if I of the two Is are greater than 20.
455            
456             =cut
457            
458             sub ztest_ok {
459 5     5 1 1658 my ( $self, @args ) = @_;
460 5         10 my ( $n1, $n2 ) = $self->bi_frequency(@args);
461 5 100 100     26 my $retval =
462             $n1 > 20 || $n2 > 20
463             ? 1
464             : 0;
465 5         9 return $retval;
466             }
467            
468             =head2 Utils
469            
470             Methods used internally, or for returning/printing descriptives, etc., in a bunch.
471            
472             =head3 bi_frequency
473            
474             @freq = $runs->bi_frequency(data => AREF); # or no args if using last pre-loaded data
475            
476             Returns frequency of the two elements - or croaks if there are more than 2, and gives zero for any absent.
477            
478             =cut
479            
480             sub bi_frequency {
481 99     99 1 961 my ( $self, @args ) = @_;
482 99 100       172 my $args = ref $args[0] ? $args[0] : {@args};
483            
484             # might be called internally from a method where an array of frequencies per state is optionally already given:
485             carp
486             'Argument named \'trials\' is deprecated; use \'freqs\' to give aref of frequencies per state'
487 99 50       153 if $args->{'trials'};
488 99 100       150 return @{ $args->{'freqs'} } if ref $args->{'freqs'};
  17         34  
489 82         94 my $data = _get_data( $self, $args );
490            
491             # build hash keying each element with its frequency:
492 82         1508 my %states = ();
493 82         65 for ( @{$data} ) {
  82         92  
494 1983         1395 $states{$_}++;
495             }
496            
497             # Check that number of states in the sequences are computable for Runs,
498             # and ensure that there is at least zero frequency for the 2 states:
499 82         69 my $nstates = scalar keys %states;
500 82         116 my @vals = values %states;
501            
502 82 100       182 if ( !$nstates ) {
    100          
    50          
503 7         12 @vals = ( 0, 0 );
504             }
505             elsif ( $nstates == 1 ) {
506 27         21 push @vals, 0;
507             }
508             elsif ( $nstates > 2 ) {
509 0         0 croak
510             'Cannot compute runs: More than two states were found in the data';
511             }
512 82         155 return @vals;
513             }
514            
515             =head3 n_max_seq
516            
517             $n = $runs->n_max_seq(); # loaded data
518             $n = $runs->n_max_seq(data => AREF); # this sequence
519             $n = $runs->n_max_seq(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts
520            
521             Returns the number of possible sequences for the two given state frequencies. So the urn contains I1 black balls and I2 white balls, well mixed; taking I1 + I2 drawings from it without replacement, any sequence has the same probability of occurring; how many different sequences of black and white balls are possible? For the two counts, this is "sum of I1 + I2 I I1", or:
522            
523             =for html

   Nmax = ( N1 + N2 )! / N1!N2!

524            
525             This is the denominator term in the runs L; not taking into account probability of obtaining so many of each event, of the proportion of black and white balls in the urn.
526            
527             =cut
528            
529             sub n_max_seq {
530 1     1 1 172 my ( $self, @args ) = @_;
531 1         3 return _pmf_denom( $self->bi_frequency(@args) );
532             }
533            
534             =head3 m_seq_k
535            
536             $n = $runs->m_seq_k(); # loaded data
537             $n = $runs->m_seq_k(data => AREF); # this sequence
538             $n = $runs->m_seq_k(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts
539            
540             Returns the number of sequences that can produce I runs from I elements of a single kind, with all other kinds of elements in the sequence assumed to be of a single kind, under the conditions of L. See Swed and Eisenhart (1943), or Barton and David (1958, p. 253). With the frequentist probability M / N, this is the numerator term in the runs L.
541            
542             =cut
543            
544             sub m_seq_k {
545 1     1 1 338 my ( $self, @args ) = @_;
546 1         3 return _pmf_num( $self->observed(@args), $self->bi_frequency(@args) );
547             }
548            
549             =head3 stats_hash
550            
551             $href = $runs->stats_hash(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options
552             $href = $runs->stats_hash(values =>
553             {
554             observed => BOOL,
555             expected => BOOL,
556             variance => BOOL,
557             z_value => BOOL,
558             p_value => BOOL,
559             },
560             exact => BOOL, # for p_value
561             ccorr => BOOL # for z_value
562             );
563            
564             Returns a hashref for the counts and stats as specified in its "values" argument, and with any options for calculating them (e.g., exact for p_value). See L for details. If calling via a "runs" object, the option "stat => 'runs'" is not needed (unlike when using the parent "sequences" object).
565            
566             =head3 dump
567            
568             $runs->dump(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options
569             $runs->dump(values =>
570             {
571             observed => BOOL,
572             expected => BOOL,
573             variance => BOOL,
574             z_value => BOOL,
575             p_value => BOOL,
576             },
577             precision_s => POS_INT,
578             precision_p => POS_INT, # for p_value
579             flag => BOOL, # for p_value
580             exact => BOOL, # for p_value
581             ccorr => BOOL # for z_value
582             );
583            
584             Print Runs-test results to STDOUT, including the stats as given a true value by their method names in a referenced hash of B, and with options relevant to thesemethods (see the template above). Default values to dump are observed() and p_value()). Optionally also give the data directly.
585            
586             =cut
587            
588             sub dump {
589 0     0 1 0 my ( $self, @args ) = @_;
590 0 0       0 my $args = ref $args[0] ? $args[0] : {@args};
591 0         0 $args->{'stat'} = 'runs';
592 0         0 $self->SUPER::dump($args);
593 0         0 return;
594             }
595            
596             =head3 dump_data
597            
598             $runs->dump_data(delim => "\n"); # print whatevers loaded (or specify by name, or as "data")
599            
600             See L for details.
601            
602             =cut
603            
604             # Private methods:
605            
606             sub _p_exact {
607 9     9   9 my ( $self, $args, $vals ) = @_;
608 9         7 my $pval;
609 9 100   18   26 if ( all { is_numeric($_) } ( $vals->{'observed'}, $vals->{'expected'} ) ) {
  18         106  
610             $pval =
611 8 100       111 ( $vals->{'observed'} - $vals->{'expected'} >= 0 )
612             ? $self->cdfi($args)
613             : $self->cdf($args);
614 8 100       20 if ( $args->{'precision_p'} ) {
615 2         27 $pval = sprintf q{%.} . $args->{'precision_p'} . qw{f}, $pval;
616             }
617             }
618             else {
619 1         5 $pval = q{};
620             }
621 9         36 return $pval;
622             }
623            
624             sub _p_norm {
625 10     10   10 my ( $self, $args, $vals ) = @_;
626 10         10 my $pval;
627 10 100       43 if ( !$vals->{'expected'} ) {
    100          
628 1         2 $pval = q{};
629             }
630             elsif ( !$vals->{'variance'} ) {
631 2         3 $pval = 1;
632             }
633             else {
634 7         25 my $zed = Statistics::Zed->new();
635             $pval = $zed->p_value(
636 7         41 %{$vals},
637             ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ),
638             tails => ( $args->{'tails'} || 2 ),
639 7 100 50     100 precision_p => $args->{'precision_p'},
640             );
641             }
642 10         744 return $pval;
643             }
644            
645             sub _pmf_num {
646 191     191   141 my ( $u, $m, $n ) = @_;
647 191         115 my $f;
648 191 100       220 if ( is_even($u) ) {
649 99         1255 my $k = $u / 2 - 1;
650 99         109 $f = 2 * _choose( $m - 1, $k ) * _choose( $n - 1, $k );
651             }
652             else {
653 92         1111 my $k = ( $u + 1 ) / 2;
654 92         103 $f =
655             _choose( $m - 1, $k - 1 ) * _choose( $n - 1, $k - 2 ) +
656             _choose( $m - 1, $k - 2 ) * _choose( $n - 1, $k - 1 );
657             }
658 191         230 return $f;
659             }
660            
661             sub _pmf_denom {
662 17     17   199 my @args = @_;
663 17         49 return _choose( sum(@args), $args[0] );
664             }
665            
666             sub _choose { # from Orwant et al., p. 573
667 583     583   378 my ( $n, $k ) = @_;
668 583         334 my ( $res, $j ) = ( 1, 1 );
669 583 50 33     1506 return 0 if $k > $n || $k < 0;
670 583 100       637 $k = ( $n - $k ) if ( $n - $k ) < $k;
671 583         682 while ( $j <= $k ) {
672 4309         2534 $res *= $n--;
673 4309         4752 $res /= $j++;
674             }
675 583         701 return $res;
676             }
677            
678             sub _sum_bi_frequency {
679 71     71   57 my ( $self, @args ) = @_;
680 71         136 my ( $n1, $n2 ) = $self->bi_frequency(@args);
681 71         51 my $sum;
682 71 50   142   196 if ( all { is_int($_) } ( $n1, $n2 ) ) {
  142         597  
683 71         344 $sum = $n1 + $n2;
684             }
685 71         157 return ( $sum, $n1, $n2 );
686             }
687            
688             sub _get_data {
689 137     137   113 my ( $self, $args ) = @_;
690 137 100       181 return ref $args->{'data'} ? $args->{'data'} : $self->get_aref( %{$args} );
  119         329  
691             }
692            
693             1;
694            
695             __END__