File Coverage

blib/lib/Statistics/Sequences/Joins.pm
Criterion Covered Total %
statement 70 85 82.3
branch 30 42 71.4
condition 6 14 42.8
subroutine 16 19 84.2
pod 9 9 100.0
total 131 169 77.5


line stmt bran cond sub pod time code
1             package Statistics::Sequences::Joins;
2 3     3   41015 use 5.008008;
  3         7  
3 3     3   11 use strict;
  3         3  
  3         52  
4 3     3   16 use warnings FATAL => 'all';
  3         18  
  3         107  
5 3     3   9 use Carp qw(carp croak);
  3         11  
  3         170  
6 3     3   11 use base qw(Statistics::Sequences);
  3         3  
  3         1555  
7 3     3   66760 use List::AllUtils qw(true uniq);
  3         3  
  3         129  
8 3     3   1339 use Statistics::Zed 0.10;
  3         22542  
  3         2224  
9             $Statistics::Sequences::Joins::VERSION = '0.20';
10            
11             =pod
12            
13             =head1 NAME
14            
15             Statistics::Sequences::Joins - The Joins Test: Wishart-Hirschfeld statistics for frequency of alternations in a dichotomous sequence
16            
17             =head1 VERSION
18            
19             This is documentation for B of Statistics::Sequences::Joins.
20            
21             =head1 SYNOPSIS
22            
23             use Statistics::Sequences::Joins 0.20;
24             my $joins = Statistics::Sequences::Joins->new();
25             $joins->load([1, 0, 0, 0, 1, 1, 0, 1, 1, 1]); # bi-valued sequence
26             my $val = $joins->observed(); # or give "data => AREF" to stat methods
27             $val = $joins->expected(trials => 10, prob => .5); # sufficient, independent of data
28             $val = $joins->variance(trials => 10, prob => .5); # same
29             $val = $joins->z_value(tails => 1, ccorr => 1); # use loaded data
30             my ($z, $p) = $joins->z_value(tails => 1, ccorr => 1); # as above, but wantarray for z- and p-value
31             $p = $joins->p_value(tails => 1); # using loaded data
32             $val = $joins->z_value(trials => 10, observed => 4, tails => 1, ccorr => 1); # sufficicent
33             my $href = $joins->stats_hash(values => {observed => 1, p_value => 1}); # or other methods as attribs in the hashref
34             # print values to STDOUT:
35             $joins->dump(values => {observed => 1, expected => 1, p_value => 1}, format => 'line', flag => 1, precision_s => 3, precision_p => 7);
36            
37             =head1 DESCRIPTION
38            
39             A sequence of dichotomous, binary-valued, two-element events consists of zero or more alternations (or "joins") of those events. For example, joins are marked out with asterisks for the following sequence:
40            
41             0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0
42             * * * * * * * *
43            
44             So there's a join (of 0 and 1) at indices 1 and 2 (from zero), then immediately another join (of 1 and 0) at indices 2 and 3, and then another join at 5 and 6 ... for a total joincount of eight.
45            
46             This module provides methods to calculate and return this observed joincount, and also the expected joincount and its variance for the number of trials and probability of each event, following the limiting form of the probability distribution of the number of joins in a binary-valued sequence given by Wishart and Hirschfeld (1936). This assumes that the probability that an event can take one or another value at each trial is constant over all trials. The concept might seem similar to L but runs are counted for each continuous segment between alternations, while it is blind to the length of these repetitions and even to event-probabilities.
47            
48             =head1 METHODS
49            
50             Methods include those described in L, and have the same form as those in its other sub-modules, but naturally have specific operations as follows.
51            
52             =head2 new
53            
54             $joins = Statistics::Sequences::Joins->new();
55            
56             Returns a new Joins object. Expects/accepts no arguments but the classname.
57            
58             =head2 load
59            
60             $joins->load(@data); # anonymously
61             $joins->load(\@data);
62             $joins->load('sample1' => \@data); # labelled whatever
63            
64             Loads data anonymously or by name - see L in the Statistics::Data manpage for details on the various ways data can be loaded and then retrieved (more than shown here). Here, the data are checked to ensure that they contain no more than two unique elements--if not, a C and return of 0 occurs. Every load unloads all previous loads and any additions to them.
65            
66             Alternatively, skip this action; data don't have to be pre-loaded to use the stats methods here (see below).
67            
68             =cut
69            
70             sub load {
71 2     2 1 447 my $self = shift;
72 2         11 $self->SUPER::load(@_);
73 2         221 my $data = $self->access( index => -1 );
74 2         36 my @uniq = uniq( @{$data} );
  2         10  
75 2 50       5 if ( scalar @uniq > 2 ) {
76 0         0 carp __PACKAGE__, ' More than two elements were found in the data: '
77             . join( ' ', @uniq );
78 0         0 return 0;
79             }
80             else {
81 2         5 return 1;
82             }
83             }
84            
85             =head2 add, access, unload
86            
87             See L for these additional operations on data that have been loaded.
88            
89             =head2 observed
90            
91             $count = $joins->observed(); # assumes data have already been loaded
92             $count = $joins->observed(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1]);
93            
94             Returns the number of joins (or alternations) in a sequence - i.e., when, from the second trial onwards, the event on trial I doesn't equal the event on trial I - 1. For example, the following sequence adds up to 7 joins:
95            
96             Sequence: 1 0 0 0 1 0 0 1 0 1 1 0
97             JoinCount: 0 1 1 1 2 3 3 4 5 6 6 7
98            
99             Formally, for a sequence I = {I_I} indexed from zero,
100            
101             =for html
  J
N–1
Σ
i=1
 } 
0, aiai–1
1, otherwise
102            
103             The sequence to test can have been already Led, or it can be sent directly to this method, keyed as B. If no data are found by either of these ways, a C is heard.
104            
105             =cut
106            
107             sub observed {
108 8     8 1 722 my $self = shift;
109 8 100       15 my $args = ref $_[0] ? shift : {@_};
110 8 100       22 my $data = ref $args->{'data'} ? $args->{'data'} : $self->access($args);
111             croak 'No sequence of data to calculate joincout'
112             if not ref $data
113 8 50 33     65 or not scalar @{$data};
  8         14  
114 8         7 my $sum = 0;
115 8         6 for my $i ( 1 .. scalar @{$data} - 1 ) {
  8         16  
116 56 100       109 $sum++
117             if $data->[$i] ne $data->[ $i - 1 ]
118             ; # increment count if event is not same as last
119             }
120 8         12 return $sum;
121             }
122             *jco = \&observed;
123            
124             =head2 expected
125            
126             $val = $joins->expected(); # assumes data already loaded, uses default prob value (.5)
127             $val = $joins->expected(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1]); # count these data, use default prob value (.5)
128             $val = $joins->expected(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1], prob => .2); # count these data, use given prob value
129             $val = $joins->expected(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1], state => 1); # count off trial numbers and prob. of event
130             $val = $joins->expected(prob => .2, trials => 10); # use this trial number and probability of one of the 2 events
131            
132             Returns the expected number of joins between the two possible elements of the given data, or for data of the given attributes, from Wishart and Hirschfeld (1936, p. 228):
133            
134             =for html

  E[J] = 2(N – 1)pq

135            
136             where I is the number of observations/trials, I

is the expected probability of the joined event taking on its observed value, and I is (1 - I

), the expected probability of the joined event I taking on its observed value.

137            
138             The data to test can already have been Led, or you send it directly keyed as B. The data are only needed to count off the number of trials, and the proportion of 1s (or other given state of the two), if the B and B attributes are not defined. If B is defined, then B is worked out from the actual data (as long as there are some, or 1/2 is assumed). If B is not defined, B takes the value you give to it, or, if it too is not defined, then 1/2 (assuming equiprobability of the two events).
139            
140             Counting up the observed number of joins needs some data to count through, but getting the expectation and variance for the joincount can just be fed with the number of B, and the Bability of one of the two events.
141            
142             =cut
143            
144             sub expected {
145 11     11 1 881 my $self = shift;
146 11 100       23 my $args = ref $_[0] ? shift : {@_};
147 11         15 my ( $n, $p ) = _get_N_and_p( $self, $args );
148 11         37 return 2 * ( $n - 1 ) * $p * ( 1 - $p );
149             }
150             *jce = \&expected;
151            
152             =head2 variance
153            
154             $val = $joins->variance(); # assume data already "loaded" for counting
155             $val = $joins->variance(data => $aref); # use inplace array reference, will use default prob of 1/2
156             $val = $joins->variance(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1]); # count off trial numbers and prob. of event
157             $val = $joins->variance(data => [1, 0, 0, 0, 1, 0, 0, 1, 0, 1], prob => prob); # specify the event prob (recommended)
158             $val = $joins->variance(trials => number, prob => prob); # sufficient statistics
159            
160             Returns the expected variance in the number of joins for the given data, as estimated in Wishart and Hirschfeld (1936, p. 232), with a correction for small I (the second term) given by Burdick and Kelly (1977, p. 106, Eq. 20) that is trivial for very large I:
161            
162             =for html

  V[J] = 4Npq(1 – 3pq) – 2pq(3 – 10pq)

163            
164             with variables defined as above for L. The default operation applies the Burdick-Kelly correction; this can be dodged by specifying B => 0.
165            
166             The data to test can already have been Led, or it is given directly, keyed as B. The data are only needed to count off the number of trials, and estimate the expected probability of the joined event, if the B and B attributes aren't defined. If B is defined, then B is worked out from the actual data (as long as there are some, or expect a C). If B is not defined, B takes the given value or, if it too is not defined, then 1/2 (assuming equiprobability of the two events).
167            
168             =cut
169            
170             sub variance {
171 14     14 1 1076 my $self = shift;
172 14 100       29 my $args = ref $_[0] ? shift : {@_};
173 14         21 my ( $n, $p ) = _get_N_and_p( $self, $args );
174 14         16 my $pq = $p * ( 1 - $p );
175 14         28 my $var = 4 * $n * $pq * ( 1 - 3 * $pq );
176             $var -= 2 * $pq * ( 3 - 10 * $pq )
177             if not defined $args->{'ncorr'}
178 14 100 100     47 or $args->{'ncorr'} == 1;
179 14         42 return $var;
180             }
181             *jcv = \&variance;
182            
183             =head2 obsdev
184            
185             $v = $joins->obsdev(); # use data already loaded - anonymously; or specify its "label" or "index" - see observed()
186             $v = $joins->obsdev(data => [qw/blah bing blah blah blah/]); # use these data
187             $v = $joing->obsdev(observed => NUM, trials => NUM, prop => PROB); # sufficient
188            
189             Returns the observed deviation: the observed I expected joincount for the loaded/given sequence (I - I). Alias: C. Alternatively, the observed value might be given (as B => NUM), and so the method only has to get the expected value (as specified in L).
190            
191             =cut
192            
193             sub obsdev {
194 1     1 1 180 my $self = shift;
195 1 50       5 my $args = ref $_[0] ? shift : {@_};
196             my $obs =
197             defined $args->{'observed'}
198 1 50       4 ? $args->{'observed'}
199             : $self->observed($args);
200 1         3 return $obs - $self->expected($args);
201             }
202             *observed_deviation = \&obsdev;
203            
204             =head2 stdev
205            
206             $v = $joins->stdev(); # use data already loaded - anonymously; or specify its "label" or "index" - see observed()
207             $v = $joins->stdev(data => [qw/blah bing blah blah blah/]);
208            
209             Returns the standard deviation (square-root of the variance). Alias: C.
210            
211             =cut
212            
213             sub stdev {
214 2     2 1 401 return sqrt( variance(@_) );
215             }
216             *standard_deviation = \&stdev;
217            
218             =head2 z_value
219            
220             $val = $joins->z_value(); # data already loaded, use default windows and prob
221             $val = $joins->z_value(data => $aref, prob => .5, ccorr => 1, ncorr => 1);
222             ($zvalue, $pvalue) = $joins->z_value(data => $aref, prob => .5, ccorr => 1, tails => 2); # same but wanting an array, get the p-value too
223            
224             Returns the I-score from a test of joincount deviation, taking the joincount L away from that L and dividing by the root expected joincount L, by default with a continuity correction (B) to expectation. Called in list context, returns the I-score with its I

-value for the B (1 or 2) specified (2 by default).

225            
226             The data to test can already have been Led, or it is given directly, keyed as B.
227            
228             Other options are B (for the z_value) and B (for the p_value), and B for the (default) correction for small I.
229            
230             =cut
231            
232             sub z_value {
233 6     6 1 916 my $self = shift;
234 6 50       21 my $args = ref $_[0] ? shift : {@_};
235             my $obs =
236             defined $args->{'observed'}
237 6 100       15 ? $args->{'observed'}
238             : $self->observed($args);
239 6 100       11 my $ccorr = defined $args->{'ccorr'} ? $args->{'ccorr'} : 1;
240 6   50     23 my $tails = $args->{'tails'} || 2;
241 6         20 my $zed = Statistics::Zed->new();
242             my ( $zval, $pval ) = $zed->zscore(
243             observed => $obs,
244             expected => $self->expected($args),
245             variance => $self->variance($args),
246             ccorr => $ccorr,
247             tails => $tails,
248             precision_s => $args->{'precision_s'},
249 6         99 precision_p => $args->{'precision_p'},
250             );
251 6 100       659 return wantarray ? ( $zval, $pval ) : $zval;
252             }
253             *jzs = \&z_value;
254             *joincount_zscore = \&z_value;
255             *zscore = \&z_value;
256            
257             =head2 p_value
258            
259             $p = $joins->p_value(); # using loaded data and default args
260             $p = $joins->p_value(ccorr => 0|1, tails => 1|2); # as above, with options
261             $p = $joins->p_value(data => [1, 0, 1, 1, 0]); # directly giving data (by-passing load and read)
262             $p = $joins->p_value(trials => NUM, observed => NUM, prob => PROB); # without using data
263            
264             Returns the normal probability value for I-value given by taking the joincount L away from that L and dividing by the root expected joincount L, by default with a continuity correction (B) to expectation and with B => 2. Data are those already Led, or as directly keyed as B. In the absence of "data", the sufficient statistics of B and B are required (or, by default, B => 1/2 is used).
265            
266             =cut
267            
268             sub p_value {
269 1     1 1 181 return ( z_value(@_) )[1];
270             }
271             *test = \&p_value;
272             *joins_test = \&p_value;
273             *jct = \&p_value;
274            
275             =head2 stats_hash
276            
277             $href = $joins->stats_hash(values => {observed => 1, expected => 1, variance => 1, z_value => 1, p_value => 1}, prob => .5, ccorr => 1);
278            
279             Returns a hashref for the counts and stats as specified in its "values" argument, and with any options for calculating them (e.g., exact for p_value). See L for details. If calling via a "joins" object, the option "stat => 'joins'" is not needed (unlike when using the parent "sequences" object).
280            
281             =head2 dump
282            
283             $joins->dump(values => { observed => 1, variance => 1, p_value => 1}, exact => 1, flag => 1, precision_s => 3); # among other options
284            
285             Print Joins-test results to STDOUT. See L in the Statistics::Sequences manpage for details.
286            
287             =cut
288            
289             sub dump {
290 0     0 1 0 my $self = shift;
291 0 0       0 my $args = ref $_[0] ? $_[0] : {@_};
292 0         0 $args->{'stat'} = 'joins';
293 0         0 $self->SUPER::dump($args);
294 0         0 return;
295             }
296            
297             # returns two vars: (1) number of elements in the given "data" (sequence), and (2) relative frequency of a given state in the data
298             sub _get_N_and_p {
299 25     25   25 my ( $self, $args, $n, $data ) = @_;
300 25 100       31 if ( defined $args->{'trials'} ) {
301 9         9 $n = $args->{'trials'};
302             }
303             else {
304 16 100       29 $data = ref $args->{'data'} ? $args->{'data'} : $self->access($args);
305 16         92 $n = scalar @{$data};
  16         14  
306             }
307             my $p =
308             defined $args->{'prob'} ? $args->{'prob'}
309             : ( defined $args->{'state'} and defined $data )
310 25 50 33     39 ? _count_pfrq( $data, $args->{'state'} )
    100          
311             : .5;
312 25         28 return ( $n, $p );
313             }
314            
315             sub _count_pfrq {
316 0     0     my ( $aref, $state, $count ) = @_;
317 0 0 0       return .5 if not ref $aref or not scalar @{$aref};
  0            
318 0 0   0     $count++ if true { $_ eq $state } @{$aref};
  0            
  0            
319 0           return $count / scalar @{$aref};
  0            
320             }
321            
322             1;
323            
324             __END__