File Coverage

blib/lib/Statistics/Data/Dichotomize.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Statistics::Data::Dichotomize;
2              
3 1     1   24662 use 5.006;
  1         5  
  1         44  
4 1     1   7 use strict;
  1         2  
  1         38  
5 1     1   6 use warnings;
  1         8  
  1         37  
6 1     1   535 use Statistics::Data;
  0            
  0            
7             use vars qw($VERSION @ISA @EXPORT);
8             use Exporter;
9             @ISA = qw(Statistics::Data Exporter);
10             $VERSION = '0.03';
11             use Carp qw(croak carp);
12             use Statistics::Lite qw(mean median mode);
13             use Number::Misc 'is_numeric'; # is_numeric('x');
14              
15             @EXPORT = (qw/split cut pool swing match binate shrink windowize/);
16              
17             =head1 NAME
18              
19             Statistics-Data-Dichotomize - Dichotomize one or more numerical or categorical sequences into a single two-valued one
20              
21             =head1 SYNOPSIS
22              
23             use Statistics::Data::Dichotomize 0.03;
24             my $ddat = Statistics::Data::Dichotomize->new();
25             $ddat->load(23, 24, 7, 55); # numerical data
26             my $aref = $ddat->split(value => 'median',); # or use swing(), pool(), binate(), shrink()
27             # - alternatively, without load()
28             $aref = $ddat->split(data => [23, 24, 7, 55], value => 20);
29             # or after a multi-sequence load:
30             $ddat->load(fiz =>[qw/c b c a a/], gok => [qw/b b b c a/]); # names are arbitary
31             $aref = $ddat->binate(data => 'fiz', oneis => 'c',); # returns (1, 0, 1, 0, 0)
32             $ddat = $ddat->match(); # or majoritymatch() for more than 2 sequences
33              
34             $ddat->print_dataline("%d\t"); # loaded sequence is also cached. prints ""
35            
36             $ddat->load([qw/c b c a a/], [qw/b b b c a/]); # categorical (stringy) data
37             printf("%d\t", @$ddat, "\n"); # DIY version. prints "0 1 0 0 1"
38            
39             # plus other methods from Statistics::Data
40              
41             =head1 DESCRIPTION
42              
43             Transform one or more sequences into a binomial, dichotomous, two-valued sequence by various methods. Each method returns the dichotomized sequences as a referenced array.
44              
45             =head1 METHODS
46              
47             =head2 new
48              
49             To create class object directly from this module, inheriting all the L<Statistics::Data|Statistics::Data> methods.
50              
51             =head2 load, add, read, unload
52              
53             Methods for loading, updating and retrieving data are inherited from L<Statistics::Data|Statistics::Data>. See that manpage for details.
54              
55             =cut
56              
57             =head2 Numerical data: Single sequence dichotomization
58              
59             =head3 split, cut
60              
61             $aref = $seq->split(value => 'median', equal => 'gt'); # split loaded data at its median (as per Statistics::Lite)
62             ($aref, $val) = $seq->split(data => \@data, value => \&Statistics::Lite::median); # same by reference, giving data, getting back median too
63             $aref = $seq->split(value => 23); # split anonymously cached data at a specific value
64             $aref = $seq->split(value => 'mean', data => 'blues'); # split named data at its arithmetical mean (as per Statistics::Lite)
65              
66             Reduce data by categorizing them as to whether they're numerically higher or low than a particular value, e.g., their median value. So the following data, when split over values greater than or equal to 5, yield the dichotomous sequence:
67              
68             @orig_data = (4, 3, 3, 5, 3, 4, 5, 6, 3, 5, 3, 3, 6, 4, 4, 7, 6, 4, 7, 3);
69             @split_data = (0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0);
70              
71             Arguments:
72              
73             =over 4
74              
75             =item value => 'mean|median|mode' - or a specific numerical value, or code reference
76              
77             Specify the value at which the data will be split. This could be the mean, median or mode (as calculated by L<Statistics::Lite|Statistics::Lite>), or a numerical value within the range of the data, or some appropriate subroutine - one that takes a list and returns a single descriptive about it. The default is the I<median>. The split-value, as specified by B<value>, can be retrieved as the second element returned if calling for an array.
78              
79             =item equal => 'I<gt>|I<lt>|I<0>'
80              
81             Specify how to split the data should the split-value (as specified by B<value>) be present in the data. The default value is 0: observations equal to the split-value are skipped; the L<Joins|Statistics::Sequences::Joins> test in particular assumes this. If B<equal =E<gt> 'I<gt>'>: all data-values I<greater than or equal to> the split-value will form one group, and all data-values less than the split-value will form another. To split with all values I<less than or equal to> in one group, and higher values in another, use B<equal =E<gt> 'I<lt>'>.
82              
83             =item data => 'I<string>'
84              
85             Refer to data to split, if not already loaded.
86              
87             =back
88              
89             =cut
90              
91             sub split {
92             my $self = shift;
93             my $args = ref $_[0] ? $_[0] : {@_};
94             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
95             croak __PACKAGE__, '::split All data must be numeric for dichotomizing about a split-value' if !$self->all_numeric($dat);
96             $args->{'value'} = 'median' if ! defined $args->{'value'};
97             $args->{'equal'} = 'gt' if ! defined $args->{'equal'};
98             my ($val, @seqs) = ();
99            
100             # Get a split-value:
101             if (! is_numeric($args->{'value'})) {
102             my $code = delete $args->{'value'};
103             no strict 'refs';
104             $val = $code->(@{$dat});
105             }
106             else {
107             $val = $args->{'value'};
108             }
109             # Categorize by number of observations above and below the split_value:
110             push @seqs, $_ > $val ? 1 : $_ < $val ? 0 : $args->{'equal'} eq 'gt' ? 1 : $args->{'equal'} eq 'lt' ? 0 : 1 foreach @{$dat};
111             $self->{'testdata'} = \@seqs;
112             $self->{'split_value'} = $val;
113             return wantarray ? (\@seqs, $val) : \@seqs;
114             }
115             *cut = \&split;
116              
117             =head3 swing
118              
119             $seq->swing();
120             $seq->swing(data => 'reds'); # if more than one are loaded, or a single one was loaded with a name
121              
122             Group by the rises and falls in the data. Each element in the named data-set is subtracted from its successor, and the result is replaced with a 1 if the difference represents an increase, or 0 if it represents a decrease. For example, the following numerical series (example from Wolfowitz, 1943, p. 283) produces the subsequent dichotomous series.
123              
124             @values = (qw/3 4 7 6 5 1 2 3 2/);
125             @dichot = (qw/1 1 0 0 0 1 1 0/);
126              
127             Dichotomously, the data can be seen as commencing with an ascending run of length 2, followed by a descending run of length 3, and so on. Note that the number of resulting observations is 1 less than the original number.
128              
129             Note that the critical region of the distribution lies (only) in the upper-tail; a one-tailed test of significance is appropriate.
130              
131             =over 4
132              
133             =item equal => 'I<gt>|I<lt>|I<rpt>|I<0>'
134              
135             The default result when the difference between two successive values is zero is to skip the observation, and move onto the next succession (B<equal =E<gt> 0>). Alternatively, you may wish to repeat the result for the previous succession; skipping only a difference of zero should it occur as the first result (B<equal =E<gt> 'rpt'>). Or, a difference greater than or equal to zero is counted as an increase (B<equal =E<gt> 'gt'>), or a difference less than or equal to zero is counted as a decrease. For example,
136              
137             @values = (qw/3 3 7 6 5 2 2/);
138             @dicho_def = (qw/1 0 0 0/); # First and final results (of 3 - 3, and 2 - 2) are skipped
139             @dicho_rpt = (qw/1 0 0 0 0/); # First result (of 3 - 3) is skipped, and final result repeats the former
140             @dicho_gt = (qw/1 1 0 0 0 1/); # Greater than or equal to zero is an increase
141             @dicho_lt = (qw/0 1 0 0 0 0/); # Less than or equal to zero is a decrease
142              
143             =back
144              
145             =cut
146              
147             sub swing {
148             my $self = shift;
149             my $args = ref $_[0] ? $_[0] : {@_};
150             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
151             croak __PACKAGE__, '::split All data must be numeric for dichotomizing about a split-value' if !$self->all_numeric($dat);
152             $args->{'equal'} = 0 if ! defined $args->{'equal'}; #- no default??
153             my ($i, $res, @seqs) = ();
154              
155             # Replace observations with the succession of rises and falls:
156             for ($i = 0; $i < (scalar @{$dat} - 1); $i++) {
157             $res = $dat->[($i + 1)] - $dat->[$i];
158             if ($res > 0) {
159             push @seqs, 1;
160             }
161             elsif ($res < 0) {
162             push @seqs, 0;
163             }
164             else {
165             if ($args->{'equal'} eq 'rpt') {
166             push @seqs, $seqs[-1] if scalar @seqs;
167             }
168             elsif ($args->{'equal'} eq 'gt') {
169             push @seqs, 1;
170             }
171             elsif ($args->{'equal'} eq 'lt') {
172             push @seqs, 0;
173             }
174             else {
175             next;
176             }
177             }
178             }
179             $self->{'testdata'} = \@seqs;
180             return \@seqs;
181             }
182              
183             =head2 Numerical data: Two sequence dichotomization
184              
185             See also the methods for categorical data where it is ok to ignore any order and intervals in your numerical data.
186              
187             =head3 pool
188              
189             $data_aref = $seq->pool(data => [\@aref1, \@aref2]);
190             $data_aref = $seq->pool(data => [$seq->read(index => 0), $seq->read(index => 1)]); # after $seq->load(\@aref1, \@aref2);
191             $data_aref = $seq->pool(data => [$seq->read(label => '1'), $seq->read(label => '2')]); # after $seq->load(1 => \@aref1, 2 => \@aref2);
192              
193             I<This is typically used for a Wald-Walfowitz test of difference between two samples - ranking by median.>
194              
195             Constructs a single series out of two series of cached I<numerical> data as a ranked pool, i.e., by pooling the data from each series according to the magnitude of their values at each trial, from lowest to heighest. Specifically, the values from both samples are pooled and ordered from lowest to highest, and then dichotomized into runs according to the sample from which neighbouring values come from. Another run occurs wherever there is a change in the source of the values. A non-random effect of, say, higher or lower values consistently coming from one sample rather than another, would be reflected in fewer runs than expected by chance.
196              
197             =cut
198              
199             sub pool {
200             my $self = shift;
201             my $args = ref $_[0] ? $_[0] : {@_};
202             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
203             $self->all_numeric($_) foreach @{$dat};
204             my ($dat1, $dat2) = @{$dat};
205             my $sum = scalar(@{$dat1}) + scalar(@{$dat2});
206             my ($i, $x, $y, @seqs) = (0);
207             my @dat = ();
208             $dat[0] = [sort {$a <=> $b} @{$dat1}];
209             $dat[1] = [sort {$a <=> $b} @{$dat2}];
210             while (scalar(@seqs) < $sum) {
211             $x = $dat[0]->[0];
212             $y = $dat[1]->[0];
213             $i = defined $x && defined $y ? $x < $y ? 0 : 1 : defined $x ? 0 : 1;
214             shift @{$dat[$i]};
215             push @seqs, $i;
216             }
217             $self->{'testdata'} = \@seqs;
218             return \@seqs;
219             }
220             ## DEV: consider: List::AllUtils::pairwise:
221             # @x = pairwise { $a + $b } @a, @b; # returns index-by-index sums
222              
223             =head2 Categorical data
224              
225             =head3 match
226              
227             $data_aref = $seq->match(data => [\@aref1, \@aref2], lag => signed integer, loop => 0|1); # with optional crosslag of the two sequences
228             $data_aref = $seq->match(data => [$seq->read(index => 0), $seq->read(index => 1)]); # after $seq->load(\@aref1, \@aref2);
229             $data_aref = $seq->match(data => [$seq->read(label => '1'), $seq->read(label => '2')]); # after $seq->load(1 => \@aref1, 2 => \@aref2);
230              
231             Reduce two lists of loaded data to two categories in a single array, according to the match between the elements at each index. Where the data-values are equal at a certain index, they will be represented with a 1; otherwise a 0. Numerical or stringy data can be equated. For example, the following two arrays would be reduced to the third, where a 1 indicates a match of identical values in the two data sources.
232              
233             @blues = (qw/1 3 3 2 1 5 1 2 4/);
234             @reds = (qw/4 3 1 2 1 4 2 2 4/);
235             @dicho = (qw/0 1 0 1 1 0 0 1 1/);
236              
237             The following options may be specified.
238              
239             =over 4
240              
241             =item data => [\@aref1, \@aref2]
242              
243             Specify two referenced arrays; no data, or more than 2, gets a C<croak>.
244              
245             =item lag => I<integer> (where I<integer> < number of observations I<or> I<integer> > -1 (number of observations) )
246              
247             Match the two data-sets by shifting the first named set ahead or behind the other data-set by B<lag> observations. The default is zero. For example, one data-set might be targets, and another responses to the targets:
248              
249             targets = cbbbdacdbd
250             responses = daadbadcce
251              
252             Matched as a single sequence of hits (1) and misses (0) where B<lag> = B<0> yields (for the match on "a" in the 6th index of both arrays):
253              
254             0000010000
255              
256             With B<lag> => 1, however, each response is associated with the target one ahead of the trial for which it was observed; i.e., each target is shifted to its +1 index. So the first element in the above responses (I<d>) would be associated with the second element of the targets (I<b>), and so on. Now, matching the two data-sets with a B<+1> lag gives two hits, of the 4th and 7th elements of the responses to the 5th and 8th elements of the targets, respectively:
257              
258             000100100
259              
260             making 5 runs. With B<lag> => 0, there are 3 runs. Lag values can be negative, so that B<lag> => -2 will give:
261              
262             00101010
263              
264             Here, responses necessarily start at the third element (I<a>), the first hits occurring when the fifth response-element corresponds to the the third target element (I<b>). The last response (I<e>) could not be used, and the number of elements in the hit/miss sequence became n-B<lag> less the original target sequence. This means that the maximum value of lag must be one less the size of the data-sets, or there will be no data.
265              
266             You can, alternatively, preserve all lagged data by looping any excess to the start or end of the criterion data. The number of observations will then always be the same, regardless of the lag. Matching the data in the example above with a lag of +1, with looping, creates an additional match between the final response and the first target (I<d>):
267              
268             1000100100
269              
270             =item loop => 0|1
271              
272             For circularized lagging), B<loop> => 1, and the size of the returned array is the same as those for the given data. For example, with a lag of +1, the last element in the "response" array is matched to the first element of the "target" array.
273              
274             =back
275              
276             =cut
277              
278             sub match {
279             my $self = shift;
280             my $args = ref $_[0] ? $_[0] : {@_};
281             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
282             $dat = $self->crosslag(lag => $args->{'lag'}, data => [$dat->[0], $dat->[1]], loop => $args->{'loop'}) if $args->{'lag'};
283             my $lim = scalar @{$dat->[0]} <= scalar @{$dat->[1]} ? scalar(@{$dat->[0]}) : scalar(@{$dat->[1]}); # ensure criterion data-set is smallest
284             my ($i, @seqs) = ();
285             for ($i = 0; $i < $lim; $i++) {
286             next if !defined $dat->[0]->[$i] || !defined $dat->[1]->[$i];
287             $seqs[$i] = $dat->[0]->[$i] eq $dat->[1]->[$i] ? 1 : 0;
288             }
289             $self->{'testdata'} = \@seqs;
290             return \@seqs;
291             }
292              
293             =head3 binate
294              
295             $seq->binate(oneis => 'E'); # optionally specify a state in the sequence to be set as "1"
296             $seq->binate(data => \@ari, oneis => 'E'); # optionally specify a state in the sequence to be set as "1"
297             # $seq->binate(oneis => 'E', data => 'targets'); # no longer supported
298              
299             A basic utility to convert a list of dichotomous categories into a list of 1s and zeroes, setting the first element in the list to 1 (or whatever is specified as "oneis") on all its occurrences in the list, and all other values in the list to zero. This is simply useful if you have categorical data with two states that, without assuming they have numerical properties, could still be assessed for, say, runs up-and-down, or turning-points. Naturally, this conversion is not meaningful, and should usually not be used, if the data are not categorically dichotomous, e.g., if they consist of the four DNA letters, or the five Zener symbols.
300              
301             =cut
302              
303             sub binate {
304             my $self = shift;
305             my $args = ref $_[0] ? $_[0] : {@_};
306             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
307             my $oneis = defined $args->{'oneis'} ? delete $args->{'oneis'} : $dat->[0];# What value set to 1 and others to zero?
308             my $seqs = [map {$_ eq $oneis ? 1 : 0} @{$dat}]; # replace observations with 1s and 0s
309             $self->{'testdata'} = $seqs;
310             return $seqs;
311             }
312              
313             =head2 Numerical or stringy data: Single sequence dichotimisation
314              
315             =head3 shrink, boolwin
316              
317             $seq->shrink(winlen => number, rule => CODE)
318              
319             Take non-overlapping slices, or windows, of a multinomial sequence of a given B<winlen>, and to make a true/false sequence out of them according to whether or not each slice passes a B<rule>. The B<rule> is a code reference that gets the data already L<load|load, add, read, unload>ed as an array reference, and so might be something like this:
320              
321             sub { return Statistics::Lite::mean(@$_) > 2 ? 1 : 0; }
322              
323             If B<winlen> is set to 3, this rule would make the following numerical sequence of 9 elements shrink into the following dichotomous (Boolean) sequence of 3 elements:
324              
325             @data = (1, 2, 3, 3, 3, 3, 4, 2, 1);
326             @means = (2, 3, 2.5 );
327             @dico = (0, 1, 1 );
328              
329             The B<rule> method must return boolean values to dichotomize the data, and B<winlen> should make up equally sized segments (no error is thrown if this isn't the case, the remainder just gets figured in the same way).
330              
331             =cut
332              
333             sub shrink {
334             my $self = shift;
335             my $args = ref $_[0] ? $_[0] : {@_};
336             my $dat = ref $args->{'data'} ? $args->{'data'} : $self->read($args);
337             my $lim = scalar @{$dat};
338             my $len = int(delete $args->{'winlen'});
339             $len ||= 1;
340             my $code = delete $args->{'rule'};
341             croak __PACKAGE__, '::shrink Need a code to Boolean shrink' if !$code or ref $code ne 'CODE';
342             my ($i, @seqs);
343             for ($i = 0; $i < $lim; $i += $len) {
344             push @seqs, $code->([@{$dat}[$i .. ($i + $len - 1)]]);
345             }
346             $self->{'testdata'} = \@seqs;
347             return \@seqs;
348             }
349             *boolwin = \&shrink;
350              
351             =head2 Helper methods
352              
353             =head3 crosslag
354              
355             @lagged_arefs = $dat->crosslag(data => [\@ari1, @ari2], lag => signed integer, loop => 0|1);
356             $aref_of_arefs = $dat->crosslag(data => [\@ari1, @ari2], lag => signed integer, loop => 0|1); # same but not "wanting array"
357              
358             Helper method: Takes two arrays and returns them cross-lagged against each other, shifting and popping values according to the number of "lags". Typically used when wanting to L<match|match> the two arrays against each other.
359              
360             =over 4
361              
362             =item lag => signed integer up to the number of elements
363              
364             Takes the first array sent as "data" as the reference or "target" array for the second "response" array to be shifted so many lags before or behind it. With no looping of the lags, this means the returned arrays are "lag"-elements smaller than the original arrays. For example, with lag => +1 (and loop => 0, the default):
365              
366             @t = qw(c p w p s) becomes (p w p s)
367             @r = qw(p s s w r) becomes (p s s w)
368              
369             =item loop => 0|1
370              
371             For circularized lagging), B<loop> => 1, and the size of the returned array is the same as those for the given data. For example, with a lag of +1, the last element in the "response" array is matched to the first element of the "target" array:
372              
373             @t = qw(c p w p s) becomes (p w p s c) (looped with +1)
374             @r = qw(p s s w r) becomes (p s s w r) (no effect)
375              
376             In this case, it might be more efficient to simply autolag the "target" sequence against itself.
377              
378             =back
379              
380             =cut
381              
382             sub crosslag {
383             my $self = shift;
384             my $args = ref $_[0] ? $_[0] : {@_};
385             my $lag = $args->{'lag'};
386             my $dat1 = $args->{'data'}->[0];
387             my $dat2 = $args->{'data'}->[1];
388             my $loop = $args->{'loop'};
389            
390             return ( wantarray ? ($dat1, $dat2) : [$dat1, $dat2] ) if !$lag or abs($lag) >= scalar @{$dat1};
391            
392             my @tgt = @{$dat1};
393             my @rsp = @{$dat2};
394              
395             if ($lag > 0) {
396             foreach (1 .. abs($lag) ) {
397             if ($loop) {
398             unshift(@tgt, pop @tgt);
399             }
400             else {
401             shift @tgt;
402             pop @rsp;
403             }
404             }
405             }
406             elsif ($lag < 0) {
407             foreach (1 .. abs($lag) ) {
408             if ($loop) {
409             push(@tgt, shift @tgt);
410             }
411             else {
412             pop @tgt;
413             shift @rsp;
414             }
415             }
416             }
417             return wantarray ? (\@tgt, \@rsp) : [\@tgt, \@rsp];
418             }
419              
420              
421             =head1 AUTHOR
422              
423             Roderick Garton, C<< <rgarton at cpan.org> >>
424              
425             =head1 REFERENCES
426              
427             Burdick, D. S., & Kelly, E. F. (1977). Statistical methods in parapsychological research. In B. B. Wolman (Ed.), I<Handbook of parapsychology> (pp. 81-130). New York, NY, US: Van Nostrand Reinhold. [Describes window-boolean reduction.]
428              
429             Swed, F., & Eisenhart, C. (1943). Tables for testing randomness of grouping in a sequence of alternatives. I<Annals of Mathematical Statistics>, I<14>, 66-87. [Describes pool method and test example.]
430              
431             Wolfowitz, J. (1943). On the theory of runs with some applications to quality control. I<Annals of Mathematical Statistics>, I<14>, 280-288. [Describes swings "runs up and down" and test example.]
432              
433             =head1 TO DO
434              
435             Sort option for pool method ?
436              
437             =head1 BUGS
438              
439             Please report any bugs or feature requests to C<bug-Statistics-Data-Dichotomize-0.03 at rt.cpan.org>, or through
440             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Statistics-Data-Dichotomize-0.03>. I will be notified, and then you'll
441             automatically be notified of progress on your bug as I make changes.
442              
443             =head1 SUPPORT
444              
445             You can find documentation for this module with the perldoc command.
446              
447             perldoc Statistics::Data::Dichotomize
448              
449             You can also look for information at:
450              
451             =over 4
452              
453             =item * RT: CPAN's request tracker (report bugs here)
454              
455             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Statistics-Data-Dichotomize-0.03>
456              
457             =item * AnnoCPAN: Annotated CPAN documentation
458              
459             L<http://annocpan.org/dist/Statistics-Data-Dichotomize-0.03>
460              
461             =item * CPAN Ratings
462              
463             L<http://cpanratings.perl.org/d/Statistics-Data-Dichotomize-0.03>
464              
465             =item * Search CPAN
466              
467             L<http://search.cpan.org/dist/Statistics-Data-Dichotomize-0.03/>
468              
469             =back
470              
471             =head1 LICENSE AND COPYRIGHT
472              
473             Copyright 2012-2013 Roderick Garton.
474              
475             This program is free software; you can redistribute it and/or modify it
476             under the terms of either: the GNU General Public License as published
477             by the Free Software Foundation; or the Artistic License.
478              
479             See http://dev.perl.org/licenses/ for more information.
480              
481             =cut
482              
483             1; # End of Statistics::Data::Dichotomize