File Coverage

blib/lib/Statistics/ANOVA/JT.pm
Criterion Covered Total %
statement 110 110 100.0
branch 7 8 87.5
condition 4 6 66.6
subroutine 17 17 100.0
pod 4 4 100.0
total 142 145 97.9


line stmt bran cond sub pod time code
1             package Statistics::ANOVA::JT;
2            
3 4     4   58901 use 5.006;
  4         12  
  4         145  
4 4     4   15 use strict;
  4         4  
  4         135  
5 4     4   13 use warnings FATAL => 'all';
  4         7  
  4         139  
6 4     4   15 use base qw(Statistics::Data);
  4         4  
  4         2356  
7 4     4   83693 use Algorithm::Combinatorics qw(combinations);
  4         11107  
  4         227  
8 4     4   20 use List::AllUtils qw(sum0);
  4         5  
  4         131  
9 4     4   1824 use Statistics::Data::Rank;
  4         10755  
  4         114  
10 4     4   20 use Statistics::Lite qw(count max);
  4         6  
  4         3528  
11            
12             =head1 NAME
13            
14             Statistics::ANOVA::JT - Jonckheere-Terpstra statistics and test
15            
16             =head1 VERSION
17            
18             Version 0.01
19            
20             =cut
21            
22             our $VERSION = '0.01';
23            
24             =head1 SYNOPSIS
25            
26             use Statistics::ANOVA::JT;
27             my $jt = Statistics::ANOVA::JT->new();
28             $jt->load({1 => [2, 4, 6], 2 => [3, 3, 12], 3 => [5, 7, 11, 16]}); # note ordinal datanames
29             my $j_value = $jt->observed(); # or expected(), variance()
30             my ($z_value, $p_value) = $jt->zprob_test(ccorr => 2, tails => 1, correct_ties => 1);
31             # or without pre-loading:
32             $j_value = $jt->observed(data => {1 => [2, 4, 6], 2 => [5, 3, 12]});
33             # or for subset of loaded data:
34             $j_value = $jt->observed(lab => [1, 3]);
35            
36             =head1 DESCRIPTION
37            
38             Calculates Jonckheere-Terpstra statistics for sameness (common population) across given orders of independent variables. The statistics are based on a between-groups pooled ranking of the data, like the Kruskal-Wallis test, but, unlike Kruskall-Wallis that returns the same result regardless of order of levels, it takes into account ordinal value of the named data. As ordinal values, numerical intervals between the named values do not matter.
39            
40             Data-loading and retrieval are as provided in L, on which the JT object is Cd, so its other methods are available here.
41            
42             Return values are tested on installation against published examples: in Hollander and Wolfe (1999), for sample MStat output on L, and for the final I-value in the L example.
43            
44             =head1 SUBROUTINES/METHODS
45            
46             =head2 new
47            
48             $jt = Statistics::ANOVA::JT->new();
49            
50             New object for accessing methods and storing results. This "isa" Statistics::Data object.
51            
52             =head2 observed
53            
54             $val = $jt->observed(); # data pre-loaded
55             $val = $jt->observed(data => $hashref_of_arefs);
56            
57             Returns the statistic I: From between-group rankings of all possible pairwise splits of the data, accumulates I as the sum of I(I - 1)/2 Mann-Whitney I counts.
58            
59             Optionally, if the data have not been pre-loaded, send as named argument B.
60            
61             =cut
62            
63             sub observed {
64 2     2 1 926 my ( $self, %args ) = @_;
65 2         7 return _calc_j_value( _get_data($self, %args) );
66             }
67            
68             =head2 expected
69            
70             $val = $jt->expected(); # data pre-loaded
71             $val = $jt->expected(data => $hashref_of_arefs);
72            
73             Returns the expected value of the I statistic for the given data.
74            
75             =cut
76            
77             sub expected {
78 3     3 1 1350 my ( $self, %args ) = @_;
79 3         8 return _calc_jexp( _get_data($self, %args) );
80             }
81            
82             =head2 variance
83            
84             $val = $jt->variance(); # data pre-loaded
85             $val = $jt->variance(data => $hashref_of_arefs);
86            
87             Return the variance expected to occur in the I values for the given data.
88            
89             By default, the method accounts for and corrects for ties, but if C = 0, the returned value is the usual "null" distribution variance, otherwise with an elaborate correction accounting for the number of tied variables and each of their sizes, as offered by Hollander & Wolfe (1999) Eq 6.19, p. 204.
90            
91             =cut
92            
93             sub variance {
94 2     2 1 823 my ( $self, %args ) = @_;
95 2 100 66     14 if ( defined $args{'correct_ties'} and $args{'correct_ties'} == 0 ) {
96 1         3 return _calc_jvar_ig_ties( _get_data($self, %args) );
97             }
98             else {
99 1         3 return _calc_jvar_by_ties( _get_data($self, %args) );
100             }
101             }
102            
103             =head2 zprob_test
104            
105             $p_val = $jt->zprob_test(); # data pre-loaded
106             $p_val = $jt->zprob_test(data => $hashref_of_arefs);
107             ($z_val, $p_val) = $jt->zprob_test(); # get z-score too
108            
109             Performs a z-test on the data and returns the associated probability; or, if called in array context, the z-value itself and then the probability value.
110            
111             Rather than calculating the exact I

-value, calculates an expected I value and variance, to provide a normalized I for which the I

-value is read off the normal distribution. This is appropriate for "large" samples, e.g., greater-than 3 levels, with more than eight observations per level. Otherwise, read the value returned from C<$jt-Eobserved()> and look it up in a table of I-values, such as in Hollander & Wolfe (1999), p. 649ff.

112            
113             Optional arguments include B (as above), and B and B as in L. For example, to continuity correct by reducing the observed I-value by 1 (recommended in some texts), set B => 2 (for half on either side of the expected value; if B => 1, then 0.5 is taken off the observed deviation, and so on). The default is not to continuity correct.
114            
115             =cut
116            
117             sub zprob_test {
118 3     3 1 1492 my ( $self, %args ) = @_;
119 3         14 my $href = _get_data($self, %args);
120 3         1468 require Statistics::Zed;
121 3         20816 my $zed = Statistics::Zed->new();
122 3 100 66     80 my ( $z_value, $p_value ) = $zed->z_value(
123             observed => _calc_j_value($href),
124             expected => _calc_jexp($href),
125             variance =>
126             ( defined $args{'correct_ties'} and $args{'correct_ties'} == 0 )
127             ? _calc_jvar_ig_ties($href)
128             : _calc_jvar_by_ties($href),
129             %args
130             );
131 3 50       416 return wantarray ? ( $z_value, $p_value ) : $p_value;
132             }
133            
134             sub _calc_j_value {
135 5     5   7 my $data = shift;
136 5         26 my $rankd = Statistics::Data::Rank->new();
137 5         35 my $j_value = 0;
138 5         6 for my $pairs ( combinations( [ keys %{$data} ], 2 ) ) {
  5         29  
139 15         543 my ( $p1, $p2 ) = @{$pairs};
  15         24  
140 15         64 my $ranks_href = $rankd->ranks_between(
141             data => { $p1 => $data->{$p1}, $p2 => $data->{$p2} } );
142 15         39 my %counts = (
143 15         51 $p1 => count( @{ $data->{$p1} } ),
144 15         1725 $p2 => count( @{ $data->{$p2} } )
145             );
146 15         45 my $nprod = $counts{$p1} * $counts{$p2};
147 15         16 my @us = ();
148 15         20 for ( $p1, $p2 ) {
149 30         24 my $n = $counts{$_};
150 30         86 push @us,
151             $nprod +
152             ( ( $n * ( $n + 1 ) ) / 2 ) -
153 30         39 sum0( @{ $ranks_href->{$_} } );
154             }
155 15         33 $j_value += max(@us);
156             }
157 5         70 return $j_value;
158             }
159            
160             sub _calc_jexp {
161 6     6   9 my ($data) = @_;
162 6         10 my ( $nj, $sum_n ) = ( 0, 0 );
163 6         5 for ( keys %{$data} ) {
  6         16  
164 18         29 my $n = count( @{ $data->{$_} } );
  18         39  
165 18         31 $sum_n += $n;
166 18         23 $nj += $n**2;
167             }
168 6         40 return ( $sum_n**2 - $nj ) / 4;
169             }
170            
171             sub _calc_jvar_ig_ties {
172 2     2   2 my $data = shift;
173 2         3 my ( $sum_n, $v ) = ( 0, 0 );
174 2         2 for ( keys %{$data} ) {
  2         5  
175 6         5 my $n = count( @{ $data->{$_} } );
  6         14  
176 6         9 $sum_n += $n;
177 6         11 $v += $n**2 * ( 2 * $n + 3 );
178             }
179 2         8 return ( $sum_n**2 * ( 2 * $sum_n + 3 ) - $v ) / 72;
180             }
181            
182             sub _calc_jvar_by_ties {
183 3     3   4 my $data = shift;
184 3         4 my $sum_n = 0;
185 3         5 my @ns = ();
186 3         4 for ( keys %{$data} ) {
  3         8  
187 9         9 my $n = count( @{ $data->{$_} } );
  9         14  
188 9         18 $sum_n += $n;
189 9         8 push @ns, $n;
190             }
191 3         4 my $ng = scalar( keys( %{$data} ) );
  3         5  
192 3         13 my $rankd = Statistics::Data::Rank->new();
193 3         25 my ( $uranks_href, $xtied, $gn ) =
194             $rankd->ranks_between( data => $data );
195 3         705 my $a2 = sum0( map { $_ * ( $_ - 1 ) * ( 2 * $_ + 5 ) } @ns );
  9         29  
196 3         5 my $a3 = sum0( map { $_ * ( $_ - 1 ) * ( 2 * $_ + 5 ) } @{$xtied} );
  32         35  
  3         6  
197 3         5 my $b2 = sum0( map { $_ * ( $_ - 1 ) * ( $_ - 2 ) } @ns );
  9         14  
198 3         5 my $b3 = sum0( map { $_ * ( $_ - 1 ) * ( $_ - 2 ) } @{$xtied} );
  32         37  
  3         5  
199 3         7 my $c2 = sum0( map { $_ * ( $_ - 1 ) } @ns );
  9         15  
200 3         5 my $c3 = sum0( map { $_ * ( $_ - 1 ) } @{$xtied} );
  32         45  
  3         3  
201 3         37 return ( 1 / 72 * ( $gn * ( $gn - 1 ) * ( 2 * $gn + 5 ) - $a2 - $a3 ) ) +
202             ( 1 / ( 36 * $gn * ( $gn - 1 ) * ( $gn - 2 ) ) * $b2 * $b3 ) +
203             ( 1 / ( 8 * $gn * ( $gn - 1 ) ) * $c2 * $c3 );
204             }
205            
206             sub _get_data {
207 10     10   14 my ($self, %args) = @_;
208 10         9 my $hoa;
209 10 100       23 if (ref $args{'data'}) {
210 2         4 $hoa = delete $args{'data'};
211             }
212             else {
213 8         32 $hoa = $self->get_hoa_by_lab(%args);
214             }
215 10         315 return $hoa;
216             }
217            
218             =head1 REFERENCES
219            
220             Hollander, M., & Wolfe, D. A. (1999). I. New York, NY, US: Wiley.
221            
222             =head1 DEPENDENCIES
223            
224             L : used as a C for caching and retrieving data.
225            
226             L : used to implement between-sample ranking.
227            
228             L : for z-testing with optional continuity correction and tailing.
229            
230             L : provides the C algorithm to provide all possible pairs of data-names to loop thru in calculating the observed I value.
231            
232             L : provides the handy sum0() function
233            
234             =head1 BUGS
235            
236             Please report any bugs or feature requests to C, or through
237             the web interface at L. I will be notified, and then you'll
238             automatically be notified of progress on your bug as I make changes.
239            
240             =head1 SUPPORT
241            
242             You can find documentation for this module with the perldoc command.
243            
244             perldoc Statistics::ANOVA::JT
245            
246             You can also look for information at:
247            
248             =over 4
249            
250             =item * RT: CPAN's request tracker (report bugs here)
251            
252             L
253            
254             =item * AnnoCPAN: Annotated CPAN documentation
255            
256             L
257            
258             =item * CPAN Ratings
259            
260             L
261            
262             =item * Search CPAN
263            
264             L
265            
266             =back
267            
268             =head1 AUTHOR
269            
270             Roderick Garton, C<< >>
271            
272             =head1 LICENSE AND COPYRIGHT
273            
274             Copyright 2015 Roderick Garton.
275            
276             This program is free software; you can redistribute it and/or modify it
277             under the terms of either: the GNU General Public License as published
278             by the Free Software Foundation; or the Artistic License.
279            
280             See L for more information.
281            
282             =cut
283            
284             1; # End of Statistics::ANOVA::JT