File Coverage

blib/lib/Statistics/ANOVA/KW.pm
Criterion Covered Total %
statement 68 78 87.1
branch 15 28 53.5
condition 1 3 33.3
subroutine 14 16 87.5
pod 5 5 100.0
total 103 130 79.2


line stmt bran cond sub pod time code
1             package Statistics::ANOVA::KW;
2            
3 3     3   58166 use 5.006;
  3         11  
4 3     3   27 use strict;
  3         4  
  3         97  
5 3     3   24 use warnings FATAL => 'all';
  3         9  
  3         162  
6 3     3   16 use base qw(Statistics::Data);
  3         3  
  3         2379  
7 3     3   85726 use Carp qw(croak);
  3         6  
  3         182  
8 3     3   17 use List::AllUtils qw(sum0);
  3         4  
  3         179  
9 3     3   2707 use Math::Cephes qw(:dists);
  3         15865  
  3         930  
10 3     3   1799 use Statistics::Data::Rank;
  3         9273  
  3         92  
11 3     3   19 use Statistics::Lite qw(mean);
  3         3  
  3         2293  
12             $Statistics::ANOVA::KW::VERSION = '0.01';
13            
14             =head1 NAME
15            
16             Statistics::ANOVA::KW - Kruskall-Wallis statistics and test (nonparametric independent analysis of variance by ranks for nominally grouped data)
17            
18             =head1 VERSION
19            
20             This is documentation for B of Statistics::ANOVA::KW.
21            
22             =head1 SYNOPSIS
23            
24             use Statistics::ANOVA::KW;
25             my $kw = Statistics::ANOVA::KW->new();
26             $kw->load({1 => [2, 4, 6], 2 => [3, 3, 12], 3 => [5, 7, 11, 16]});
27             my $h_value = $kw->h_value(); # default used to correct for ties
28             my $p_value = $kw->chiprob_test(); # H taken as chi^2 distributed
29             my ($h_value, $df, $count, $p_value_by_chi, $phi) = $kw->chiprob_test(); # same as above, called in array context
30             my ($f_value, $df_b, $df_w, $p_value_by_f, $omega_sq) = $kw->fprob_test(); # F-equivalent value tests
31            
32             # or without pre-loading, and specify correct_ties as well:
33             $h_value = $kw->h_value(data => {1 => [2, 4, 6], 2 => [5, 3, 12]}, correct_ties => 1);
34             # or test only a subset of the loaded data:
35             $h_value = $kw->h_value(lab => [1, 3]);
36            
37             =head1 DESCRIPTION
38            
39             Performs calculations for the Kruskal-Wallis one-way nonparametric analysis of variance by ranks. This is for (at least) ordinal-level measurements of two or more samples of a nominal/categorical variable with equality of variances across the samples. The test is unreliable for small number of observations per sample (conventionally, all samples should have more than five observations). See L for more information, and discussions of the assumptions/interpretations, and pros/cons, of the test at L (pro) and L (con). Note that the Kruskall-Wallis test is often described as a test for I or more samples, in contrast to the Mann-Whitney test, which is restricted to two samples, but KW can also be used with only two samples: the absolute value of the I-value from a Mann-Whitney test equals the square-root of the KW statistic for two factors.
40            
41             Data-loading and retrieval are as provided in L, on which this module's class object is Cd, so its other methods are available here.
42            
43             Return values are tested on installation against published examples and output from other software (e.g., SPSS).
44            
45             =head2 new
46            
47             $kw = Statistics::ANOVA::KW->new();
48            
49             New object for accessing methods and storing results. This "isa" Statistics::Data object.
50            
51             =head2 load, add, unload
52            
53             $kw->load('a' => [1, 4, 3.2], 'b' => [6.5, 6.5, 9], 'c' => [3, 7, 4.4]);
54            
55             The given data can now be used by any of the following methods. This is inherited from L, and all its other methods are available here via the class object. Only passing of data as a hash of arrays (HOA) is supported for now. Once loaded, subsets of the loaded data can be tested by passing their names (or labels) in a referenced array to the argument B in the following methods (as supported by L). Once loaded, any non-numeric values in the samples are culled ahead of running the following methods.
56            
57             Alternatively, without pre-loading the data, directly give the following methods the HOA of data as the value for the optional named argument B.
58            
59             =cut
60            
61             =head2 h_value
62            
63             $h_value = $kw->h_value(data => \%data, correct_ties => 1);
64             $h_value = $kw->h_value(); # assuming data have already been loaded, & default of TRUE for correct_ties
65            
66             Returns the Kruskall-Wallis I statistic.
67            
68             =cut
69            
70             sub h_value {
71 4     4 1 8617 my ( $self, %args ) = ( shift, @_ );
72             my $data =
73             $args{'data'}
74 4 50       45 ? delete $args{'data'}
75             : $self->get_hoa_by_lab_numonly_indep(%args);
76             my $correct_ties = defined $args{'correct_ties'}
77 4 100       707 and $args{'correct_ties'} == 0 ? 0 : 1;
    50          
78 4         18 return ( _kw_stats( $data, $correct_ties ) )[0];
79             }
80            
81             =head2 chiprob_test
82            
83             ($chi_value, $df, $count, $p_value, $phi) = $kw->chiprob_test(data => HOA, correct_ties => 1); # H as chi-square
84             $p_value = $kw->chiprob_test(data => HOA, correct_ties => 1);
85             $p_value = $kw->chiprob_test(); # assuming data have already been loaded, & default of TRUE for correct_ties
86            
87             Performs the ANOVA and, assuming I-square distribution of the Kruskall-Wallis I value, returns its value, its degrees-of-freedom, the total number of observations (I), its I-square probability value, and I-coefficient as an estimate of effect-size ( = square-root of (I-square divided by I) ). Returns only the I

-value if called in scalar context. Default value of optional argument B is 1.

88            
89             =cut
90            
91             sub chiprob_test {
92 1     1 1 414 my ( $self, %args ) = ( shift, @_ );
93             my $data =
94             $args{'data'}
95 1 50       13 ? delete $args{'data'}
96             : $self->get_hoa_by_lab_numonly_indep(%args);
97             my $correct_ties = defined $args{'correct_ties'}
98 1 50       127 and $args{'correct_ties'} == 0 ? 0 : 1;
    50          
99 1         3 my ( $chi, $df_b, $count ) = _kw_stats( $data, $correct_ties );
100 1         35 my $p_value = chdtrc( $df_b, $chi ); # Math::Cephes fn
101             return
102             wantarray
103 1 50       7 ? ( $chi, $df_b, $count, $p_value, sqrt( $chi / $count ) )
104             : $p_value;
105             }
106            
107             =head2 chiprob_str
108            
109             $str = $kw->chiprob_str(data => HOA, correct_ties => 1);
110             $str = $kw->chiprob_str(); # assuming data have already been loaded, & default of TRUE for correct_ties
111            
112             Performs the same test as for L but returns not an array but a string of the conventional reporting form, e.g., chi^2(df, N = total observations) = chi_value, p = p_value.
113            
114             =cut
115            
116             sub chiprob_str {
117 0     0 1 0 my ( $self, %args ) = ( shift, @_ );
118 0         0 my ( $chi_value, $df, $count, $p_value, $phi ) = $self->chiprob_test(%args);
119 0         0 return "chi^2($df, N = $count) = $chi_value, p = $p_value, phi = $phi";
120             }
121            
122             =head2 fprob_test
123            
124             ($f_value, $df_b, $df_w, $p_value, $es_omega) = $kw->fprob_test(data => HOA, correct_ties => BOOL);
125             $p_value = $kw->fprob_test(data => HOA, correct_ties => BOOL);
126             $p_value = $kw->fprob_test(); # assuming data have already been loaded, & default of TRUE for correct_ties
127            
128             Performs the same test as above but transforms the I-square value into an I-distributed value, returning an array comprised of (1) this I-estimate value, its (2) between- and (3) within-groups degrees-of-freedom, (4) the associated probability of the value per the I-distribution, and (5) an estimate of the effect-size statistic, (partial) I-squared. The latter is returned only if L is installed and available. Called in scalar context, only the I-estimated I

-value is returned. The default value of the optional argument B is 1. This method has not been tested against sample/published data (not being provided in the usual software packages).

129            
130             =cut
131            
132             sub fprob_test {
133 2     2 1 1189 my ( $self, %args ) = ( shift, @_ );
134             my $data =
135             $args{'data'}
136 2 50       15 ? delete $args{'data'}
137             : $self->get_hoa_by_lab_numonly_indep(%args);
138             my $correct_ties = defined $args{'correct_ties'}
139 2 50       334 and $args{'correct_ties'} == 0 ? 0 : 1;
    50          
140 2         17 my ( $f_value, $df_b, $df_w ) = _f_stats( $data, $correct_ties );
141 2         64 my $p_value = fdtrc( $df_b, $df_w, $f_value ); # Math::Cephes fn
142 2 100       13 return $p_value if !wantarray;
143            
144 1         2 my $es_omega;
145 1         2 eval { require Statistics::ANOVA::EffectSize; };
  1         379  
146 1 50       8 if ( !$@ ) {
147 0         0 $es_omega = Statistics::ANOVA::EffectSize->omega_sq_partial_by_f(
148             f_value => $f_value,
149             df_b => $df_b,
150             df_w => $df_w
151             );
152             }
153 1         10 return ( $f_value, $df_b, $df_w, $p_value, $es_omega );
154             }
155            
156             =head2 fprob_str
157            
158             $str = $kw->chiprob_str(data => HOA, correct_ties => BOOL);
159             $str = $kw->chiprob_str(); # assuming data have already been loaded, using default of TRUE for correct_ties
160            
161             Performs the same test as for L but returns not an array but a string of the conventional reporting form, e.g., F(df_b, df_w) = f_value, p = p_value (and also, then, an estimate of partial I-squared, if available, see above).
162            
163             =cut
164            
165             sub fprob_str {
166 0     0 1 0 my ( $self, %args ) = ( shift, @_ );
167 0         0 my ( $f_value, $df_b, $df_w, $p_value, $es_omega ) =
168             $self->fprob_test(%args);
169 0         0 my $str = "F($df_b, $df_w) = $f_value, p = $p_value";
170 0 0       0 if ( defined $es_omega ) {
171 0         0 $str .= ', est_omega^2_p = ' . $es_omega;
172             }
173 0         0 return $str;
174             }
175            
176             sub _kw_stats {
177 7     7   26 my ( $data, $correct_ties ) = @_;
178 7         40 my ( $ranks_href, $ties_aref, $gn, $ties_var ) =
179             Statistics::Data::Rank->ranks_between( data => $data );
180             my $num = sum0(
181             map {
182 17         28 scalar @{ $ranks_href->{$_} } *
183 17         428 ( mean( @{ $ranks_href->{$_} } ) - ( ( $gn + 1 ) / 2 ) )**2
  17         53  
184 7         2772 } keys %{$ranks_href}
  7         24  
185             );
186 7         284 my $h = 12 / ( $gn * ( $gn + 1 ) ) * $num;
187            
188             # correction for ties:
189 7 50 33     57 $h /= ( 1 - ( $ties_var / ( $gn**3 - $gn ) ) )
190             unless defined $correct_ties and not $correct_ties;
191 7         8 return ( $h, ( scalar keys %{$ranks_href} ) - 1, $gn ); # H, df, and grand N
  7         51  
192             }
193            
194             sub _f_stats {
195 2     2   4 my ( $data, $correct_ties ) = @_;
196 2         5 my ( $h, $df_b, $n ) = _kw_stats( $data, $correct_ties );
197 2         9 my $df_w = sum0( map { scalar @{ $data->{$_} } - 1 } keys %{$data} );
  6         6  
  6         16  
  2         6  
198 2         5 my $n_bt = scalar keys( %{$data} );
  2         3  
199 2         6 my $f_val = ( $h / ( $n_bt - 1 ) ) / ( ( $n - 1 - $h ) / ( $n - $n_bt ) );
200 2         7 return ( $f_val, $df_b, $df_w );
201             }
202            
203             =head1 DEPENDENCIES
204            
205             L : used for summing.
206            
207             L : used for probability functions.
208            
209             L : used as base.
210            
211             L : used to calculate between-group sum-of ranks.
212            
213             =head1 REFERENCES
214            
215             Hollander, M., & Wolfe, D. A. (1999). I. New York, NY, US: Wiley.
216            
217             Rice, J. A. (1995). I. Belmont, CA, US: Duxbury.
218            
219             Sarantakos, S. (1993). I. Melbourne, Australia: MacMillan.
220            
221             Siegal, S. (1956). I. New York, NY, US: McGraw-Hill
222            
223             =head1 SEE ALSO
224            
225             L : Also a nonparametric ANOVA by ranks for independent samples, but where the ordinality of the numerical labels of the sample names (the order of the groups) is taken into account.
226            
227             L : Returns the I-value and its I-square I

-value (only), and implements the Newman-Keuls test for pairwise comparison.

228            
229             =head1 AUTHOR
230            
231             Roderick Garton, C<< >>
232            
233             =head1 BUGS
234            
235             Please report any bugs or feature requests to C, or through
236             the web interface at L. I will be notified, and then you'll
237             automatically be notified of progress on your bug as I make changes.
238            
239             =head1 SUPPORT
240            
241             You can find documentation for this module with the perldoc command.
242            
243             perldoc Statistics::ANOVA::KW
244            
245             You can also look for information at:
246            
247             =over 4
248            
249             =item * RT: CPAN's request tracker (report bugs here)
250            
251             L
252            
253             =item * AnnoCPAN: Annotated CPAN documentation
254            
255             L
256            
257             =item * CPAN Ratings
258            
259             L
260            
261             =item * Search CPAN
262            
263             L
264            
265             =back
266            
267            
268             =head1 ACKNOWLEDGEMENTS
269            
270            
271             =head1 LICENSE AND COPYRIGHT
272            
273             Copyright 2015 Roderick Garton.
274            
275             This program is free software; you can redistribute it and/or modify it
276             under the terms of either: the GNU General Public License as published
277             by the Free Software Foundation; or the Artistic License.
278            
279             See L for more information.
280            
281            
282             =cut
283            
284             1; # End of Statistics::ANOVA::KW