File Coverage

blib/lib/Statistics/ANOVA/JT.pm

Criterion	Covered	Total	%
statement	110	110	100.0
branch	7	8	87.5
condition	4	6	66.6
subroutine	17	17	100.0
pod	4	4	100.0
total	142	145	97.9

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::ANOVA::JT;
2
3	4			4		58901	use 5.006;
	4					12
	4					145
4	4			4		15	use strict;
	4					4
	4					135
5	4			4		13	use warnings FATAL => 'all';
	4					7
	4					139
6	4			4		15	use base qw(Statistics::Data);
	4					4
	4					2356
7	4			4		83693	use Algorithm::Combinatorics qw(combinations);
	4					11107
	4					227
8	4			4		20	use List::AllUtils qw(sum0);
	4					5
	4					131
9	4			4		1824	use Statistics::Data::Rank;
	4					10755
	4					114
10	4			4		20	use Statistics::Lite qw(count max);
	4					6
	4					3528
11
12							=head1 NAME
13
14							Statistics::ANOVA::JT - Jonckheere-Terpstra statistics and test
15
16							=head1 VERSION
17
18							Version 0.01
19
20							=cut
21
22							our $VERSION = '0.01';
23
24							=head1 SYNOPSIS
25
26							use Statistics::ANOVA::JT;
27							my $jt = Statistics::ANOVA::JT->new();
28							$jt->load({1 => [2, 4, 6], 2 => [3, 3, 12], 3 => [5, 7, 11, 16]}); # note ordinal datanames
29							my $j_value = $jt->observed(); # or expected(), variance()
30							my ($z_value, $p_value) = $jt->zprob_test(ccorr => 2, tails => 1, correct_ties => 1);
31							# or without pre-loading:
32							$j_value = $jt->observed(data => {1 => [2, 4, 6], 2 => [5, 3, 12]});
33							# or for subset of loaded data:
34							$j_value = $jt->observed(lab => [1, 3]);
35
36							=head1 DESCRIPTION
37
38							Calculates Jonckheere-Terpstra statistics for sameness (common population) across given orders of independent variables. The statistics are based on a between-groups pooled ranking of the data, like the Kruskal-Wallis test, but, unlike Kruskall-Wallis that returns the same result regardless of order of levels, it takes into account ordinal value of the named data. As ordinal values, numerical intervals between the named values do not matter.
39
40							Data-loading and retrieval are as provided in L, on which the JT object is Cd, so its other methods are available here.
41
42							Return values are tested on installation against published examples: in Hollander and Wolfe (1999), for sample MStat output on L, and for the final I-value in the L example.
43
44							=head1 SUBROUTINES/METHODS
45
46							=head2 new
47
48							$jt = Statistics::ANOVA::JT->new();
49
50							New object for accessing methods and storing results. This "isa" Statistics::Data object.
51
52							=head2 observed
53
54							$val = $jt->observed(); # data pre-loaded
55							$val = $jt->observed(data => $hashref_of_arefs);
56
57							Returns the statistic I: From between-group rankings of all possible pairwise splits of the data, accumulates I as the sum of I(I - 1)/2 Mann-Whitney I counts.
58
59							Optionally, if the data have not been pre-loaded, send as named argument B.
60
61							=cut
62
63							sub observed {
64	2			2	1	926	my ( $self, %args ) = @_;
65	2					7	return _calc_j_value( _get_data($self, %args) );
66							}
67
68							=head2 expected
69
70							$val = $jt->expected(); # data pre-loaded
71							$val = $jt->expected(data => $hashref_of_arefs);
72
73							Returns the expected value of the I statistic for the given data.
74
75							=cut
76
77							sub expected {
78	3			3	1	1350	my ( $self, %args ) = @_;
79	3					8	return _calc_jexp( _get_data($self, %args) );
80							}
81
82							=head2 variance
83
84							$val = $jt->variance(); # data pre-loaded
85							$val = $jt->variance(data => $hashref_of_arefs);
86
87							Return the variance expected to occur in the I values for the given data.
88
89							By default, the method accounts for and corrects for ties, but if C = 0, the returned value is the usual "null" distribution variance, otherwise with an elaborate correction accounting for the number of tied variables and each of their sizes, as offered by Hollander & Wolfe (1999) Eq 6.19, p. 204.
90
91							=cut
92
93							sub variance {
94	2			2	1	823	my ( $self, %args ) = @_;
95	2	100	66			14	if ( defined $args{'correct_ties'} and $args{'correct_ties'} == 0 ) {
96	1					3	return _calc_jvar_ig_ties( _get_data($self, %args) );
97							}
98							else {
99	1					3	return _calc_jvar_by_ties( _get_data($self, %args) );
100							}
101							}
102
103							=head2 zprob_test
104
105							$p_val = $jt->zprob_test(); # data pre-loaded
106							$p_val = $jt->zprob_test(data => $hashref_of_arefs);
107							($z_val, $p_val) = $jt->zprob_test(); # get z-score too
108
109							Performs a z-test on the data and returns the associated probability; or, if called in array context, the z-value itself and then the probability value.
110
111							Rather than calculating the exact I -value, calculates an expected I value and variance, to provide a normalized I for which the I -value is read off the normal distribution. This is appropriate for "large" samples, e.g., greater-than 3 levels, with more than eight observations per level. Otherwise, read the value returned from C<$jt-Eobserved()> and look it up in a table of I-values, such as in Hollander & Wolfe (1999), p. 649ff.
112
113							Optional arguments include B (as above), and B and B as in L. For example, to continuity correct by reducing the observed I-value by 1 (recommended in some texts), set B => 2 (for half on either side of the expected value; if B => 1, then 0.5 is taken off the observed deviation, and so on). The default is not to continuity correct.
114
115							=cut
116
117							sub zprob_test {
118	3			3	1	1492	my ( $self, %args ) = @_;
119	3					14	my $href = _get_data($self, %args);
120	3					1468	require Statistics::Zed;
121	3					20816	my $zed = Statistics::Zed->new();
122	3	100	66			80	my ( $z_value, $p_value ) = $zed->z_value(
123							observed => _calc_j_value($href),
124							expected => _calc_jexp($href),
125							variance =>
126							( defined $args{'correct_ties'} and $args{'correct_ties'} == 0 )
127							? _calc_jvar_ig_ties($href)
128							: _calc_jvar_by_ties($href),
129							%args
130							);
131	3	50				416	return wantarray ? ( $z_value, $p_value ) : $p_value;
132							}
133
134							sub _calc_j_value {
135	5			5		7	my $data = shift;
136	5					26	my $rankd = Statistics::Data::Rank->new();
137	5					35	my $j_value = 0;
138	5					6	for my $pairs ( combinations( [ keys %{$data} ], 2 ) ) {
	5					29
139	15					543	my ( $p1, $p2 ) = @{$pairs};
	15					24
140	15					64	my $ranks_href = $rankd->ranks_between(
141							data => { $p1 => $data->{$p1}, $p2 => $data->{$p2} } );
142	15					39	my %counts = (
143	15					51	$p1 => count( @{ $data->{$p1} } ),
144	15					1725	$p2 => count( @{ $data->{$p2} } )
145							);
146	15					45	my $nprod = $counts{$p1} * $counts{$p2};
147	15					16	my @us = ();
148	15					20	for ( $p1, $p2 ) {
149	30					24	my $n = $counts{$_};
150	30					86	push @us,
151							$nprod +
152							( ( $n * ( $n + 1 ) ) / 2 ) -
153	30					39	sum0( @{ $ranks_href->{$_} } );
154							}
155	15					33	$j_value += max(@us);
156							}
157	5					70	return $j_value;
158							}
159
160							sub _calc_jexp {
161	6			6		9	my ($data) = @_;
162	6					10	my ( $nj, $sum_n ) = ( 0, 0 );
163	6					5	for ( keys %{$data} ) {
	6					16
164	18					29	my $n = count( @{ $data->{$_} } );
	18					39
165	18					31	$sum_n += $n;
166	18					23	$nj += $n**2;
167							}
168	6					40	return ( $sum_n**2 - $nj ) / 4;
169							}
170
171							sub _calc_jvar_ig_ties {
172	2			2		2	my $data = shift;
173	2					3	my ( $sum_n, $v ) = ( 0, 0 );
174	2					2	for ( keys %{$data} ) {
	2					5
175	6					5	my $n = count( @{ $data->{$_} } );
	6					14
176	6					9	$sum_n += $n;
177	6					11	$v += $n*2 ( 2 * $n + 3 );
178							}
179	2					8	return ( $sum_n*2 ( 2 * $sum_n + 3 ) - $v ) / 72;
180							}
181
182							sub _calc_jvar_by_ties {
183	3			3		4	my $data = shift;
184	3					4	my $sum_n = 0;
185	3					5	my @ns = ();
186	3					4	for ( keys %{$data} ) {
	3					8
187	9					9	my $n = count( @{ $data->{$_} } );
	9					14
188	9					18	$sum_n += $n;
189	9					8	push @ns, $n;
190							}
191	3					4	my $ng = scalar( keys( %{$data} ) );
	3					5
192	3					13	my $rankd = Statistics::Data::Rank->new();
193	3					25	my ( $uranks_href, $xtied, $gn ) =
194							$rankd->ranks_between( data => $data );
195	3					705	my $a2 = sum0( map { $_ * ( $_ - 1 ) * ( 2 * $_ + 5 ) } @ns );
	9					29
196	3					5	my $a3 = sum0( map { $_ * ( $_ - 1 ) * ( 2 * $_ + 5 ) } @{$xtied} );
	32					35
	3					6
197	3					5	my $b2 = sum0( map { $_ * ( $_ - 1 ) * ( $_ - 2 ) } @ns );
	9					14
198	3					5	my $b3 = sum0( map { $_ * ( $_ - 1 ) * ( $_ - 2 ) } @{$xtied} );
	32					37
	3					5
199	3					7	my $c2 = sum0( map { $_ * ( $_ - 1 ) } @ns );
	9					15
200	3					5	my $c3 = sum0( map { $_ * ( $_ - 1 ) } @{$xtied} );
	32					45
	3					3
201	3					37	return ( 1 / 72 * ( $gn * ( $gn - 1 ) * ( 2 * $gn + 5 ) - $a2 - $a3 ) ) +
202							( 1 / ( 36 * $gn * ( $gn - 1 ) * ( $gn - 2 ) ) * $b2 * $b3 ) +
203							( 1 / ( 8 * $gn * ( $gn - 1 ) ) * $c2 * $c3 );
204							}
205
206							sub _get_data {
207	10			10		14	my ($self, %args) = @_;
208	10					9	my $hoa;
209	10	100				23	if (ref $args{'data'}) {
210	2					4	$hoa = delete $args{'data'};
211							}
212							else {
213	8					32	$hoa = $self->get_hoa_by_lab(%args);
214							}
215	10					315	return $hoa;
216							}
217
218							=head1 REFERENCES
219
220							Hollander, M., & Wolfe, D. A. (1999). I. New York, NY, US: Wiley.
221
222							=head1 DEPENDENCIES
223
224							L : used as a C for caching and retrieving data.
225
226							L : used to implement between-sample ranking.
227
228							L : for z-testing with optional continuity correction and tailing.
229
230							L : provides the C algorithm to provide all possible pairs of data-names to loop thru in calculating the observed I value.
231
232							L : provides the handy sum0() function
233
234							=head1 BUGS
235
236							Please report any bugs or feature requests to C, or through
237							the web interface at L. I will be notified, and then you'll
238							automatically be notified of progress on your bug as I make changes.
239
240							=head1 SUPPORT
241
242							You can find documentation for this module with the perldoc command.
243
244							perldoc Statistics::ANOVA::JT
245
246							You can also look for information at:
247
248							=over 4
249
250							=item * RT: CPAN's request tracker (report bugs here)
251
252							L
253
254							=item * AnnoCPAN: Annotated CPAN documentation
255
256							L
257
258							=item * CPAN Ratings
259
260							L
261
262							=item * Search CPAN
263
264							L
265
266							=back
267
268							=head1 AUTHOR
269
270							Roderick Garton, C<< >>
271
272							=head1 LICENSE AND COPYRIGHT
273
274							Copyright 2015 Roderick Garton.
275
276							This program is free software; you can redistribute it and/or modify it
277							under the terms of either: the GNU General Public License as published
278							by the Free Software Foundation; or the Artistic License.
279
280							See L for more information.
281
282							=cut
283
284							1; # End of Statistics::ANOVA::JT