File Coverage

blib/lib/Statistics/Data/Rank.pm

Criterion	Covered	Total	%
statement	24	130	18.4
branch	0	28	0.0
condition	0	6	0.0
subroutine	8	17	47.0
pod	5	5	100.0
total	37	186	19.8

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::Data::Rank;
2
3	1			1		21048	use 5.006;
	1					5
	1					87
4	1			1		7	use strict;
	1					2
	1					47
5	1			1		5	use warnings FATAL => 'all';
	1					15
	1					62
6	1			1		6	use base qw(Statistics::Data);
	1					2
	1					492
7	1			1		7	use Carp qw(croak);
	1					1
	1					85
8	1			1		524	use List::AllUtils qw(sum0);
	1					12615
	1					125
9	1			1		452	use Statistics::Lite qw(count mean min);
	1					1444
	1					62
10	1			1		491	use String::Util qw(hascontent);
	1					5035
	1					949
11
12							=head1 NAME
13
14							Statistics::Data::Rank - Utilities for ranking data
15
16							=head1 VERSION
17
18							This is documentation for version 0.01.
19
20							=cut
21
22							our $VERSION = '0.01';
23
24							=head1 SYNOPSIS
25
26							use Statistics::Data::Rank;
27							my $rank = Statistics::Data::Rank->new();
28							my %vars = ('nodrug' => [174, 224, 260], 'placebo' => [261, 213, 231], 'morphine' => [199, 143, 113]);
29							my $ranks_href = $rankd->ranks_between(data => \%vars); # pre-load data:
30							$rankd->load(\%vars);
31							$ranks_href = $rankd->ranks_within();
32							my $sor = $rankd->sum_of_ranks_within(); # or _between()
33							# or specify which vars to rank/sum-rank:
34							$sor = $rankd->sum_of_ranks_within(lab => [qw/placebo morphine/]);
35
36							=head1 DESCRIPTION
37
38							Performs ranking of nammed data, either by an independent, between-variable method (as in Kruskall-Wallis test), or a dependent, cross-variable method (as in Friedman test). Methods return hash of ranks and sum-of-ranks. Data must be pre-loaded (as per L or sent to the methods with the argument B as a hash-ref of array-refs. Output is tested ahead of installation to ensure it matches published data (Siegal, 1956).
39
40							=head1 SUBROUTINES/METHODS
41
42							=head2 new
43
44							$rankd = Statistics::Data->new();
45
46							Constructor, expecting/accepting no args. Inherited from L.
47
48							=head2 load, add, unload
49
50							$rankd->load('a' => [1, 4], 'b' => [3, 7]);
51
52							The given data can now be used by any of the following methods. This is inherited from L, and all its other methods are available here via the class object.
53
54							=cut
55
56							=head2 ranks_between
57
58							$ranks_href = $rankd->ranks_between(data => $values_href);
59							$ranks_href = $rankd->ranks_between(lab => [qw/fez bop/]); # two, say, of previously loaded data
60							$ranks_href = $rankd->ranks_between(); # all of any previously loaded data
61							($ranks_href, $ties_aref, $nties) = $rankd->ranks_between(data => $values_href);
62
63							Given a hash of arefs where the keys are names (groups, treatments) of the sample data (each as an aref), return a hash of the ranks of each value under each name, after pooling all the data and ranking them with a link to their name. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegal, 1956, p. 188ff). If called in list context, then a reference to an array of the number of variables having the same value per its rank, and a scalar for the number of ties, are also returned. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
64
65							Used, e.g., by Kruskal-Wallis ANOVA, L ANOVA, Dwass-Steel comparison, and Worsley-cluster tests.
66
67							=cut
68
69							sub ranks_between {
70	0			0	1		my ( $self, %args ) = @_;
71	0	0					my $data =
72							$args{'data'}
73							? delete $args{'data'}
74							: $self->get_hoa_by_lab_numonly_indep(%args);
75	0						croak 'Variable data must be numeric and not empty'
76							if not ref $data
77	0	0	0				or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
78	0						my ( $ranks_href, $xtied_aref, $nties, $ties_var ) = _ranks_between($data);
79							return
80	0	0					wantarray ? ( $ranks_href, $xtied_aref, $nties, $ties_var ) : $ranks_href;
81							}
82
83							=head2 ranks_within
84
85							$ranks_href = $rankd->ranks_within(data => $values_href); # pass data now
86							$ranks_href = $rankd->ranks_within(); # using all of any previously loaded data
87							($ranks_href, $ties_href) = $rankd->ranks_within();
88
89							Given a hash of arefs where the keys are variable names, and the values are their actual sample data (each as an aref), returns a hash of the ranks of each value under each name, calculated dependently (per the values across individual indices). So if 'a' => [1, 3, 7] and 'b' => [4, 5, 6], the ranks returned will be 'a' => [1, 2, 6] and 'b' => [3, 4, 5]. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegal, 1956, p. 188ff). If called in list context, then a reference to hash of aref is also returned, giving the number of variables having the same value at each index for a rank. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
90
91							Used, e.g., by L and L tests.
92
93							=cut
94
95							sub ranks_within {
96	0			0	1		my ( $self, %args ) = @_;
97	0	0					my $data =
98							$args{'data'}
99							? delete $args{'data'}
100							: $self->get_hoa_by_lab_numonly_across(%args);
101	0						croak 'Variable data must be numeric and not empty'
102							if not ref $data
103	0	0	0				or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
104	0						my ( $ranks_href, $xtied_href ) = _ranks_within($data);
105	0	0					return wantarray ? ( $ranks_href, $xtied_href ) : $ranks_href;
106							}
107
108							=head2 sum_of_ranks_between
109
110							$sor = $rankd->sum_of_ranks_between(); # all pre-loaded data
111							$sor = $rankd->sum_of_ranks_between(data => HASHREF); # or using these data
112							$sor = $rankd->sum_of_ranks_between(lab => STRING); # or for a particular load
113
114							Returns the sum of ranks for (1) the entire dataset, either as given in argument B, or all pre-loaded variables; or for a particular pre-loaded dataset (variable) as given in the named argument B, where (assuming more than one variable), all values have been pooled and ordered by value per variable.
115
116							=cut
117
118							sub sum_of_ranks_between {
119	0			0	1		my ( $self, %args ) = @_;
120	0						my $lab = delete $args{'lab'};
121	0						my $ranks_href = $self->ranks_between(%args);
122	0	0					if ( hascontent($lab) ) {
123	0	0					croak 'Named variable does not exist'
124							if !exists $ranks_href->{$lab};
125	0						return sum0( @{ $ranks_href->{$lab} } );
	0
126							}
127							else {
128							return {
129	0						map { $_ => sum0( @{ $ranks_href->{$_} } ) }
	0
	0
130	0						keys %{$ranks_href}
131							};
132							}
133							}
134
135							=head2 sum_of_ranks_within
136
137							$sor = $rankd->sum_of_ranks_within(); # all pre-loaded data
138							$sor = $rankd->sum_of_ranks_within(data => HASHREF); # or using these data
139							$sor = $rankd->sum_of_ranks_within(lab => STRING); # or for a particular load
140
141							If called in array context, the sum-href is returned followed by the href of ties (useful for some statistic). Otherwise, it returns the href of summed ranks. The sum for a particular named variable can also be returned by the argument B.
142
143							=cut
144
145							sub sum_of_ranks_within {
146	0			0	1		my ( $self, %args ) = @_;
147	0						my $lab = delete $args{'lab'};
148	0						my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
149	0	0					if ( hascontent($lab) ) {
150	0	0					croak 'Named variable does not exist'
151							if !exists $ranks_href->{$lab};
152	0						return sum0( @{ $ranks_href->{$lab} } );
	0
153							}
154							else {
155	0						my $sums =
156	0						{ map { $_ => sum0( @{ $ranks_href->{$_} } ) } keys %{$ranks_href} };
	0
	0
157	0	0					return wantarray ? ( $sums, $xtied_href ) : $sums;
158							}
159							}
160
161							=head2 sumsq_ranks_within
162
163							Returns the sum of the squared sums-of-ranks calculated dependently (per the values across individual indices). Used in L. Expects a hashref of the variables, keyed by name. Called in list context, also returns a hash of the tied ranks.
164
165							=cut
166
167							sub sumsq_ranks_within {
168	0			0	1		my ( $self, %args ) = @_;
169	0						my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
170	0						my $sumsq = sum0( map { sum0( @{$_} )**2 } values %{$ranks_href} );
	0
	0
	0
171	0	0					return wantarray ? ( $sumsq, $xtied_href ) : $sumsq;
172							}
173
174							sub _ranks_between {
175	0			0			my $href_of_data = shift;
176	0						my $href_of_lab_by_values = _hash_of_aref_names_per_values($href_of_data);
177	0						my @sorted = sort { $a <=> $b } keys %{$href_of_lab_by_values};
	0
	0
178	0						my ( $nties, $ties_var, @xtied, %ranks ) = ( 1, 0 );
179	0						for my $i( 0 .. scalar @sorted - 1 )
180							{ # loop thru all values in order
181	0						my @groups = @{ $href_of_lab_by_values->{ $sorted[$i] } };
	0
182	0						my $nties_i = scalar @groups; # for values within all and any group
183	0	0					if ( $nties_i > 1 ) { # must be ties
184	0						$ties_var += ( $nties_i**3 - $nties_i );
185	0						for (@groups) {
186	0						push @{ $ranks{$_} }, mean( $nties .. $nties + $nties_i - 1 );
	0
187							}
188	0						$nties += $nties_i;
189							}
190							else {
191	0						push @{ $ranks{ $groups[0] } }, $nties++;
	0
192							}
193	0						push @xtied, $nties_i;
194							}
195	0						$nties--;
196	0						return ( \%ranks, \@xtied, $nties, $ties_var )
197							; # rank hash-of-arefs, tie-correction, N, ari of tied group Ns
198							}
199
200							sub _ranks_within {
201	0			0			my $href_of_data = shift;
202	0						my ( $old, $cur, $col, $ties, $av_rank, %ranks, %row_values ) = ( 0, 0 );
203	0						my %xtied = ();
204
205							# - set the averaged ranks, going down each index:
206							# - list the values at this index in each data-array:
207							# - a value might occur in more than one var at this index, so store an array of the vars:
208
209	0						for my $i ( 0 .. _min_n_of_hoa($href_of_data) - 1 ) {
210	0						for ( keys %{$href_of_data} ) {
	0
211	0						push @{ $row_values{ ( @{ $href_of_data->{$_} } )[$i] } },
	0
	0
212							$_; # hash with values as keys and names as arefs
213							}
214
215							# loop adapted from Boggs' "rank" function in Statistics-RankCorrelation:
216	0						for my $rval ( sort { $a <=> $b } keys %row_values ) {
	0
217	0						$ties =
218	0						scalar @{ $row_values{$rval} }; # N vars of same value per source
219	0						$cur += $ties;
220	0	0					if ( $ties > 1 ) {
221	0						$av_rank = $old + ( $ties + 1 ) / 2; # average tied data
222	0						for ( @{ $row_values{$rval} } ) {
	0
223	0						push @{ $ranks{$_} }, $av_rank;
	0
224							}
225	0						push @{ $xtied{$i} }, $ties;
	0
226							}
227							else {
228	0						push @{ $ranks{ $row_values{$rval}[0] } }, $cur;
	0
229	0						push @{ $xtied{$i} }, $ties;
	0
230							}
231	0						$old = $cur;
232							}
233	0						( $old, $cur, %row_values ) = ( 0, 0 );
234							}
235	0						return ( \%ranks, \%xtied );
236							}
237
238							# create a hash from a hash of named data where the keys are the values of the data linked to an aref of the names
239
240							sub _hash_of_aref_names_per_values {
241	0			0			my $hoa = shift;
242	0						my %grouped = ();
243	0						for my $name ( keys %{$hoa} ) {
	0
244	0						for ( @{ $hoa->{$name} } ) {
	0
245	0						push @{ $grouped{$_} }, $name;
	0
246							}
247							}
248	0						return \%grouped;
249							}
250
251							sub _min_n_of_hoa {
252	0			0			my $data = shift;
253	0						return min( map { count( @{ $data->{$_} } ) } keys %{$data} );
	0
	0
	0
254							}
255
256							=head1 DEPENDENCIES
257
258							L : used for summing.
259
260							L : used as base.
261
262							L : for basic decriptives.
263
264							L : string content checking.
265
266							=head1 DIAGNOSTICS
267
268							=over 4
269
270							=item Variable data must be numeric and not empty
271
272							Ced ahead of calculating (sum of) ranks between or within and there is no hashref of data available.
273
274							=item Named variable does not exist
275
276							Ced by sum_of_ranks_between and sum_of_ranks_within if the value of the optional argument B does not exist as pre-loaded data; either in a call to L or L, or as B in the present method.
277
278							=back
279
280							=head1 REFERENCES
281
282							Siegal, S. (1956). I. New York, NY, US: McGraw-Hill
283
284							=head1 AUTHOR
285
286							Roderick Garton, C<< >>
287
288							=head1 BUGS AND LIMITATIONS
289
290							Please report any bugs or feature requests to C, or through
291							the web interface at L. I will be notified, and then you'll
292							automatically be notified of progress on your bug as I make changes.
293
294							=head1 SUPPORT
295
296							You can find documentation for this module with the perldoc command.
297
298							perldoc Statistics::Data::Rank
299
300
301							You can also look for information at:
302
303							=over 4
304
305							=item * RT: CPAN's request tracker (report bugs here)
306
307							L
308
309							=item * AnnoCPAN: Annotated CPAN documentation
310
311							L
312
313							=item * CPAN Ratings
314
315							L
316
317							=item * Search CPAN
318
319							L
320
321							=back
322
323							=head1 ACKNOWLEDGEMENTS
324
325							L : loop for dealing with ties in calculating "ranks within" adapted from Boggs' "rank" function.
326
327							=head1 LICENSE AND COPYRIGHT
328
329							Copyright 2015 Roderick Garton.
330
331							This program is free software; you can redistribute it and/or modify it
332							under the terms of either: the GNU General Public License as published
333							by the Free Software Foundation; or the Artistic License.
334
335							See L for more information.
336
337
338							=cut
339
340							1; # End of Statistics::Data::Rank