File Coverage

blib/lib/Statistics/Data/Rank.pm
Criterion Covered Total %
statement 24 130 18.4
branch 0 28 0.0
condition 0 6 0.0
subroutine 8 17 47.0
pod 5 5 100.0
total 37 186 19.8


line stmt bran cond sub pod time code
1             package Statistics::Data::Rank;
2            
3 1     1   21048 use 5.006;
  1         5  
  1         87  
4 1     1   7 use strict;
  1         2  
  1         47  
5 1     1   5 use warnings FATAL => 'all';
  1         15  
  1         62  
6 1     1   6 use base qw(Statistics::Data);
  1         2  
  1         492  
7 1     1   7 use Carp qw(croak);
  1         1  
  1         85  
8 1     1   524 use List::AllUtils qw(sum0);
  1         12615  
  1         125  
9 1     1   452 use Statistics::Lite qw(count mean min);
  1         1444  
  1         62  
10 1     1   491 use String::Util qw(hascontent);
  1         5035  
  1         949  
11            
12             =head1 NAME
13            
14             Statistics::Data::Rank - Utilities for ranking data
15            
16             =head1 VERSION
17            
18             This is documentation for version 0.01.
19            
20             =cut
21            
22             our $VERSION = '0.01';
23            
24             =head1 SYNOPSIS
25            
26             use Statistics::Data::Rank;
27             my $rank = Statistics::Data::Rank->new();
28             my %vars = ('nodrug' => [174, 224, 260], 'placebo' => [261, 213, 231], 'morphine' => [199, 143, 113]);
29             my $ranks_href = $rankd->ranks_between(data => \%vars); # pre-load data:
30             $rankd->load(\%vars);
31             $ranks_href = $rankd->ranks_within();
32             my $sor = $rankd->sum_of_ranks_within(); # or _between()
33             # or specify which vars to rank/sum-rank:
34             $sor = $rankd->sum_of_ranks_within(lab => [qw/placebo morphine/]);
35            
36             =head1 DESCRIPTION
37            
38             Performs ranking of nammed data, either by an independent, between-variable method (as in Kruskall-Wallis test), or a dependent, cross-variable method (as in Friedman test). Methods return hash of ranks and sum-of-ranks. Data must be pre-loaded (as per L or sent to the methods with the argument B as a hash-ref of array-refs. Output is tested ahead of installation to ensure it matches published data (Siegal, 1956).
39            
40             =head1 SUBROUTINES/METHODS
41            
42             =head2 new
43            
44             $rankd = Statistics::Data->new();
45            
46             Constructor, expecting/accepting no args. Inherited from L.
47            
48             =head2 load, add, unload
49            
50             $rankd->load('a' => [1, 4], 'b' => [3, 7]);
51            
52             The given data can now be used by any of the following methods. This is inherited from L, and all its other methods are available here via the class object.
53            
54             =cut
55            
56             =head2 ranks_between
57            
58             $ranks_href = $rankd->ranks_between(data => $values_href);
59             $ranks_href = $rankd->ranks_between(lab => [qw/fez bop/]); # two, say, of previously loaded data
60             $ranks_href = $rankd->ranks_between(); # all of any previously loaded data
61             ($ranks_href, $ties_aref, $nties) = $rankd->ranks_between(data => $values_href);
62            
63             Given a hash of arefs where the keys are names (groups, treatments) of the sample data (each as an aref), return a hash of the ranks of each value under each name, after pooling all the data and ranking them with a link to their name. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegal, 1956, p. 188ff). If called in list context, then a reference to an array of the number of variables having the same value per its rank, and a scalar for the number of ties, are also returned. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
64            
65             Used, e.g., by Kruskal-Wallis ANOVA, L ANOVA, Dwass-Steel comparison, and Worsley-cluster tests.
66            
67             =cut
68            
69             sub ranks_between {
70 0     0 1   my ( $self, %args ) = @_;
71 0 0         my $data =
72             $args{'data'}
73             ? delete $args{'data'}
74             : $self->get_hoa_by_lab_numonly_indep(%args);
75 0           croak 'Variable data must be numeric and not empty'
76             if not ref $data
77 0 0 0       or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
78 0           my ( $ranks_href, $xtied_aref, $nties, $ties_var ) = _ranks_between($data);
79             return
80 0 0         wantarray ? ( $ranks_href, $xtied_aref, $nties, $ties_var ) : $ranks_href;
81             }
82            
83             =head2 ranks_within
84            
85             $ranks_href = $rankd->ranks_within(data => $values_href); # pass data now
86             $ranks_href = $rankd->ranks_within(); # using all of any previously loaded data
87             ($ranks_href, $ties_href) = $rankd->ranks_within();
88            
89             Given a hash of arefs where the keys are variable names, and the values are their actual sample data (each as an aref), returns a hash of the ranks of each value under each name, calculated dependently (per the values across individual indices). So if 'a' => [1, 3, 7] and 'b' => [4, 5, 6], the ranks returned will be 'a' => [1, 2, 6] and 'b' => [3, 4, 5]. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegal, 1956, p. 188ff). If called in list context, then a reference to hash of aref is also returned, giving the number of variables having the same value at each index for a rank. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
90            
91             Used, e.g., by L and L tests.
92            
93             =cut
94            
95             sub ranks_within {
96 0     0 1   my ( $self, %args ) = @_;
97 0 0         my $data =
98             $args{'data'}
99             ? delete $args{'data'}
100             : $self->get_hoa_by_lab_numonly_across(%args);
101 0           croak 'Variable data must be numeric and not empty'
102             if not ref $data
103 0 0 0       or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
104 0           my ( $ranks_href, $xtied_href ) = _ranks_within($data);
105 0 0         return wantarray ? ( $ranks_href, $xtied_href ) : $ranks_href;
106             }
107            
108             =head2 sum_of_ranks_between
109            
110             $sor = $rankd->sum_of_ranks_between(); # all pre-loaded data
111             $sor = $rankd->sum_of_ranks_between(data => HASHREF); # or using these data
112             $sor = $rankd->sum_of_ranks_between(lab => STRING); # or for a particular load
113            
114             Returns the sum of ranks for (1) the entire dataset, either as given in argument B, or all pre-loaded variables; or for a particular pre-loaded dataset (variable) as given in the named argument B, where (assuming more than one variable), all values have been pooled and ordered by value per variable.
115            
116             =cut
117            
118             sub sum_of_ranks_between {
119 0     0 1   my ( $self, %args ) = @_;
120 0           my $lab = delete $args{'lab'};
121 0           my $ranks_href = $self->ranks_between(%args);
122 0 0         if ( hascontent($lab) ) {
123 0 0         croak 'Named variable does not exist'
124             if !exists $ranks_href->{$lab};
125 0           return sum0( @{ $ranks_href->{$lab} } );
  0            
126             }
127             else {
128             return {
129 0           map { $_ => sum0( @{ $ranks_href->{$_} } ) }
  0            
  0            
130 0           keys %{$ranks_href}
131             };
132             }
133             }
134            
135             =head2 sum_of_ranks_within
136            
137             $sor = $rankd->sum_of_ranks_within(); # all pre-loaded data
138             $sor = $rankd->sum_of_ranks_within(data => HASHREF); # or using these data
139             $sor = $rankd->sum_of_ranks_within(lab => STRING); # or for a particular load
140            
141             If called in array context, the sum-href is returned followed by the href of ties (useful for some statistic). Otherwise, it returns the href of summed ranks. The sum for a particular named variable can also be returned by the argument B.
142            
143             =cut
144            
145             sub sum_of_ranks_within {
146 0     0 1   my ( $self, %args ) = @_;
147 0           my $lab = delete $args{'lab'};
148 0           my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
149 0 0         if ( hascontent($lab) ) {
150 0 0         croak 'Named variable does not exist'
151             if !exists $ranks_href->{$lab};
152 0           return sum0( @{ $ranks_href->{$lab} } );
  0            
153             }
154             else {
155 0           my $sums =
156 0           { map { $_ => sum0( @{ $ranks_href->{$_} } ) } keys %{$ranks_href} };
  0            
  0            
157 0 0         return wantarray ? ( $sums, $xtied_href ) : $sums;
158             }
159             }
160            
161             =head2 sumsq_ranks_within
162            
163             Returns the sum of the squared sums-of-ranks calculated dependently (per the values across individual indices). Used in L. Expects a hashref of the variables, keyed by name. Called in list context, also returns a hash of the tied ranks.
164            
165             =cut
166            
167             sub sumsq_ranks_within {
168 0     0 1   my ( $self, %args ) = @_;
169 0           my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
170 0           my $sumsq = sum0( map { sum0( @{$_} )**2 } values %{$ranks_href} );
  0            
  0            
  0            
171 0 0         return wantarray ? ( $sumsq, $xtied_href ) : $sumsq;
172             }
173            
174             sub _ranks_between {
175 0     0     my $href_of_data = shift;
176 0           my $href_of_lab_by_values = _hash_of_aref_names_per_values($href_of_data);
177 0           my @sorted = sort { $a <=> $b } keys %{$href_of_lab_by_values};
  0            
  0            
178 0           my ( $nties, $ties_var, @xtied, %ranks ) = ( 1, 0 );
179 0           for my $i( 0 .. scalar @sorted - 1 )
180             { # loop thru all values in order
181 0           my @groups = @{ $href_of_lab_by_values->{ $sorted[$i] } };
  0            
182 0           my $nties_i = scalar @groups; # for values within all and any group
183 0 0         if ( $nties_i > 1 ) { # must be ties
184 0           $ties_var += ( $nties_i**3 - $nties_i );
185 0           for (@groups) {
186 0           push @{ $ranks{$_} }, mean( $nties .. $nties + $nties_i - 1 );
  0            
187             }
188 0           $nties += $nties_i;
189             }
190             else {
191 0           push @{ $ranks{ $groups[0] } }, $nties++;
  0            
192             }
193 0           push @xtied, $nties_i;
194             }
195 0           $nties--;
196 0           return ( \%ranks, \@xtied, $nties, $ties_var )
197             ; # rank hash-of-arefs, tie-correction, N, ari of tied group Ns
198             }
199            
200             sub _ranks_within {
201 0     0     my $href_of_data = shift;
202 0           my ( $old, $cur, $col, $ties, $av_rank, %ranks, %row_values ) = ( 0, 0 );
203 0           my %xtied = ();
204            
205             # - set the averaged ranks, going down each index:
206             # - list the values at this index in each data-array:
207             # - a value might occur in more than one var at this index, so store an array of the vars:
208            
209 0           for my $i ( 0 .. _min_n_of_hoa($href_of_data) - 1 ) {
210 0           for ( keys %{$href_of_data} ) {
  0            
211 0           push @{ $row_values{ ( @{ $href_of_data->{$_} } )[$i] } },
  0            
  0            
212             $_; # hash with values as keys and names as arefs
213             }
214            
215             # loop adapted from Boggs' "rank" function in Statistics-RankCorrelation:
216 0           for my $rval ( sort { $a <=> $b } keys %row_values ) {
  0            
217 0           $ties =
218 0           scalar @{ $row_values{$rval} }; # N vars of same value per source
219 0           $cur += $ties;
220 0 0         if ( $ties > 1 ) {
221 0           $av_rank = $old + ( $ties + 1 ) / 2; # average tied data
222 0           for ( @{ $row_values{$rval} } ) {
  0            
223 0           push @{ $ranks{$_} }, $av_rank;
  0            
224             }
225 0           push @{ $xtied{$i} }, $ties;
  0            
226             }
227             else {
228 0           push @{ $ranks{ $row_values{$rval}[0] } }, $cur;
  0            
229 0           push @{ $xtied{$i} }, $ties;
  0            
230             }
231 0           $old = $cur;
232             }
233 0           ( $old, $cur, %row_values ) = ( 0, 0 );
234             }
235 0           return ( \%ranks, \%xtied );
236             }
237            
238             # create a hash from a hash of named data where the keys are the values of the data linked to an aref of the names
239            
240             sub _hash_of_aref_names_per_values {
241 0     0     my $hoa = shift;
242 0           my %grouped = ();
243 0           for my $name ( keys %{$hoa} ) {
  0            
244 0           for ( @{ $hoa->{$name} } ) {
  0            
245 0           push @{ $grouped{$_} }, $name;
  0            
246             }
247             }
248 0           return \%grouped;
249             }
250            
251             sub _min_n_of_hoa {
252 0     0     my $data = shift;
253 0           return min( map { count( @{ $data->{$_} } ) } keys %{$data} );
  0            
  0            
  0            
254             }
255            
256             =head1 DEPENDENCIES
257            
258             L : used for summing.
259            
260             L : used as base.
261            
262             L : for basic decriptives.
263            
264             L : string content checking.
265            
266             =head1 DIAGNOSTICS
267            
268             =over 4
269            
270             =item Variable data must be numeric and not empty
271            
272             Ced ahead of calculating (sum of) ranks between or within and there is no hashref of data available.
273            
274             =item Named variable does not exist
275            
276             Ced by sum_of_ranks_between and sum_of_ranks_within if the value of the optional argument B does not exist as pre-loaded data; either in a call to L or L, or as B in the present method.
277            
278             =back
279            
280             =head1 REFERENCES
281            
282             Siegal, S. (1956). I. New York, NY, US: McGraw-Hill
283            
284             =head1 AUTHOR
285            
286             Roderick Garton, C<< >>
287            
288             =head1 BUGS AND LIMITATIONS
289            
290             Please report any bugs or feature requests to C, or through
291             the web interface at L. I will be notified, and then you'll
292             automatically be notified of progress on your bug as I make changes.
293            
294             =head1 SUPPORT
295            
296             You can find documentation for this module with the perldoc command.
297            
298             perldoc Statistics::Data::Rank
299            
300            
301             You can also look for information at:
302            
303             =over 4
304            
305             =item * RT: CPAN's request tracker (report bugs here)
306            
307             L
308            
309             =item * AnnoCPAN: Annotated CPAN documentation
310            
311             L
312            
313             =item * CPAN Ratings
314            
315             L
316            
317             =item * Search CPAN
318            
319             L
320            
321             =back
322            
323             =head1 ACKNOWLEDGEMENTS
324            
325             L : loop for dealing with ties in calculating "ranks within" adapted from Boggs' "rank" function.
326            
327             =head1 LICENSE AND COPYRIGHT
328            
329             Copyright 2015 Roderick Garton.
330            
331             This program is free software; you can redistribute it and/or modify it
332             under the terms of either: the GNU General Public License as published
333             by the Free Software Foundation; or the Artistic License.
334            
335             See L for more information.
336            
337            
338             =cut
339            
340             1; # End of Statistics::Data::Rank