File Coverage

blib/lib/Statistics/Data/Rank.pm
Criterion Covered Total %
statement 24 130 18.4
branch 0 28 0.0
condition 0 6 0.0
subroutine 8 17 47.0
pod 5 5 100.0
total 37 186 19.8


line stmt bran cond sub pod time code
1             package Statistics::Data::Rank;
2            
3 1     1   15996 use 5.006;
  1         3  
  1         31  
4 1     1   4 use strict;
  1         1  
  1         29  
5 1     1   4 use warnings FATAL => 'all';
  1         7  
  1         37  
6 1     1   7 use base qw(Statistics::Data);
  1         1  
  1         586  
7 1     1   21594 use Carp qw(croak);
  1         1  
  1         43  
8 1     1   4 use List::AllUtils qw(sum0);
  1         1  
  1         36  
9 1     1   488 use Statistics::Lite qw(count mean min);
  1         1017  
  1         65  
10 1     1   5 use String::Util qw(hascontent);
  1         1  
  1         923  
11             $Statistics::Data::Rank::VERSION = '0.02';
12            
13             =head1 NAME
14            
15             Statistics::Data::Rank - Utilities for ranking data
16            
17             =head1 VERSION
18            
19             This is documentation for Version 0.02, released February 2015.
20            
21             =head1 SYNOPSIS
22            
23             use Statistics::Data::Rank;
24             my $rankd = Statistics::Data::Rank->new();
25             my %vars = ('nodrug' => [174, 224, 260], 'placebo' => [261, 213, 231], 'morphine' => [199, 143, 113]);
26             my $ranks_href = $rankd->ranks_between(data => \%vars); # pre-load data:
27             $rankd->load(\%vars);
28             $ranks_href = $rankd->ranks_within();
29             my $sor = $rankd->sum_of_ranks_within(); # or _between()
30             # or specify which vars to rank/sum-rank:
31             $sor = $rankd->sum_of_ranks_within(lab => [qw/placebo morphine/]);
32            
33             =head1 DESCRIPTION
34            
35             Performs ranking of named data, either by an independent, between-variable method (as in the Kruskal-Wallis test), or a dependent, cross-variable method (as in the Friedman test). Methods return a hash of ranks and sum-of-ranks. Data must be pre-loaded (as per L<Statistics::Data>) or sent to the methods with the argument B<data> as a hash-ref of array-refs. Output is tested ahead of installation to ensure it matches published data (Siegel, 1956).
36            
37             =head1 SUBROUTINES/METHODS
38            
39             =head2 new
40            
41             $rankd = Statistics::Data::Rank->new();
42            
43             Constructor, expecting/accepting no args. Inherited from L<Statistics::Data>.
44            
45             =head2 load, add, unload
46            
47             $rankd->load('a' => [1, 4], 'b' => [3, 7]);
48            
49             The given data can now be used by any of the following methods. This is inherited from L<Statistics::Data>, and all its other methods are available here via the class object. Only passing of data as a hash of arrays (HOA) is supported for now. Alternatively, give each of the following methods the HOA for the optional named argument B<data>.
50            
51             =head2 ranks_between
52            
53             $ranks_href = $rankd->ranks_between(data => $values_href);
54             $ranks_href = $rankd->ranks_between(lab => [qw/fez bop/]); # two, say, of previously loaded data
55             $ranks_href = $rankd->ranks_between(); # all of any previously loaded data
56             ($ranks_href, $ties_aref, $nties) = $rankd->ranks_between(data => $values_href);
57            
58             Given a hash of arefs where the keys are names (groups, treatments) of the sample data (each as an aref), return a hash of the ranks of each value under each name, after pooling all the data and ranking them with a link to their name. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegel, 1956, p. 188ff). If called in list context, then a reference to an array of the number of observations sharing each distinct value (in ascending order of value), a scalar for the total number of values ranked, and a scalar tie-correction term (the sum of t**3 - t over each group of t tied values) are also returned, in that order. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
59            
60             Used, e.g., by Kruskal-Wallis ANOVA, L ANOVA, Dwass-Steel comparison, and Worsley-cluster tests.
61            
62             =cut
63            
64             sub ranks_between {
65 0     0 1   my ( $self, %args ) = @_;
66 0 0         my $data =
67             $args{'data'}
68             ? delete $args{'data'}
69             : $self->get_hoa_by_lab_numonly_indep(%args);
70 0           croak 'Variable data must be numeric and not empty'
71             if not ref $data
72 0 0 0       or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
73 0           my ( $ranks_href, $xtied_aref, $nties, $ties_var ) = _ranks_between($data);
74             return
75 0 0         wantarray ? ( $ranks_href, $xtied_aref, $nties, $ties_var ) : $ranks_href;
76             }
77            
78             =head2 ranks_within
79            
80             $ranks_href = $rankd->ranks_within(data => $values_href); # pass data now
81             $ranks_href = $rankd->ranks_within(); # using all of any previously loaded data
82             ($ranks_href, $ties_href) = $rankd->ranks_within();
83            
84             Given a hash of arefs where the keys are variable names, and the values are their actual sample data (each as an aref), returns a hash of the ranks of each value under each name, calculated dependently (per the values across individual indices). So if 'a' => [1, 3, 7] and 'b' => [4, 5, 6], the values are ranked against each other at each index, and the ranks returned will be 'a' => [1, 1, 2] and 'b' => [2, 2, 1]. Ties are resolved by giving each tied score the mean of the ranks for which it is tied (see Siegel, 1956, p. 188ff). If called in list context, then a reference to a hash of arefs is also returned, giving the number of variables having the same value at each index for a rank. Before ranking, data are checked for numeracy, and any non-numeric or empty values are culled.
85            
86             Used, e.g., by L and L tests.
87            
88             =cut
89            
90             sub ranks_within {
91 0     0 1   my ( $self, %args ) = @_;
92 0 0         my $data =
93             $args{'data'}
94             ? delete $args{'data'}
95             : $self->get_hoa_by_lab_numonly_across(%args);
96 0           croak 'Variable data must be numeric and not empty'
97             if not ref $data
98 0 0 0       or not scalar keys %{$data}; # $self->all_numeric( values %{$data} );
99 0           my ( $ranks_href, $xtied_href ) = _ranks_within($data);
100 0 0         return wantarray ? ( $ranks_href, $xtied_href ) : $ranks_href;
101             }
102            
103             =head2 sum_of_ranks_between
104            
105             $sor = $rankd->sum_of_ranks_between(); # all pre-loaded data
106             $sor = $rankd->sum_of_ranks_between(data => HASHREF); # or using these data
107             $sor = $rankd->sum_of_ranks_between(lab => STRING); # or for a particular load
108            
109             Returns the sum of ranks for (1) the entire dataset, either as given in argument B<data>, or all pre-loaded variables; or (2) for a particular pre-loaded dataset (variable) as given in the named argument B<lab>, where (assuming more than one variable), all values have been pooled and ordered by value per variable.
110            
111             =cut
112            
113             sub sum_of_ranks_between {
114 0     0 1   my ( $self, %args ) = @_;
115 0           my $lab = delete $args{'lab'};
116 0           my $ranks_href = $self->ranks_between(%args);
117 0 0         if ( hascontent($lab) ) {
118 0 0         croak 'Named variable does not exist'
119             if !exists $ranks_href->{$lab};
120 0           return sum0( @{ $ranks_href->{$lab} } );
  0            
121             }
122             else {
123             return {
124 0           map { $_ => sum0( @{ $ranks_href->{$_} } ) }
  0            
  0            
125 0           keys %{$ranks_href}
126             };
127             }
128             }
129            
130             =head2 sum_of_ranks_within
131            
132             $sor = $rankd->sum_of_ranks_within(); # all pre-loaded data
133             $sor = $rankd->sum_of_ranks_within(data => HASHREF); # or using these data
134             $sor = $rankd->sum_of_ranks_within(lab => STRING); # or for a particular load
135            
136             If called in array context, the sum-href is returned followed by the href of ties (useful for some statistics). Otherwise, it returns the href of summed ranks. The sum for a particular named variable can also be returned by the argument B<lab>.
137            
138             =cut
139            
140             sub sum_of_ranks_within {
141 0     0 1   my ( $self, %args ) = @_;
142 0           my $lab = delete $args{'lab'};
143 0           my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
144 0 0         if ( hascontent($lab) ) {
145 0 0         croak 'Named variable does not exist'
146             if !exists $ranks_href->{$lab};
147 0           return sum0( @{ $ranks_href->{$lab} } );
  0            
148             }
149             else {
150 0           my $sums =
151 0           { map { $_ => sum0( @{ $ranks_href->{$_} } ) } keys %{$ranks_href} };
  0            
  0            
152 0 0         return wantarray ? ( $sums, $xtied_href ) : $sums;
153             }
154             }
155            
156             =head2 sumsq_ranks_within
157            
158             Returns the sum of the squared sums-of-ranks calculated dependently (per the values across individual indices). Used in L. Expects a hashref of the variables, keyed by name. Called in list context, also returns a hash of the tied ranks.
159            
160             =cut
161            
162             sub sumsq_ranks_within {
163 0     0 1   my ( $self, %args ) = @_;
164 0           my ( $ranks_href, $xtied_href ) = $self->ranks_within(%args);
165 0           my $sumsq = sum0( map { sum0( @{$_} )**2 } values %{$ranks_href} );
  0            
  0            
  0            
166 0 0         return wantarray ? ( $sumsq, $xtied_href ) : $sumsq;
167             }
168            
169             sub _ranks_between {
170 0     0     my $href_of_data = shift;
171 0           my $href_of_lab_by_values = _hash_of_aref_names_per_values($href_of_data);
172 0           my @sorted = sort { $a <=> $b } keys %{$href_of_lab_by_values};
  0            
  0            
173 0           my ( $nties, $ties_var, @xtied, %ranks ) = ( 1, 0 );
174 0           for my $i ( 0 .. scalar @sorted - 1 ) { # loop thru all values in order
175 0           my @groups = @{ $href_of_lab_by_values->{ $sorted[$i] } };
  0            
176 0           my $nties_i = scalar @groups; # for values within all and any group
177 0 0         if ( $nties_i > 1 ) { # must be ties
178 0           $ties_var += ( $nties_i**3 - $nties_i );
179 0           for (@groups) {
180 0           push @{ $ranks{$_} }, mean( $nties .. $nties + $nties_i - 1 );
  0            
181             }
182 0           $nties += $nties_i;
183             }
184             else {
185 0           push @{ $ranks{ $groups[0] } }, $nties++;
  0            
186             }
187 0           push @xtied, $nties_i;
188             }
189 0           $nties--;
190 0           return ( \%ranks, \@xtied, $nties, $ties_var )
191             ; # rank hash-of-arefs, aref of tied group Ns, N, tie-correction
192             }
193            
194             sub _ranks_within {
195 0     0     my $href_of_data = shift;
196 0           my ( $old, $cur, $col, $ties, $av_rank, %ranks, %row_values ) = ( 0, 0 );
197 0           my %xtied = ();
198            
199             # - set the averaged ranks, going down each index:
200             # - list the values at this index in each data-array:
201             # - a value might occur in more than one var at this index, so store an array of the vars:
202            
203 0           for my $i ( 0 .. _min_n_of_hoa($href_of_data) - 1 ) {
204 0           for ( keys %{$href_of_data} ) {
  0            
205 0           push @{ $row_values{ ( @{ $href_of_data->{$_} } )[$i] } },
  0            
  0            
206             $_; # hash with values as keys and names as arefs
207             }
208            
209             # loop adapted from Boggs' "rank" function in Statistics-RankCorrelation:
210 0           for my $rval ( sort { $a <=> $b } keys %row_values ) {
  0            
211 0           $ties =
212 0           scalar @{ $row_values{$rval} }; # N vars of same value per source
213 0           $cur += $ties;
214 0 0         if ( $ties > 1 ) {
215 0           $av_rank = $old + ( $ties + 1 ) / 2; # average tied data
216 0           for ( @{ $row_values{$rval} } ) {
  0            
217 0           push @{ $ranks{$_} }, $av_rank;
  0            
218             }
219 0           push @{ $xtied{$i} }, $ties;
  0            
220             }
221             else {
222 0           push @{ $ranks{ $row_values{$rval}[0] } }, $cur;
  0            
223 0           push @{ $xtied{$i} }, $ties;
  0            
224             }
225 0           $old = $cur;
226             }
227 0           ( $old, $cur, %row_values ) = ( 0, 0 );
228             }
229 0           return ( \%ranks, \%xtied );
230             }
231            
232             # create a hash from a hash of named data where the keys are the values of the data linked to an aref of the names
233            
234             sub _hash_of_aref_names_per_values {
235 0     0     my $hoa = shift;
236 0           my %grouped = ();
237 0           for my $name ( keys %{$hoa} ) {
  0            
238 0           for ( @{ $hoa->{$name} } ) {
  0            
239 0           push @{ $grouped{$_} }, $name;
  0            
240             }
241             }
242 0           return \%grouped;
243             }
244            
245             sub _min_n_of_hoa {
246 0     0     my $data = shift;
247 0           return min( map { count( @{ $data->{$_} } ) } keys %{$data} );
  0            
  0            
  0            
248             }
249            
250             =head1 DEPENDENCIES
251            
252             L<List::AllUtils> : used for summing.
253            
254             L<Statistics::Data> : used as base.
255            
256             L<Statistics::Lite> : for basic descriptives.
257            
258             L<String::Util> : string content checking.
259            
260             =head1 DIAGNOSTICS
261            
262             =over 4
263            
264             =item Variable data must be numeric and not empty
265            
266             C<croak>ed ahead of calculating (sum of) ranks between or within when there is no hashref of data available.
267            
268             =item Named variable does not exist
269            
270             C<croak>ed by sum_of_ranks_between and sum_of_ranks_within if the value of the optional argument B<lab> does not exist as pre-loaded data; either in a call to L<load|Statistics::Data/load> or L<add|Statistics::Data/add>, or as B<data> in the present method.
271            
272             =back
273            
274             =head1 REFERENCES
275            
276             Siegel, S. (1956). I<Nonparametric statistics for the behavioral sciences>. New York, NY, US: McGraw-Hill.
277            
278             =head1 AUTHOR
279            
280             Roderick Garton, C<< >>
281            
282             =head1 BUGS AND LIMITATIONS
283            
284             Please report any bugs or feature requests to C, or through
285             the web interface at L. I will be notified, and then you'll
286             automatically be notified of progress on your bug as I make changes.
287            
288             =head1 SUPPORT
289            
290             You can find documentation for this module with the perldoc command.
291            
292             perldoc Statistics::Data::Rank
293            
294            
295             You can also look for information at:
296            
297             =over 4
298            
299             =item * RT: CPAN's request tracker (report bugs here)
300            
301             L
302            
303             =item * AnnoCPAN: Annotated CPAN documentation
304            
305             L
306            
307             =item * CPAN Ratings
308            
309             L
310            
311             =item * Search CPAN
312            
313             L
314            
315             =back
316            
317             =head1 ACKNOWLEDGEMENTS
318            
319             L<Statistics::RankCorrelation> : loop for dealing with ties in calculating "ranks within" adapted from Boggs' "rank" function.
320            
321             =head1 LICENSE AND COPYRIGHT
322            
323             Copyright 2015 Roderick Garton.
324            
325             This program is free software; you can redistribute it and/or modify it
326             under the terms of either: the GNU General Public License as published
327             by the Free Software Foundation; or the Artistic License.
328            
329             See L for more information.
330            
331            
332             =cut
333            
334             1; # End of Statistics::Data::Rank