File Coverage

blib/lib/Statistics/Descriptive/Weighted.pm
Criterion Covered Total %
statement 234 264 88.6
branch 70 88 79.5
condition 24 40 60.0
subroutine 27 36 75.0
pod n/a
total 355 428 82.9


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::Weighted;
2             $VERSION = '0.7';
3 1     1   42703 use Statistics::Descriptive;
  1         22300  
  1         42  
4 1     1   1327 use Data::Dumper;
  1         21359  
  1         513  
5              
6             package Statistics::Descriptive::Weighted::Sparse;
7 1     1   10 use strict;
  1         10  
  1         47  
8 1     1   5 use vars qw($AUTOLOAD @ISA %fields);
  1         2  
  1         85  
9              
10             @ISA = qw(Statistics::Descriptive::Sparse);
11              
12 1     1   5 use Carp qw(cluck confess);
  1         1  
  1         987  
13              
14             ##Define a new field to be used as method, to
15             ##augment the ones inherited
16             %fields = (
17             mean => 0,
18             weight => 0,
19             sum_squares => 0,
20             weight_homozyg => 0,
21             biased_variance => 0,
22             biased_standard_deviation => 0
23             );
24              
25             __PACKAGE__->_make_accessors( [ grep { $_ ne "weight" } keys(%fields) ] );
26             __PACKAGE__->_make_private_accessors(["weight"]);
27              
28             ##Have to override the base method to add new fields to the object
29             ##The proxy method from base class is still valid
30             sub new {
31 15     15   3322 my $proto = shift;
32 15   33     87 my $class = ref($proto) || $proto;
33 15         87 my $self = $class->SUPER::new(); ##Create my self re SUPER
34 15         500 @{ $self->{'_permitted'} } {keys %fields} = values %fields;
  15         65  
35 15         52 @{ $self } {keys %fields} = values %fields;
  15         43  
36 15         47 bless ($self, $class); #Re-anneal the object
37 15         55 return $self;
38             }
39              
40             sub add_data {
41 22     22   43 my $self = shift; ##Myself
42 22         35 my $oldmean;
43             my $oldweight;
44 0         0 my ($min,$max);
45 0         0 my $aref;
46              
47 22 100 66     182 if ( (not ref $_[0] eq 'ARRAY') || (exists $_[1] and (not (ref $_[1] eq 'ARRAY') || @{$_[0]} != @{$_[1]} ) ) ) {
      66        
      66        
48 4         599 cluck "WARNING: Expected input are two references to two arrays of equal length; first data, then positive weights. Second array is optional.\n";
49 4         468 return undef;
50             }
51              
52 18         36 my ($datum,$weight) = @_;
53              
54             ##Calculate new mean, pseudo-variance, min and max;
55             ##The on-line weighted incremental algorithm for variance is based on West 1979 from Wikipedia
56             ##D. H. D. West (1979). Communications of the ACM, 22, 9, 532-535: Updating Mean and Variance Estimates: An Improved Method
57              
58             ## NEW in Version 0.4:
59             ## I calculate a sample weighted variance based on normalized weights rather than the sample size
60             ## correction factor is: 1 / (1 - sum [w_i / (sum w_i) ]^2)
61              
62             ## call H = sum [w_i / (sum w_i) ]^2. An online update eq for H is H_new = (sum.w_old^2 * H_old) + weight^2) / sum.w^2
63              
64             ## correction factor is then 1 / (1 - H_new)
65              
66 18         22 my $weighterror;
67 18         55 for (0..$#$datum ) {
68 247 100       539 if (not defined $$weight[$_]) {
69 4         8 $$weight[$_] = 1;
70             }
71 247 100       530 if ($$weight[$_] <= 0) {
72 1         10 $weighterror = 1;
73 1         3 next;
74             }
75 246         355 $oldmean = $self->{mean};
76 246         285 $oldweight = $self->{weight};
77 246         343 $self->{weight} += $$weight[$_];
78 246         653 $self->{weight_homozyg} = ((($oldweight ** 2 * $self->{weight_homozyg}) + $$weight[$_] ** 2) / ( $self->{weight} ** 2 ));
79 246         295 $self->{count}++;
80 246         400 $self->{sum} += ($$weight[$_] * $$datum[$_]);
81 246         628 $self->{mean} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean));
82 246         600 $self->{sum_squares} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean) ** 2) * $oldweight;
83 246 100 100     1108 if (not defined $self->{max} or $$datum[$_] > $self->{max}) {
84 243         368 $self->{max} = $$datum[$_];
85             }
86 246 100 100     1042 if (not defined $self->{min} or $$datum[$_] < $self->{min}) {
87 15         32 $self->{min} = $$datum[$_];
88             }
89             }
90 18 100       198 cluck "WARNING: One or more data with nonpositive weights were skipped.\n" if ($weighterror);
91 18         179 $self->{sample_range} = $self->{max} - $self->{min};
92              
93 18 100       48 if ($self->{count} > 1) {
94 14         41 $self->{variance} = ($self->{sum_squares} / ((1 - $self->{weight_homozyg}) * $self->{weight}));
95 14         43 $self->{standard_deviation} = sqrt( $self->{variance});
96 14         30 $self->{biased_variance} = ($self->{sum_squares} / $self->{weight});
97 14         28 $self->{biased_standard_deviation} = sqrt( $self->{biased_variance});
98             }
99 18         67 return 1;
100             }
101              
102             sub weight {
103 12     12   8374 my $self = shift;
104 12 100       37 if (@_ > 0) {
105 2         399 cluck "WARNING: Sparse statistics object expects zero arguments to weight function, returns sum of weights.";
106             }
107 12         228 return $self->_weight();
108             }
109              
110             ## OVERRIDES FOR UNSUPPORTED FUNCTIONS
111              
112             sub mindex{
113 0     0   0 confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
114             }
115              
116             sub maxdex{
117 0     0   0 confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
118             }
119              
120             1;
121              
122             package Statistics::Descriptive::Weighted::Full;
123              
124 1     1   6 use Carp qw(cluck confess);
  1         3  
  1         62  
125 1     1   1218 use Tree::Treap;
  1         3  
  1         36  
126 1     1   10 use strict;
  1         1  
  1         45  
127 1     1   5 use vars qw(@ISA %fields);
  1         3  
  1         3590  
128              
129             @ISA = qw(Statistics::Descriptive::Weighted::Sparse);
130              
131             ##Create a list of fields not to remove when data is updated
132             %fields = (
133             _permitted => undef, ##Place holder for the inherited key hash
134             data => undef, ##keys from variate values to a hashref with keys weight, cdf, tail-prob
135             did_cdf => undef, ##flag to indicate whether CDF/quantile fun has been computed or not
136             quantile => undef, ##"hash" for quantile function
137             percentile => undef, ##"hash" for percentile function
138             maxweight => 0,
139             mode => undef,
140             order => 1,
141             _reserved => undef, ##Place holder for this lookup hash
142             );
143              
144             ##Have to override the base method to add the data to the object
145             ##The proxy method from above is still valid
146             sub new {
147 8     8   3455 my $proto = shift;
148 8   33     43 my $class = ref($proto) || $proto;
149 8         39 my $self = $class->SUPER::new(); ##Create my self re SUPER
150 8         41 $self->{data} = new Tree::Treap("num"); ## inserts data by numeric comparison
151 8         16 $self->{did_cdf} = 0;
152 8         25 $self->{maxweight} = 0;
153 8         27 $self->{quantile} = new Tree::Treap("num");
154 8         31 $self->{percentile} = new Tree::Treap("num");
155 8         18 $self->{order} = 1;
156 8         17 $self->{'_reserved'} = \%fields;
157 8         21 bless ($self, $class);
158 8         41 return $self;
159             }
160              
161             ## The treap gives relatively fast search and good performance on possibly sorted data
162             ## The choice is motivated by heavy intended use for Empirical Distribution Function
163             ## A lot of work is done at insertion for faster computation on search
164             ## THE ACTUAL DATA INSERTION IS DONE AT FUNCTION _addweight
165             ## The data structure loses information. Like a hash keys appear only once.
166             ## The value of a key is its sum of weight for that key, and the cumulative weight
167             sub add_data {
168 9     9   20 my $self = shift;
169 9         11 my $key;
170              
171 9 50 33     95 if ( (not ref $_[0] eq 'ARRAY') || (exists $_[1] and (not (ref $_[1] eq 'ARRAY') || @{$_[0]} != @{$_[1]} ) ) ) {
      66        
      33        
172 0         0 cluck "WARNING: Expected input are two references to two arrays of equal length; first data, then positive weights. Second array is optional.\n";
173 0         0 return undef;
174             }
175              
176 9         152 my ($datum,$weight) = @_;
177 9         14 my $filterdatum = [];
178 9         26 my $filterweight = [];
179 9         12 my $weighterror;
180             my $newweight;
181 9         30 for (0..$#$datum) {
182 222 100       517 if (not defined $$weight[$_]) {
183 5         14 $$weight[$_] = 1;
184             }
185 222 100       619 if ($$weight[$_] > 0) {
186 221         422 push @$filterdatum,$$datum[$_];
187 221         1240 push @$filterweight,$$weight[$_];
188 221         558 $newweight = $self->_addweight($$datum[$_], $$weight[$_]);
189 221 100       614 if ($newweight > $self->{maxweight}) {
190 9         14 $self->{maxweight} = $newweight;
191 9         61 $self->{mode} = $$datum[$_];
192             }
193             }
194             else {
195 1         4 $weighterror = 1;
196             }
197             }
198 9 100       264 cluck "WARNING: One or more data with nonpositive weights were skipped.\n" if ($weighterror);
199 9         265 $self->SUPER::add_data($filterdatum,$filterweight); ##Perform base statistics on the data
200             ##Clear the did_cdf flag
201 9         15 $self->{did_cdf} = 0;
202             ##Need to delete all cached keys
203 9         16 foreach $key (keys %{ $self }) { # Check each key in the object
  9         78  
204             # If it's a reserved key for this class, keep it
205 223 100       460 next if exists $self->{'_reserved'}->{$key};
206             # If it comes from the base class, keep it
207 143 100       342 next if exists $self->{'_permitted'}->{$key};
208 8         21 delete $self->{$key}; # Delete the out of date cached key
209             }
210 9         131 return 1;
211             }
212              
213             sub count {
214 232     232   2223 my $self = shift;
215 232 100       448 if (@_ == 1) { ##Inquire
    50          
216 221         699 my $val = $self->{data}->get_val($_[0]);
217 221 100       1099 return (defined $val ? ${ $val }{'count'} : $val);
  1         5  
218             }
219             elsif (@_ == 0) { ##Inquire
220 11         39 return $self->{count};
221             }
222             else {
223 0         0 cluck "WARNING: Only 1 or fewer arguments expected.";
224             }
225 0         0 return 1;
226             }
227              
228             sub weight {
229 442     442   1704 my $self = shift;
230 442 100       901 if (@_ == 1) { ##Inquire
    50          
231 440         1438 my $val = $self->{data}->get_val($_[0]);
232 440 100       1462 return (defined $val ? ${ $val }{'weight'} : $val);
  219         599  
233             }
234             elsif (@_ == 0) { ##Inquire
235 2         11 return $self->{weight};
236             }
237             else {
238 0         0 cluck "WARNING: Only 1 or fewer arguments expected.";
239             }
240 0         0 return 1;
241             }
242              
243             sub _addweight {
244 221     221   251 my $self = shift;
245 221   100     513 my $oldweight = ($self->weight($_[0]) || 0);
246 221         310 my $newweight = $_[1] + $oldweight;
247 221         761 my $value = $self->{data}->get_val($_[0]);
248 221 100       541 my $weights = ($value ? $$value{'weights'} : [] );
249 221         491 push @$weights, $_[1];
250 221 100       469 my $orders = ($value ? $$value{'order'} : [] );
251 221         461 push @$orders, $self->{order}++;
252 221   100     473 my $newcount = ($self->count($_[0]) || 0) + 1;
253 221 50       411 if (@_ == 2) { ##Assign
254 221         1196 my $values = {'weight' => $newweight, 'weights' => $weights, 'count' => $newcount, 'order' => $orders, 'cdf' => undef, 'rt_tail_prob' => undef, 'percentile' => undef};
255 221         812 $self->{data}->insert($_[0],$values);
256             }
257             else {
258 0         0 cluck "WARNING: Only two arguments (key, addend) expected.";
259             }
260 221         448 return $newweight;
261             }
262              
263             sub _do_cdf {
264 6     6   10 my $self = shift;
265 6         10 my $cumweight = 0;
266 6         29 foreach my $key ($self->{data}->keys()){
267 217         650 my $value = $self->{data}->get_val($key);
268 217         447 my $keyweight = $self->weight($key);
269              
270 217         272 my $oldcumweight = $cumweight;
271 217         262 $cumweight += $keyweight;
272              
273 217         375 my $propcumweight = $cumweight / $self->{weight};
274 217         359 my $right_tail_prob = (1 - ($oldcumweight / $self->{weight}));
275 217         350 my $percentile = ((100 / $self->{weight}) * ($cumweight - ($keyweight / 2)));
276              
277 217         342 $$value{'cdf'} = $propcumweight;
278 217         303 $$value{'rt_tail_prob'} = $right_tail_prob;
279 217         4108 $$value{'percentile'} = $percentile;
280              
281 217         825 $self->{data}->insert($key,$value);
282 217         667 $self->{quantile}->insert($propcumweight,$key);
283 217         633 $self->{percentile}->insert($percentile,$key);
284             }
285 6         120 $self->{did_cdf} = 1;
286 6         13 return 1;
287             }
288              
289             sub quantile {
290 11     11   295 my $self = shift;
291 11 100       36 $self->_do_cdf() unless $self->{did_cdf};
292 11 50       27 if (@_ == 1) { ##Inquire
293 11         13 my $proportion = shift;
294 11 50 33     55 cluck "WARNING: Expects an argument between 0 and 1 inclusive." if ($proportion < 0 or $proportion > 1);
295 11         40 my @keys = $self->{quantile}->range_keys($proportion, undef);
296 11         15 my $key = $keys[0]; ## GET THE SMALLEST QUANTILE g.e. $proportion
297 11         33 return $self->{quantile}->get_val($keys[0]);
298             }
299             else {
300 0         0 cluck "WARNING: Exactly 1 argument expected.";
301 0         0 return undef;
302             }
303             }
304              
305             sub percentile {
306 4     4   7 my $self = shift;
307 4 100       16 $self->_do_cdf() unless $self->{did_cdf};
308 4 50       12 if (@_ != 1) {
309 0         0 cluck "WARNING: Exactly 1 argument expected.";
310             }
311 4         5 my $percent = shift;
312 4 50 33     20 if ($percent < 0 or $percent > 100) {
313 0         0 cluck "WARNING: Expects an argument between 0 and 100 inclusive.";
314             }
315 4         6 my $percentile;
316 4 50       19 if ($percent < $self->{percentile}->minimum()) {
    50          
317 0         0 $percentile = $self->{data}->minimum();
318             } elsif ($percent > $self->{percentile}->maximum()) {
319 0         0 $percentile = $self->{data}->maximum();
320             } else {
321 4         15 my @lekeys = $self->{percentile}->range_keys(undef,$percent);
322 4         8 my $lekey = $lekeys[-1];
323 4         18 my @gekeys = $self->{percentile}->range_keys($percent, undef);
324 4         7 my $gekey = $gekeys[0];
325 4         16 my $leval = $self->{percentile}->get_val($lekey);
326 4         6 $percentile = $leval;
327 4 100       13 if ($gekey != $lekey) {
328 3         11 my $geval = $self->{percentile}->get_val($gekey);
329 3         15 $percentile += ($percent - $lekey) / ($gekey - $lekey) * ($geval - $leval);
330             }
331             }
332 4         22 return $percentile;
333             }
334              
335              
336             sub median {
337 2     2   4 my $self = shift;
338              
339             ##Cached?
340 2 50       19 return $self->{median} if defined $self->{median};
341 2         7 return $self->{median} = $self->percentile(50);
342             }
343              
344             sub mode {
345 1     1   306 my $self = shift;
346 1         6 return $self->{mode};
347             }
348              
349             sub cdf {
350 11     11   56 my $self = shift;
351 11 50       34 $self->_do_cdf() unless $self->{did_cdf};
352 11 50       29 if (@_ == 1) { ##Inquire
353 11         16 my $value = shift;
354 11 100       41 return 0 if ($self->{data}->minimum() > $value);
355 10         38 my @keys = $self->{data}->range_keys(undef, $value);
356 10         37 my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
357 10         17 return ${ $self->{data}->get_val($key) }{'cdf'};
  10         45  
358             }
359             else {
360 0         0 cluck "WARNING: Exactly 1 argument expected.";
361 0         0 return undef;
362             }
363             }
364              
365             sub survival {
366 4     4   7 my $self = shift;
367 4 50       12 $self->_do_cdf() unless $self->{did_cdf};
368 4 50       10 if (@_ == 1) { ##Inquire
369 4         6 my $value = shift;
370 4 100       13 return 1 if ($self->{data}->minimum() > $value);
371 3         12 my @keys = $self->{data}->range_keys(undef, $value);
372 3         5 my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
373 3         4 return 1 - (${ $self->{data}->get_val($key) }{'cdf'});
  3         9  
374             }
375             else {
376 0         0 cluck "WARNING: Only 1 argument expected.";
377 0         0 return undef;
378             }
379             }
380              
381             sub rtp {
382 6     6   9 my $self = shift;
383 6 50       18 $self->_do_cdf() unless $self->{did_cdf};
384 6 50       17 if (@_ == 1) { ##Inquire
385 6         7 my $value = shift;
386 6 100       22 return 0 if ($self->{data}->maximum() < $value);
387 3         13 my @keys = $self->{data}->range_keys($value, undef);
388 3         6 my $key = $keys[0]; ## GET THE SMALLEST OBSERVED VALUE g.e. $value
389 3         5 return ${ $self->{data}->get_val($key) }{'rt_tail_prob'};
  3         9  
390             }
391             else {
392 0         0 cluck "WARNING: Only 1 argument expected.";
393 0         0 return undef;
394             }
395             }
396              
397             sub get_data {
398 1     1   2 my $self = shift;
399 1 50       7 $self->_do_cdf() unless $self->{did_cdf};
400 1         6 my ($uniqkeys, $sumweights, $keys, $weights, $counts, $cdfs, $rtps, $percentiles, $order) = ([],[],[],[],[],[],[],[],[]);
401 1         6 my $key = $self->{'data'}->minimum();
402 1         5 while (defined $key){
403 5         15 my $value = $self->{data}->get_val($key);
404 5         10 push @$uniqkeys, $key;
405 5         15 push @$sumweights, $$value{'weight'};
406 5         32 foreach my $weight (@{ $$value{'weights'} } ) {
  5         12  
407 6         11 push @$keys, $key;
408 6         12 push @$weights, $weight;
409             }
410 5         8 push @$order, @{ $$value{'order'}};
  5         10  
411 5         9 push @$counts, $$value{'count'};
412 5         8 push @$cdfs, $$value{'cdf'};
413 5         6 push @$rtps, $$value{'rt_tail_prob'};
414 5         22 push @$percentiles, $$value{'percentile'};
415 5         16 $key = $self->{'data'}->successor($key);
416             }
417 1         26 return {'uniqvars' => $uniqkeys, 'sumweights' => $sumweights, 'counts' => $counts, 'cdfs' => $cdfs, 'rtps' => $rtps, 'vars' => $keys, 'weights' => $weights, 'percentiles' => $percentiles, 'order' => $order};
418             }
419              
420             sub print {
421 1     1   294 my $self = shift;
422 1         5 print Data::Dumper->Dump([$self->get_data()]);
423             }
424              
425             ## OVERRIDES FOR UNSUPPORTED FUNCTIONS
426              
427             sub sort_data{
428 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
429             }
430              
431             sub presorted{
432 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
433             }
434              
435             sub harmonic_mean{
436 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
437             }
438              
439             sub geometric_mean{
440 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
441             }
442              
443             sub trimmed_mean{
444 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
445             }
446              
447             sub frequency_distribution{
448 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
449             }
450              
451             sub least_squares_fit{
452 0     0     confess "ERROR: Statistics::Descriptive::Weighted does not support this function.";
453             }
454              
455              
456             1;
457              
458             package Statistics::Descriptive;
459              
460             ##All modules return true.
461             1;
462              
463             __END__