File Coverage

blib/lib/Statistics/Descriptive/Weighted.pm
Criterion Covered Total %
statement 65 255 25.4
branch 7 80 8.7
condition 8 31 25.8
subroutine 11 36 30.5
pod n/a
total 91 402 22.6


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::Weighted;
2             $VERSION = '0.8';
3 1     1   14215 use Statistics::Descriptive;
  1         20379  
  1         27  
4 1     1   795 use Data::Dumper;
  1         10045  
  1         75  
5              
6             package Statistics::Descriptive::Weighted::Sparse;
7 1     1   6 use strict;
  1         5  
  1         44  
8 1     1   3 use vars qw($AUTOLOAD @ISA %fields);
  1         1  
  1         80  
9              
10             @ISA = qw(Statistics::Descriptive::Sparse);
11              
12 1     1   6 use Carp qw(cluck confess);
  1         2  
  1         546  
13              
14             ##Define a new field to be used as method, to
15             ##augment the ones inherited
16             %fields = (
17             weight => 0,
18             sum_squares => 0,
19             weight_homozyg => 0,
20             biased_variance => 0,
21             biased_standard_deviation => 0,
22             );
23              
24             ##Have to override the base method to add new fields to the object
25             ##The proxy method from base class is still valid
26             sub new {
27 1     1   13 my $proto = shift;
28 1   33     6 my $class = ref($proto) || $proto;
29 1         10 my $self = $class->SUPER::new(); ##Create my self re SUPER
30 1         35 @{ $self->{'_permitted'} } {keys %fields} = values %fields;
  1         4  
31 1         4 @{ $self } {keys %fields} = values %fields;
  1         3  
32 1         4 bless ($self, $class); #Re-anneal the object
33 1         3 return $self;
34             }
35              
36             sub add_data {
37 1     1   7 my $self = shift; ##Myself
38 1         1 my $oldmean;
39             my $oldweight;
40 0         0 my ($min,$max);
41 0         0 my $aref;
42            
43 1 50 33     11 unless ((@_ == 2 and ref $_[0] eq 'ARRAY' and ref $_[1] eq 'ARRAY' and @{ $_[0] } == @{ $_[1] })) {
  1   33     1  
  1   33     4  
44 0         0 cluck "Expected input are two references to two arrays of equal length; first data, then positive weights.\n";
45 0         0 return undef;
46             }
47              
48 1         2 my ($datum,$weight) = @_;
49              
50             ##Calculate new mean, pseudo-variance, min and max;
51             ##The on-line weighted incremental algorithm for variance is based on West 1979 from Wikipedia
52             ##D. H. D. West (1979). Communications of the ACM, 22, 9, 532-535: Updating Mean and Variance Estimates: An Improved Method
53            
54             ## NEW in Version 0.4:
55             ## I calculate a sample weighted variance based on normalized weights rather than the sample size
56             ## correction factor is: 1 / (1 - sum [w_i / (sum w_i) ]^2)
57              
58             ## call H = sum [w_i / (sum w_i) ]^2. An online update eq for H is H_new = (sum.w_old^2 * H_old) + weight^2) / sum.w^2
59            
60             ## correction factor is then 1 / (1 - H_new)
61              
62 1         2 my $weighterror;
63 1         4 for (0..$#$datum ) {
64 4 50       11 if ($$weight[$_] <= 0) {
65 0         0 $weighterror = 1;
66 0         0 next;
67             }
68 4         4 $oldmean = $self->{mean};
69 4         4 $oldweight = $self->{weight};
70 4         5 $self->{weight} += $$weight[$_];
71 4         10 $self->{weight_homozyg} = ((($oldweight ** 2 * $self->{weight_homozyg}) + $$weight[$_] ** 2) / ( $self->{weight} ** 2 ));
72 4         7 $self->{count}++;
73 4         23 $self->{sum} += ($$weight[$_] * $$datum[$_]);
74 4         10 $self->{mean} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean));
75 4         9 $self->{sum_squares} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean) ** 2) * $oldweight;
76 4 50 66     19 if (not defined $self->{max} or $$datum[$_] > $self->{max}) {
77 4         7 $self->{max} = $$datum[$_];
78             }
79 4 100 66     18 if (not defined $self->{min} or $$datum[$_] < $self->{min}) {
80 1         3 $self->{min} = $$datum[$_];
81             }
82             }
83 1 50       5 cluck "One or more data with nonpositive weights were skipped.\n" if ($weighterror);
84 1         4 $self->{sample_range} = $self->{max} - $self->{min};
85 1 50       6 if ($self->{count} > 1) {
86 1         6 $self->{variance} = ($self->{sum_squares} / ((1 - $self->{weight_homozyg}) * $self->{weight}));
87 1         8 $self->{standard_deviation} = sqrt( $self->{variance});
88 1         3 $self->{biased_variance} = ($self->{sum_squares} / $self->{weight});
89 1         3 $self->{biased_standard_deviation} = sqrt( $self->{biased_variance});
90             }
91 1         4 return 1;
92             }
93              
94             sub weight {
95 0     0     my $self = shift;
96 0 0         if (@_ > 0) {
97 0           cluck "Sparse statistics object expects zero arguments to weight function, returns sum of weights.";
98             }
99 0           return $self->{weight};
100             }
101              
102             ## OVERRIDES FOR UNSUPPORTED FUNCTIONS
103              
104             sub mindex{
105 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
106             }
107              
108             sub maxdex{
109 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
110             }
111              
112             1;
113              
114             package Statistics::Descriptive::Weighted::Full;
115              
116 1     1   4 use Carp qw(cluck confess);
  1         1  
  1         40  
117 1     1   513 use Tree::Treap;
  1         2  
  1         34  
118 1     1   7 use strict;
  1         2  
  1         78  
119 1     1   4 use vars qw(@ISA %fields);
  1         1  
  1         1705  
120              
121             @ISA = qw(Statistics::Descriptive::Weighted::Sparse);
122              
123             ##Create a list of fields not to remove when data is updated
124             %fields = (
125             _permitted => undef, ##Place holder for the inherited key hash
126             data => undef, ##keys from variate values to a hashref with keys weight, cdf, tail-prob
127             did_cdf => undef, ##flag to indicate whether CDF/quantile fun has been computed or not
128             quantile => undef, ##"hash" for quantile function
129             percentile => undef, ##"hash" for percentile function
130             maxweight => 0,
131             mode => undef,
132             order => 1,
133             _reserved => undef, ##Place holder for this lookup hash
134             );
135              
136             ##Have to override the base method to add the data to the object
137             ##The proxy method from above is still valid
138             sub new {
139 0     0     my $proto = shift;
140 0   0       my $class = ref($proto) || $proto;
141 0           my $self = $class->SUPER::new(); ##Create my self re SUPER
142 0           $self->{data} = new Tree::Treap("num"); ## inserts data by numeric comparison
143 0           $self->{did_cdf} = 0;
144 0           $self->{maxweight} = 0;
145 0           $self->{quantile} = new Tree::Treap("num");
146 0           $self->{percentile} = new Tree::Treap("num");
147 0           $self->{order} = 1;
148 0           $self->{'_reserved'} = \%fields;
149 0           bless ($self, $class);
150 0           return $self;
151             }
152              
153             ## The treap gives relatively fast search and good performance on possibly sorted data
154             ## The choice is motivated by heavy intended use for Empirical Distribution Function
155             ## A lot of work is done at insertion for faster computation on search
156             ## THE ACTUAL DATA INSERTION IS DONE AT FUNCTION _addweight
157             ## The data structure loses information. Like a hash keys appear only once.
158             ## The value of a key is its sum of weight for that key, and the cumulative weight
159             sub add_data {
160 0     0     my $self = shift;
161 0           my $key;
162              
163 0           my ($datum,$weight) = @_;
164 0           my $filterdatum = [];
165 0           my $filterweight = [];
166 0           my $weighterror;
167             my $newweight;
168 0           for (0..$#$datum) {
169 0 0         if ($$weight[$_] > 0) {
170 0           push @$filterdatum,$$datum[$_];
171 0           push @$filterweight,$$weight[$_];
172 0           $newweight = $self->_addweight($$datum[$_], $$weight[$_]);
173 0 0         if ($newweight > $self->{maxweight}) {
174 0           $self->{maxweight} = $newweight;
175 0           $self->{mode} = $$datum[$_];
176             }
177             }
178             else {
179 0           $weighterror = 1;
180             }
181             }
182 0 0         cluck "One or more data with nonpositive weights were skipped.\n" if ($weighterror);
183 0           $self->SUPER::add_data($filterdatum,$filterweight); ##Perform base statistics on the data
184             ##Clear the did_cdf flag
185 0           $self->{did_cdf} = 0;
186             ##Need to delete all cached keys
187 0           foreach $key (keys %{ $self }) { # Check each key in the object
  0            
188             # If it's a reserved key for this class, keep it
189 0 0         next if exists $self->{'_reserved'}->{$key};
190             # If it comes from the base class, keep it
191 0 0         next if exists $self->{'_permitted'}->{$key};
192 0           delete $self->{$key}; # Delete the out of date cached key
193             }
194 0           return 1;
195             }
196              
197             sub count {
198 0     0     my $self = shift;
199 0 0         if (@_ == 1) { ##Inquire
    0          
200 0           my $val = $self->{data}->get_val($_[0]);
201 0 0         return (defined $val ? ${ $val }{'count'} : $val);
  0            
202             }
203             elsif (@_ == 0) { ##Inquire
204 0           return $self->{count};
205             }
206             else {
207 0           cluck "Only 1 or fewer arguments expected.";
208             }
209 0           return 1;
210             }
211              
212             sub weight {
213 0     0     my $self = shift;
214 0 0         if (@_ == 1) { ##Inquire
    0          
215 0           my $val = $self->{data}->get_val($_[0]);
216 0 0         return (defined $val ? ${ $val }{'weight'} : $val);
  0            
217             }
218             elsif (@_ == 0) { ##Inquire
219 0           return $self->{weight};
220             }
221             else {
222 0           cluck "Only 1 or fewer arguments expected.";
223             }
224 0           return 1;
225             }
226              
227             sub _addweight {
228 0     0     my $self = shift;
229 0   0       my $oldweight = ($self->weight($_[0]) || 0);
230 0           my $newweight = $_[1] + $oldweight;
231 0           my $value = $self->{data}->get_val($_[0]);
232 0 0         my $weights = ($value ? $$value{'weights'} : [] );
233 0           push @$weights, $_[1];
234 0 0         my $orders = ($value ? $$value{'order'} : [] );
235 0           push @$orders, $self->{order}++;
236 0   0       my $newcount = ($self->count($_[0]) || 0) + 1;
237 0 0         if (@_ == 2) { ##Assign
238 0           my $values = {'weight' => $newweight, 'weights' => $weights, 'count' => $newcount, 'order' => $orders, 'cdf' => undef, 'rt_tail_prob' => undef, 'percentile' => undef};
239 0           $self->{data}->insert($_[0],$values);
240             }
241             else {
242 0           cluck "Only two arguments (key, addend) expected.";
243             }
244 0           return $newweight;
245             }
246              
247             sub _do_cdf {
248 0     0     my $self = shift;
249 0           my $cumweight = 0;
250 0           foreach my $key ($self->{data}->keys()){
251 0           my $value = $self->{data}->get_val($key);
252 0           my $keyweight = $self->weight($key);
253              
254 0           my $oldcumweight = $cumweight;
255 0           $cumweight += $keyweight;
256              
257 0           my $propcumweight = $cumweight / $self->{weight};
258 0           my $right_tail_prob = (1 - ($oldcumweight / $self->{weight}));
259 0           my $percentile = ((100 / $self->{weight}) * ($cumweight - ($keyweight / 2)));
260              
261 0           $$value{'cdf'} = $propcumweight;
262 0           $$value{'rt_tail_prob'} = $right_tail_prob;
263 0           $$value{'percentile'} = $percentile;
264              
265 0           $self->{data}->insert($key,$value);
266 0           $self->{quantile}->insert($propcumweight,$key);
267 0           $self->{percentile}->insert($percentile,$key);
268             }
269 0           $self->{did_cdf} = 1;
270 0           return 1;
271             }
272              
273             sub quantile {
274 0     0     my $self = shift;
275 0 0         $self->_do_cdf() unless $self->{did_cdf};
276 0 0         if (@_ == 1) { ##Inquire
277 0           my $proportion = shift;
278 0 0 0       cluck "expects an argument between 0 and 1 inclusive." if ($proportion < 0 or $proportion > 1);
279 0           my @keys = $self->{quantile}->range_keys($proportion, undef);
280 0           my $key = $keys[0]; ## GET THE SMALLEST QUANTILE g.e. $proportion
281 0           return $self->{quantile}->get_val($keys[0]);
282             }
283             else {
284 0           cluck "exactly 1 argument expected.";
285 0           return undef;
286             }
287             }
288              
289             sub percentile {
290 0     0     my $self = shift;
291 0 0         $self->_do_cdf() unless $self->{did_cdf};
292 0 0         if (@_ == 1) { ##Inquire
293 0           my $percent = shift;
294 0 0 0       cluck "expects an argument between 0 and 100 inclusive." if ($percent < 0 or $percent > 100);
295 0 0         if ($percent < $self->{percentile}->minimum()) {
    0          
296 0           return $self->{data}->minimum();
297             }
298             elsif ($percent > $self->{percentile}->maximum()) {
299 0           return $self->{data}->maximum();
300             }
301             else {
302 0           my @gekeys = $self->{percentile}->range_keys($percent, undef);
303 0           my $gekey = $gekeys[0];
304 0           my $geval = $self->{percentile}->get_val($gekey);
305 0           my @lekeys = $self->{percentile}->range_keys(undef,$percent);
306 0           my $lekey = $lekeys[-1];
307 0           my $leval = $self->{percentile}->get_val($lekey);
308 0           return ($leval + (($percent - $lekey) / ($gekey - $lekey) * ($geval - $leval)));
309             }
310             }
311             else {
312 0           cluck "exactly 1 argument expected.";
313             }
314 0           return 1;
315             }
316              
317              
318             sub median {
319 0     0     my $self = shift;
320              
321             ##Cached?
322 0 0         return $self->{median} if defined $self->{median};
323 0           return $self->{median} = $self->percentile(50);
324             }
325              
326             sub mode {
327 0     0     my $self = shift;
328 0           return $self->{mode};
329             }
330              
331             sub cdf {
332 0     0     my $self = shift;
333 0 0         $self->_do_cdf() unless $self->{did_cdf};
334 0 0         if (@_ == 1) { ##Inquire
335 0           my $value = shift;
336 0 0         return 0 if ($self->{data}->minimum() > $value);
337 0           my @keys = $self->{data}->range_keys(undef, $value);
338 0           my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
339 0           return ${ $self->{data}->get_val($key) }{'cdf'};
  0            
340             }
341             else {
342 0           cluck "exactly 1 argument expected.";
343 0           return undef;
344             }
345             }
346              
347             sub survival {
348 0     0     my $self = shift;
349 0 0         $self->_do_cdf() unless $self->{did_cdf};
350 0 0         if (@_ == 1) { ##Inquire
351 0           my $value = shift;
352 0 0         return 1 if ($self->{data}->minimum() > $value);
353 0           my @keys = $self->{data}->range_keys(undef, $value);
354 0           my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
355 0           return 1 - (${ $self->{data}->get_val($key) }{'cdf'});
  0            
356             }
357             else {
358 0           cluck "only 1 argument expected.";
359 0           return undef;
360             }
361             }
362              
363             sub rtp {
364 0     0     my $self = shift;
365 0 0         $self->_do_cdf() unless $self->{did_cdf};
366 0 0         if (@_ == 1) { ##Inquire
367 0           my $value = shift;
368 0 0         return 0 if ($self->{data}->maximum() < $value);
369 0           my @keys = $self->{data}->range_keys($value, undef);
370 0           my $key = $keys[0]; ## GET THE SMALLEST OBSERVED VALUE g.e. $value
371 0           return ${ $self->{data}->get_val($key) }{'rt_tail_prob'};
  0            
372             }
373             else {
374 0           cluck "only 1 argument expected.";
375 0           return undef;
376             }
377             }
378              
379             sub get_data {
380 0     0     my $self = shift;
381 0 0         $self->_do_cdf() unless $self->{did_cdf};
382 0           my ($uniqkeys, $sumweights, $keys, $weights, $counts, $cdfs, $rtps, $percentiles, $order) = ([],[],[],[],[],[],[],[],[]);
383 0           my $key = $self->{'data'}->minimum();
384 0           while (defined $key){
385 0           my $value = $self->{data}->get_val($key);
386 0           push @$uniqkeys, $key;
387 0           push @$sumweights, $$value{'weight'};
388 0           foreach my $weight (@{ $$value{'weights'} } ) {
  0            
389 0           push @$keys, $key;
390 0           push @$weights, $weight;
391             }
392 0           push @$order, @{ $$value{'order'}};
  0            
393 0           push @$counts, $$value{'count'};
394 0           push @$cdfs, $$value{'cdf'};
395 0           push @$rtps, $$value{'rt_tail_prob'};
396 0           push @$percentiles, $$value{'percentile'};
397 0           $key = $self->{'data'}->successor($key);
398             }
399 0           return {'uniqvars' => $uniqkeys, 'sumweights' => $sumweights, 'counts' => $counts, 'cdfs' => $cdfs, 'rtps' => $rtps, 'vars' => $keys, 'weights' => $weights, 'percentiles' => $percentiles, 'order' => $order};
400             }
401              
402             sub print {
403 0     0     my $self = shift;
404 0           print Data::Dumper->Dump([$self->get_data()]);
405             }
406              
407             ## OVERRIDES FOR UNSUPPORTED FUNCTIONS
408              
409             sub sort_data{
410 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
411             }
412              
413             sub presorted{
414 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
415             }
416              
417             sub harmonic_mean{
418 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
419             }
420              
421             sub geometric_mean{
422 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
423             }
424              
425             sub trimmed_mean{
426 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
427             }
428              
429             sub frequency_distribution{
430 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
431             }
432              
433             sub least_squares_fit{
434 0     0     confess "Statistics::Descriptive::Weighted does not support this function.";
435             }
436              
437              
438             1;
439              
440             package Statistics::Descriptive;
441              
442             ##All modules return true.
443             1;
444              
445             __END__