File Coverage

blib/lib/Statistics/Descriptive/Weighted.pm

Criterion	Covered	Total	%
statement	65	255	25.4
branch	7	80	8.7
condition	8	31	25.8
subroutine	11	36	30.5
pod			n/a
total	91	402	22.6

line	stmt	bran	cond	sub	time	code
1						package Statistics::Descriptive::Weighted;
2						$VERSION = '0.8';
3	1			1	14215	use Statistics::Descriptive;
	1				20379
	1				27
4	1			1	795	use Data::Dumper;
	1				10045
	1				75
5
6						package Statistics::Descriptive::Weighted::Sparse;
7	1			1	6	use strict;
	1				5
	1				44
8	1			1	3	use vars qw($AUTOLOAD @ISA %fields);
	1				1
	1				80
9
10						@ISA = qw(Statistics::Descriptive::Sparse);
11
12	1			1	6	use Carp qw(cluck confess);
	1				2
	1				546
13
14						##Define a new field to be used as method, to
15						##augment the ones inherited
16						%fields = (
17						weight => 0,
18						sum_squares => 0,
19						weight_homozyg => 0,
20						biased_variance => 0,
21						biased_standard_deviation => 0,
22						);
23
24						##Have to override the base method to add new fields to the object
25						##The proxy method from base class is still valid
26						sub new {
27	1			1	13	my $proto = shift;
28	1		33		6	my $class = ref($proto) \|\| $proto;
29	1				10	my $self = $class->SUPER::new(); ##Create my self re SUPER
30	1				35	@{ $self->{'_permitted'} } {keys %fields} = values %fields;
	1				4
31	1				4	@{ $self } {keys %fields} = values %fields;
	1				3
32	1				4	bless ($self, $class); #Re-anneal the object
33	1				3	return $self;
34						}
35
36						sub add_data {
37	1			1	7	my $self = shift; ##Myself
38	1				1	my $oldmean;
39						my $oldweight;
40	0				0	my ($min,$max);
41	0				0	my $aref;
42
43	1	50	33		11	unless ((@_ == 2 and ref $_[0] eq 'ARRAY' and ref $_[1] eq 'ARRAY' and @{ $_[0] } == @{ $_[1] })) {
	1		33		1
	1		33		4
44	0				0	cluck "Expected input are two references to two arrays of equal length; first data, then positive weights.\n";
45	0				0	return undef;
46						}
47
48	1				2	my ($datum,$weight) = @_;
49
50						##Calculate new mean, pseudo-variance, min and max;
51						##The on-line weighted incremental algorithm for variance is based on West 1979 from Wikipedia
52						##D. H. D. West (1979). Communications of the ACM, 22, 9, 532-535: Updating Mean and Variance Estimates: An Improved Method
53
54						## NEW in Version 0.4:
55						## I calculate a sample weighted variance based on normalized weights rather than the sample size
56						## correction factor is: 1 / (1 - sum [w_i / (sum w_i) ]^2)
57
58						## call H = sum [w_i / (sum w_i) ]^2. An online update eq for H is H_new = (sum.w_old^2 * H_old) + weight^2) / sum.w^2
59
60						## correction factor is then 1 / (1 - H_new)
61
62	1				2	my $weighterror;
63	1				4	for (0..$#$datum ) {
64	4	50			11	if ($$weight[$_] <= 0) {
65	0				0	$weighterror = 1;
66	0				0	next;
67						}
68	4				4	$oldmean = $self->{mean};
69	4				4	$oldweight = $self->{weight};
70	4				5	$self->{weight} += $$weight[$_];
71	4				10	$self->{weight_homozyg} = ((($oldweight ** 2 * $self->{weight_homozyg}) + $$weight[$_] 2) / ( $self->{weight} 2 ));
72	4				7	$self->{count}++;
73	4				23	$self->{sum} += ($$weight[$_] * $$datum[$_]);
74	4				10	$self->{mean} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean));
75	4				9	$self->{sum_squares} += (($$weight[$_] / $self->{weight} ) * ($$datum[$_] - $oldmean) ** 2) * $oldweight;
76	4	50	66		19	if (not defined $self->{max} or $$datum[$_] > $self->{max}) {
77	4				7	$self->{max} = $$datum[$_];
78						}
79	4	100	66		18	if (not defined $self->{min} or $$datum[$_] < $self->{min}) {
80	1				3	$self->{min} = $$datum[$_];
81						}
82						}
83	1	50			5	cluck "One or more data with nonpositive weights were skipped.\n" if ($weighterror);
84	1				4	$self->{sample_range} = $self->{max} - $self->{min};
85	1	50			6	if ($self->{count} > 1) {
86	1				6	$self->{variance} = ($self->{sum_squares} / ((1 - $self->{weight_homozyg}) * $self->{weight}));
87	1				8	$self->{standard_deviation} = sqrt( $self->{variance});
88	1				3	$self->{biased_variance} = ($self->{sum_squares} / $self->{weight});
89	1				3	$self->{biased_standard_deviation} = sqrt( $self->{biased_variance});
90						}
91	1				4	return 1;
92						}
93
94						sub weight {
95	0			0		my $self = shift;
96	0	0				if (@_ > 0) {
97	0					cluck "Sparse statistics object expects zero arguments to weight function, returns sum of weights.";
98						}
99	0					return $self->{weight};
100						}
101
102						## OVERRIDES FOR UNSUPPORTED FUNCTIONS
103
104						sub mindex{
105	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
106						}
107
108						sub maxdex{
109	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
110						}
111
112						1;
113
114						package Statistics::Descriptive::Weighted::Full;
115
116	1			1	4	use Carp qw(cluck confess);
	1				1
	1				40
117	1			1	513	use Tree::Treap;
	1				2
	1				34
118	1			1	7	use strict;
	1				2
	1				78
119	1			1	4	use vars qw(@ISA %fields);
	1				1
	1				1705
120
121						@ISA = qw(Statistics::Descriptive::Weighted::Sparse);
122
123						##Create a list of fields not to remove when data is updated
124						%fields = (
125						_permitted => undef, ##Place holder for the inherited key hash
126						data => undef, ##keys from variate values to a hashref with keys weight, cdf, tail-prob
127						did_cdf => undef, ##flag to indicate whether CDF/quantile fun has been computed or not
128						quantile => undef, ##"hash" for quantile function
129						percentile => undef, ##"hash" for percentile function
130						maxweight => 0,
131						mode => undef,
132						order => 1,
133						_reserved => undef, ##Place holder for this lookup hash
134						);
135
136						##Have to override the base method to add the data to the object
137						##The proxy method from above is still valid
138						sub new {
139	0			0		my $proto = shift;
140	0		0			my $class = ref($proto) \|\| $proto;
141	0					my $self = $class->SUPER::new(); ##Create my self re SUPER
142	0					$self->{data} = new Tree::Treap("num"); ## inserts data by numeric comparison
143	0					$self->{did_cdf} = 0;
144	0					$self->{maxweight} = 0;
145	0					$self->{quantile} = new Tree::Treap("num");
146	0					$self->{percentile} = new Tree::Treap("num");
147	0					$self->{order} = 1;
148	0					$self->{'_reserved'} = \%fields;
149	0					bless ($self, $class);
150	0					return $self;
151						}
152
153						## The treap gives relatively fast search and good performance on possibly sorted data
154						## The choice is motivated by heavy intended use for Empirical Distribution Function
155						## A lot of work is done at insertion for faster computation on search
156						## THE ACTUAL DATA INSERTION IS DONE AT FUNCTION _addweight
157						## The data structure loses information. Like a hash keys appear only once.
158						## The value of a key is its sum of weight for that key, and the cumulative weight
159						sub add_data {
160	0			0		my $self = shift;
161	0					my $key;
162
163	0					my ($datum,$weight) = @_;
164	0					my $filterdatum = [];
165	0					my $filterweight = [];
166	0					my $weighterror;
167						my $newweight;
168	0					for (0..$#$datum) {
169	0	0				if ($$weight[$_] > 0) {
170	0					push @$filterdatum,$$datum[$_];
171	0					push @$filterweight,$$weight[$_];
172	0					$newweight = $self->_addweight($$datum[$_], $$weight[$_]);
173	0	0				if ($newweight > $self->{maxweight}) {
174	0					$self->{maxweight} = $newweight;
175	0					$self->{mode} = $$datum[$_];
176						}
177						}
178						else {
179	0					$weighterror = 1;
180						}
181						}
182	0	0				cluck "One or more data with nonpositive weights were skipped.\n" if ($weighterror);
183	0					$self->SUPER::add_data($filterdatum,$filterweight); ##Perform base statistics on the data
184						##Clear the did_cdf flag
185	0					$self->{did_cdf} = 0;
186						##Need to delete all cached keys
187	0					foreach $key (keys %{ $self }) { # Check each key in the object
	0
188						# If it's a reserved key for this class, keep it
189	0	0				next if exists $self->{'_reserved'}->{$key};
190						# If it comes from the base class, keep it
191	0	0				next if exists $self->{'_permitted'}->{$key};
192	0					delete $self->{$key}; # Delete the out of date cached key
193						}
194	0					return 1;
195						}
196
197						sub count {
198	0			0		my $self = shift;
199	0	0				if (@_ == 1) { ##Inquire
		0
200	0					my $val = $self->{data}->get_val($_[0]);
201	0	0				return (defined $val ? ${ $val }{'count'} : $val);
	0
202						}
203						elsif (@_ == 0) { ##Inquire
204	0					return $self->{count};
205						}
206						else {
207	0					cluck "Only 1 or fewer arguments expected.";
208						}
209	0					return 1;
210						}
211
212						sub weight {
213	0			0		my $self = shift;
214	0	0				if (@_ == 1) { ##Inquire
		0
215	0					my $val = $self->{data}->get_val($_[0]);
216	0	0				return (defined $val ? ${ $val }{'weight'} : $val);
	0
217						}
218						elsif (@_ == 0) { ##Inquire
219	0					return $self->{weight};
220						}
221						else {
222	0					cluck "Only 1 or fewer arguments expected.";
223						}
224	0					return 1;
225						}
226
227						sub _addweight {
228	0			0		my $self = shift;
229	0		0			my $oldweight = ($self->weight($_[0]) \|\| 0);
230	0					my $newweight = $_[1] + $oldweight;
231	0					my $value = $self->{data}->get_val($_[0]);
232	0	0				my $weights = ($value ? $$value{'weights'} : [] );
233	0					push @$weights, $_[1];
234	0	0				my $orders = ($value ? $$value{'order'} : [] );
235	0					push @$orders, $self->{order}++;
236	0		0			my $newcount = ($self->count($_[0]) \|\| 0) + 1;
237	0	0				if (@_ == 2) { ##Assign
238	0					my $values = {'weight' => $newweight, 'weights' => $weights, 'count' => $newcount, 'order' => $orders, 'cdf' => undef, 'rt_tail_prob' => undef, 'percentile' => undef};
239	0					$self->{data}->insert($_[0],$values);
240						}
241						else {
242	0					cluck "Only two arguments (key, addend) expected.";
243						}
244	0					return $newweight;
245						}
246
247						sub _do_cdf {
248	0			0		my $self = shift;
249	0					my $cumweight = 0;
250	0					foreach my $key ($self->{data}->keys()){
251	0					my $value = $self->{data}->get_val($key);
252	0					my $keyweight = $self->weight($key);
253
254	0					my $oldcumweight = $cumweight;
255	0					$cumweight += $keyweight;
256
257	0					my $propcumweight = $cumweight / $self->{weight};
258	0					my $right_tail_prob = (1 - ($oldcumweight / $self->{weight}));
259	0					my $percentile = ((100 / $self->{weight}) * ($cumweight - ($keyweight / 2)));
260
261	0					$$value{'cdf'} = $propcumweight;
262	0					$$value{'rt_tail_prob'} = $right_tail_prob;
263	0					$$value{'percentile'} = $percentile;
264
265	0					$self->{data}->insert($key,$value);
266	0					$self->{quantile}->insert($propcumweight,$key);
267	0					$self->{percentile}->insert($percentile,$key);
268						}
269	0					$self->{did_cdf} = 1;
270	0					return 1;
271						}
272
273						sub quantile {
274	0			0		my $self = shift;
275	0	0				$self->_do_cdf() unless $self->{did_cdf};
276	0	0				if (@_ == 1) { ##Inquire
277	0					my $proportion = shift;
278	0	0	0			cluck "expects an argument between 0 and 1 inclusive." if ($proportion < 0 or $proportion > 1);
279	0					my @keys = $self->{quantile}->range_keys($proportion, undef);
280	0					my $key = $keys[0]; ## GET THE SMALLEST QUANTILE g.e. $proportion
281	0					return $self->{quantile}->get_val($keys[0]);
282						}
283						else {
284	0					cluck "exactly 1 argument expected.";
285	0					return undef;
286						}
287						}
288
289						sub percentile {
290	0			0		my $self = shift;
291	0	0				$self->_do_cdf() unless $self->{did_cdf};
292	0	0				if (@_ == 1) { ##Inquire
293	0					my $percent = shift;
294	0	0	0			cluck "expects an argument between 0 and 100 inclusive." if ($percent < 0 or $percent > 100);
295	0	0				if ($percent < $self->{percentile}->minimum()) {
		0
296	0					return $self->{data}->minimum();
297						}
298						elsif ($percent > $self->{percentile}->maximum()) {
299	0					return $self->{data}->maximum();
300						}
301						else {
302	0					my @gekeys = $self->{percentile}->range_keys($percent, undef);
303	0					my $gekey = $gekeys[0];
304	0					my $geval = $self->{percentile}->get_val($gekey);
305	0					my @lekeys = $self->{percentile}->range_keys(undef,$percent);
306	0					my $lekey = $lekeys[-1];
307	0					my $leval = $self->{percentile}->get_val($lekey);
308	0					return ($leval + (($percent - $lekey) / ($gekey - $lekey) * ($geval - $leval)));
309						}
310						}
311						else {
312	0					cluck "exactly 1 argument expected.";
313						}
314	0					return 1;
315						}
316
317
318						sub median {
319	0			0		my $self = shift;
320
321						##Cached?
322	0	0				return $self->{median} if defined $self->{median};
323	0					return $self->{median} = $self->percentile(50);
324						}
325
326						sub mode {
327	0			0		my $self = shift;
328	0					return $self->{mode};
329						}
330
331						sub cdf {
332	0			0		my $self = shift;
333	0	0				$self->_do_cdf() unless $self->{did_cdf};
334	0	0				if (@_ == 1) { ##Inquire
335	0					my $value = shift;
336	0	0				return 0 if ($self->{data}->minimum() > $value);
337	0					my @keys = $self->{data}->range_keys(undef, $value);
338	0					my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
339	0					return ${ $self->{data}->get_val($key) }{'cdf'};
	0
340						}
341						else {
342	0					cluck "exactly 1 argument expected.";
343	0					return undef;
344						}
345						}
346
347						sub survival {
348	0			0		my $self = shift;
349	0	0				$self->_do_cdf() unless $self->{did_cdf};
350	0	0				if (@_ == 1) { ##Inquire
351	0					my $value = shift;
352	0	0				return 1 if ($self->{data}->minimum() > $value);
353	0					my @keys = $self->{data}->range_keys(undef, $value);
354	0					my $key = $keys[-1]; ## GET THE LARGEST OBSERVED VALUE l.e. $value
355	0					return 1 - (${ $self->{data}->get_val($key) }{'cdf'});
	0
356						}
357						else {
358	0					cluck "only 1 argument expected.";
359	0					return undef;
360						}
361						}
362
363						sub rtp {
364	0			0		my $self = shift;
365	0	0				$self->_do_cdf() unless $self->{did_cdf};
366	0	0				if (@_ == 1) { ##Inquire
367	0					my $value = shift;
368	0	0				return 0 if ($self->{data}->maximum() < $value);
369	0					my @keys = $self->{data}->range_keys($value, undef);
370	0					my $key = $keys[0]; ## GET THE SMALLEST OBSERVED VALUE g.e. $value
371	0					return ${ $self->{data}->get_val($key) }{'rt_tail_prob'};
	0
372						}
373						else {
374	0					cluck "only 1 argument expected.";
375	0					return undef;
376						}
377						}
378
379						sub get_data {
380	0			0		my $self = shift;
381	0	0				$self->_do_cdf() unless $self->{did_cdf};
382	0					my ($uniqkeys, $sumweights, $keys, $weights, $counts, $cdfs, $rtps, $percentiles, $order) = ([],[],[],[],[],[],[],[],[]);
383	0					my $key = $self->{'data'}->minimum();
384	0					while (defined $key){
385	0					my $value = $self->{data}->get_val($key);
386	0					push @$uniqkeys, $key;
387	0					push @$sumweights, $$value{'weight'};
388	0					foreach my $weight (@{ $$value{'weights'} } ) {
	0
389	0					push @$keys, $key;
390	0					push @$weights, $weight;
391						}
392	0					push @$order, @{ $$value{'order'}};
	0
393	0					push @$counts, $$value{'count'};
394	0					push @$cdfs, $$value{'cdf'};
395	0					push @$rtps, $$value{'rt_tail_prob'};
396	0					push @$percentiles, $$value{'percentile'};
397	0					$key = $self->{'data'}->successor($key);
398						}
399	0					return {'uniqvars' => $uniqkeys, 'sumweights' => $sumweights, 'counts' => $counts, 'cdfs' => $cdfs, 'rtps' => $rtps, 'vars' => $keys, 'weights' => $weights, 'percentiles' => $percentiles, 'order' => $order};
400						}
401
402						sub print {
403	0			0		my $self = shift;
404	0					print Data::Dumper->Dump([$self->get_data()]);
405						}
406
407						## OVERRIDES FOR UNSUPPORTED FUNCTIONS
408
409						sub sort_data{
410	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
411						}
412
413						sub presorted{
414	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
415						}
416
417						sub harmonic_mean{
418	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
419						}
420
421						sub geometric_mean{
422	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
423						}
424
425						sub trimmed_mean{
426	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
427						}
428
429						sub frequency_distribution{
430	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
431						}
432
433						sub least_squares_fit{
434	0			0		confess "Statistics::Descriptive::Weighted does not support this function.";
435						}
436
437
438						1;
439
440						package Statistics::Descriptive;
441
442						##All modules return true.
443						1;
444
445						__END__