File Coverage

blib/lib/Statistics/Discrete.pm

Criterion	Covered	Total	%
statement	113	491	23.0
branch	19	142	13.3
condition	2	27	7.4
subroutine	19	44	43.1
pod	18	22	81.8
total	171	726	23.5

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2							#
3							# Statistics::Discrete
4							#
5							# Chiara Orsini, CAIDA, UC San Diego
6							# chiara@caida.org
7							#
8							# Copyright (C) 2014 The Regents of the University of California.
9							#
10							# Statistics::Discrete is free software: you can redistribute it and/or modify
11							# it under the terms of the GNU General Public License as published by
12							# the Free Software Foundation, either version 3 of the License, or
13							# (at your option) any later version.
14							#
15							# Statistics::Discrete is distributed in the hope that it will be useful,
16							# but WITHOUT ANY WARRANTY; without even the implied warranty of
17							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18							# GNU General Public License for more details.
19							#
20							# You should have received a copy of the GNU General Public License
21							# along with Statistics::Discrete. If not, see .
22							#
23
24							package Statistics::Discrete;
25
26
27	1			1		33479	use 5.014002;
	1					4
28	1			1		4	use strict;
	1					2
	1					23
29	1			1		4	use warnings;
	1					5
	1					34
30	1			1		5	use List::Util;
	1					1
	1					70
31	1			1		754	use POSIX;
	1					7200
	1					4
32	1			1		4496	use Storable 'dclone';
	1					6036
	1					100
33
34
35
36							our $VERSION = '0.05';
37
38	1			1		8	use constant NO_BINNING => 0;
	1					2
	1					83
39	1			1		5	use constant LIN_BINNING => 1;
	1					2
	1					39
40	1			1		4	use constant LOG_BINNING => 2;
	1					3
	1					42
41
42	1			1		5	use constant LOG_BASE => 10;
	1					2
	1					36
43	1			1		4	use constant DEFAULT_BINS_NUMBER => 10;
	1					2
	1					5092
44
45
46							# Constructor
47							sub new {
48	1			1	0	420	my $class = shift;
49	1					2	my $self = {};
50	1					4	$self->{"data_frequency"} = {}; # anonymous hash constructor
51							# binning default values
52	1					3	$self->{"bin-type"} = NO_BINNING;
53	1					2	$self->{"optimal-binning"} = 1;
54							# $self->{"bins"} - bins' structure
55							# $self->{"num-bins"} - num bins when optimal binning is off
56							# on-demand stored stats
57							# $self->{"stats"}{"Desc"} - descriptive statistics
58							# $self->{"stats"}{"Dist"} - distributions
59							# $self->{"stats"}{"Dist"}{"binned"} - binned distributions
60							# bind self and class
61	1					2	bless $self, $class;
62	1					3	return $self;
63							}
64
65							################## Methods to load/add data ##################
66
67							sub add_data {
68	1			1	0	685	my $self = shift;
69	1					4	my @data_to_add = @_;
70	1					1	my $data;
71	1					3	foreach $data(@data_to_add) {
72	10	100				28	if(defined($self->{"data_frequency"}{$data})) {
73	4					6	$self->{"data_frequency"}{$data}++;
74							}
75							else {
76	6					20	$self->{"data_frequency"}{$data} = 1 ;
77							}
78							}
79							# data has changed - stats and bins need to be computed again
80	1					3	delete $self->{"stats"};
81	1					2	delete $self->{"bins"};
82	1					3	return;
83							}
84
85							sub add_data_from_file {
86	0			0	0	0	my $self = shift;
87	0					0	my $filename = shift;
88	0					0	my @cols;
89							my $f;
90	0					0	my $line;
91	0	0				0	open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
92	0					0	while($line = <$f>) {
93	0					0	chomp($line);
94	0	0				0	if($line =~ /^\s*#/) {
95	0					0	next;
96							}
97	0					0	@cols = split /\s/ , $line;
98	0	0				0	if(scalar @cols == 1) { # assume list of values
99	0	0				0	if(defined($self->{"data_frequency"}{$cols[0]})) {
100	0					0	$self->{"data_frequency"}{$cols[0]}++;
101							}
102							else {
103	0					0	$self->{"data_frequency"}{$cols[0]} = 1 ;
104							}
105							}
106	0	0				0	if(scalar @cols == 2) { # assume list of id - value pairs
107	0	0				0	if(defined($self->{"data_frequency"}{$cols[1]})) {
108	0					0	$self->{"data_frequency"}{$cols[1]}++;
109							}
110							else {
111	0					0	$self->{"data_frequency"}{$cols[1]} = 1 ;
112							}
113							}
114							# File format not recognized
115	0	0	0			0	if(scalar @cols != 1 and scalar @cols != 2){
116	0					0	print "File format not recognized\n";
117	0					0	close($f);
118	0					0	delete $self->{"stats"};
119	0					0	delete $self->{"bins"};
120	0					0	return;
121							}
122							}
123	0					0	close($f);
124							# data has changed - stats and bins need to be computed again
125	0					0	delete $self->{"stats"};
126	0					0	delete $self->{"bins"};
127	0					0	return;
128							}
129
130							sub add_data_from_frequencyfile {
131	0			0	0	0	my $self = shift;
132	0					0	my $filename = shift;
133	0					0	my @cols;
134							my $f;
135	0					0	my $line;
136	0	0				0	open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
137	0					0	while($line = <$f>) {
138	0					0	chomp($line);
139	0	0				0	if($line =~ /^\s*#/) {
140	0					0	next;
141							}
142	0					0	@cols = split /\s+/ , $line;
143							#DEBUG print $line . "\t" . @cols . "\n";
144	0	0				0	if(scalar @cols == 2) { # assume list of value - count
145	0	0				0	if(defined($self->{"data_frequency"}{$cols[0]})) {
146	0					0	$self->{"data_frequency"}{$cols[0]} += $cols[1];
147							}
148							else {
149	0					0	$self->{"data_frequency"}{$cols[0]} = $cols[1];
150							}
151	0					0	next;
152							}
153							# File format not recognized
154	0	0				0	if(scalar @cols != 2){
155	0					0	print "File format not recognized\n";
156	0					0	close($f);
157	0					0	delete $self->{"stats"};
158	0					0	delete $self->{"bins"};
159	0					0	return;
160							}
161							}
162	0					0	close($f);
163							# data has changed - stats and bins need to be computed again
164	0					0	delete $self->{"stats"};
165	0					0	delete $self->{"bins"};
166	0					0	return;
167							}
168
169
170							################## Descriptive Statistics ##################
171
172							# Minimum value
173							# compute the minimum value, save the statistic, and return it
174							sub minimum {
175	1			1	1	9	my $self = shift;
176	1	50				6	if(!defined($self->{"stats"}{"Desc"}{"min"})) {
177	1					3	my $count = $self->count();
178	1	50				5	if($count > 0) {
179	1					15	$self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys $self->{"data_frequency"});
180							}
181							else {
182	0					0	$self->{"stats"}{"Desc"}{"min"} = 0;
183							}
184							}
185	1					6	return $self->{"stats"}{"Desc"}{"min"};
186							}
187
188
189							# Maximum value
190							# compute the minimum value, save the statistic, and return it
191							sub maximum {
192	1			1	1	2	my $self = shift;
193	1	50				6	if(!defined($self->{"stats"}{"Desc"}{"max"})) {
194	1					2	my $count = $self->count();
195	1	50				4	if($count > 0) {
196	1					10	$self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys $self->{"data_frequency"});
197							}
198							else {
199	0					0	$self->{"stats"}{"Desc"}{"max"} = 0;
200							}
201							}
202	1					5	return $self->{"stats"}{"Desc"}{"max"};
203							}
204
205
206							# count the number of samples provided
207							# save the statistic, and return it
208							sub count {
209	6			6	1	12	my $self = shift;
210	6	100				18	if(!defined($self->{"stats"}{"Desc"}{"count"})) {
211	1					3	my $count = 0;
212	1					1	my $data;
213	1					2	foreach $data(keys %{$self->{"data_frequency"}}) {
	1					7
214	6					10	$count += $self->{"data_frequency"}{$data};
215							}
216	1					4	$self->{"stats"}{"Desc"}{"count"} = $count;
217							}
218	6					15	return $self->{"stats"}{"Desc"}{"count"};
219							}
220
221
222							# http://en.wikipedia.org/wiki/Expected_value
223							# the weighted average of the possible values,
224							# using their probabilities as their weights
225							sub mean {
226	2			2	1	5	my $self = shift;
227	2	100				8	if(!defined($self->{"stats"}{"Desc"}{"mean"})) {
228	1					1	my $cumul_value = 0;
229	1					3	my $count = $self->count();
230	1					2	my $v;
231	1					2	foreach $v( keys %{$self->{"data_frequency"}}) {
	1					15
232	6					13	$cumul_value += $v * $self->{"data_frequency"}{$v};
233							}
234	1	50				20	if($count > 0) {
235	1					4	$self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count;
236							}
237							else {
238	0					0	$self->{"stats"}{"Desc"}{"mean"} = 0;
239							}
240							}
241	2					7	return $self->{"stats"}{"Desc"}{"mean"};
242							}
243
244
245
246							# http://en.wikipedia.org/wiki/Median
247							# the median is the numerical value separating
248							# the higher half of a data sample, a population,
249							# or a probability distribution, from the lower half.
250							sub median {
251	1			1	1	3	my $self = shift;
252	1	50				10	if(!defined($self->{"stats"}{"Desc"}{"median"})) {
253	1					4	my $count = $self->count();
254	1					2	my $odd = 0;
255	1	50				6	if($count % 2 != 0) {
256	0					0	$odd = 1;
257							}
258	1					15	my $median_index = floor($count/2) + $odd;
259	1					2	my $current_index = 0;
260
261	1					1	my ($v,$i);
262	1					2	foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) {
	8					13
	1					9
263	6					14	for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) {
264	10					9	$current_index++;
265	10	100				22	if($current_index == $median_index) {
266	1					3	$self->{"stats"}{"Desc"}{"median"} = $v;
267							}
268	10	100	66			44	if($current_index == ($median_index+1) and
269							$odd == 0) {
270	1					5	$self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2;
271							}
272
273							}
274							}
275							}
276	1					5	return $self->{"stats"}{"Desc"}{"median"};
277							}
278
279
280							# http://en.wikipedia.org/wiki/Percentile
281							# A percentile (or a centile) is a measure
282							# used in statistics indicating the value
283							# below which a given percentage of observations
284							# in a group of observations fall.
285							sub percentile {
286	0			0	1	0	my $self = shift;
287	0					0	my $perc = shift;
288	0					0	my $max_probability = $perc/100;
289	0					0	my $cdf = $self->empirical_distribution_function();
290	0					0	my $v = 0;
291	0					0	my $previous_v = 0;
292	0					0	my $epsilon = 0;
293	0					0	foreach $v(sort {$a<=>$b} keys %{$cdf}) {
	0					0
	0					0
294	0					0	$epsilon = $v - $previous_v;
295	0	0				0	if($cdf->{$v} > $max_probability) {
296	0					0	return $v;
297							}
298	0					0	$previous_v = $v;
299							}
300							# if the program is here then $perc == 1
301							# then we return $v+epsilon
302	0					0	return $v+$epsilon;
303							}
304
305
306
307							# http://en.wikipedia.org/wiki/Variance
308							# The variance of a random variable X is
309							# its second central moment, the expected
310							# value of the squared deviation from the mean
311							sub variance {
312	1			1	1	1	my $self = shift;
313	1	50				5	if(!defined($self->{"stats"}{"Desc"}{"variance"})) {
314	1					3	my $mean = $self->mean();
315	1					3	my $count = $self->count();
316	1					2	my $cumul_value = 0;
317	1					2	my $square_mean = 0;
318	1					1	my $v;
319	1					7	foreach $v(keys %{$self->{"data_frequency"}}) {
	1					4
320	6					13	$cumul_value += ($v*2) $self->{"data_frequency"}{$v};
321							}
322	1	50				4	if($count > 0) {
323	1					2	$square_mean = $cumul_value / $count;
324							}
325	1					4	$self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2);
326							}
327	1					19	return $self->{"stats"}{"Desc"}{"variance"};
328							}
329
330
331							# http://en.wikipedia.org/wiki/Standard_deviation
332							# The standard deviation of a random variable
333							# is the square root of its variance.
334							sub standard_deviation {
335	0			0	1		my $self = shift;
336	0	0					if(!defined($self->{"stats"}{"Desc"}{"sdev"})) {
337	0						my $variance = $self->variance();
338	0						$self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance);
339							}
340	0						return $self->{"stats"}{"Desc"}{"sdev"};
341							}
342
343
344							################## Distributions Statistics ##################
345
346							# http://en.wikipedia.org/wiki/Frequency_distribution
347							# the frequency distribution is an arrangement of the values
348							# that one or more variables take in a sample. Each entry in
349							# the table contains the frequency or count of the occurrences
350							# of values within a particular group or interval
351							sub frequency_distribution {
352	0			0	1		my $self = shift;
353	0	0					if($self->count() == 0) {
354	0						print "WARNING: empty dataset, returning empty frequency distribution. \n";
355	0						return (); # empty hash
356							}
357	0	0					if($self->{"bin-type"} == NO_BINNING) {
358							# $self->{"stats"}{"Dist"}{"frequency"}
359	0						return $self->_frequency_distribution();
360							}
361							else {
362							# $self->{"stats"}{"Dist"}{"binned"}{"frequency"}
363	0						return $self->_binned_frequency_distribution();
364							}
365							}
366
367
368							# http://en.wikipedia.org/wiki/Probability_mass_function
369							# the probability mass function (pmf) is a function that gives
370							# the probability that a discrete random variable is exactly
371							# equal to some value.
372							sub probability_mass_function {
373	0			0	1		my $self = shift;
374	0	0					if($self->count() == 0) {
375	0						print "WARNING: empty dataset, returning empty probability mass function. \n";
376	0						return (); # empty hash
377							}
378	0	0					if($self->{"bin-type"} == NO_BINNING) {
379							# $self->{"stats"}{"Dist"}{"pmf"}
380	0						return $self->_probability_mass_function();
381							}
382							else {
383							# $self->{"stats"}{"Dist"}{"binned"}{"pmf"}
384	0						return $self->_binned_probability_mass_function();
385							}
386							}
387
388
389							# http://en.wikipedia.org/wiki/Empirical_distribution_function
390							# the empirical distribution function, or empirical cdf, is the
391							# cumulative distribution function associated with the empirical
392							# measure of the sample. This cdf is a step function that jumps up
393							# by 1/n at each of the n data points.
394							sub empirical_distribution_function {
395	0			0	1		my $self = shift;
396	0	0					if($self->count() == 0) {
397	0						print "WARNING: empty dataset, returning empty emptirical distribution function. \n";
398	0						return (); # empty hash
399							}
400	0	0					if($self->{"bin-type"} == NO_BINNING) {
401							# $self->{"stats"}{"Dist"}{"cdf"}
402	0						return $self->_empirical_distribution_function();
403							}
404							else {
405							# $self->{"stats"}{"Dist"}{"binned"}{"cdf"}
406	0						return $self->_binned_empirical_distribution_function();
407							}
408							}
409
410
411							# http://en.wikipedia.org/wiki/CCDF
412							# how often the random variable is above a particular level.
413							sub complementary_cumulative_distribution_function {
414	0			0	1		my $self = shift;
415	0	0					if($self->count() == 0) {
416	0						print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n";
417	0						return (); # empty hash
418							}
419	0	0					if($self->{"bin-type"} == NO_BINNING) {
420							# $self->{"stats"}{"Dist"}{"ccdf"}
421	0						return $self->_complementary_cumulative_distribution_function();
422							}
423							else {
424							# $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}
425	0						return $self->_binned_complementary_cumulative_distribution_function();
426							}
427							}
428
429
430
431							################## Regular Distributions Statistics ##################
432
433
434							sub _frequency_distribution {
435	0			0			my $self = shift;
436	0	0					if(!defined($self->{"stats"}{"Dist"}{"frequency"})) {
437	0						$self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"};
438							}
439	0						return $self->{"stats"}{"Dist"}{"frequency"};
440							}
441
442
443							sub _probability_mass_function {
444	0			0			my $self = shift;
445	0	0					if(!defined($self->{"stats"}{"Dist"}{"pmf"})) {
446	0						my $count = $self->count();
447	0						$self->_frequency_distribution();
448	0						my ($val,$freq);
449	0						foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
450	0						$freq = $self->{"stats"}{"Dist"}{"frequency"}{$val};
451	0						$self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count;
452							}
453							}
454	0						return $self->{"stats"}{"Dist"}{"pmf"};
455							}
456
457
458							sub _empirical_distribution_function {
459	0			0			my $self = shift;
460	0	0					if(!defined($self->{"stats"}{"Dist"}{"cdf"})) {
461	0						my $count = $self->count();
462	0						$self->_frequency_distribution();
463	0						my $val;
464	0						my $cumul_freq = 0;
465	0						foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
	0
466	0						$cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val};
467	0						$self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count;
468							}
469							}
470	0						return $self->{"stats"}{"Dist"}{"cdf"};
471							}
472
473
474							sub _complementary_cumulative_distribution_function {
475	0			0			my $self = shift;
476							# warning, this creates self->{"stats"} if it doesn't exist
477	0	0					if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) {
478	0						my $count = $self->count();
479	0						$self->frequency_distribution();
480	0						my $val;
481	0						my $cumul_freq = $count;
482	0						foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
	0
483	0						$cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val};
484	0						$self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count;
485							}
486							}
487	0						return $self->{"stats"}{"Dist"}{"ccdf"};
488							}
489
490
491
492							################## Binned Distributions Statistics ##################
493
494
495							sub _binned_frequency_distribution {
496	0			0			my $self = shift;
497	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) {
498							# 1. get bins
499	0						$self->bins(); # set $self->{"bins"}
500							# 2. get regular frequency distribution
501	0						$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
502							# 3. compute binned property
503	0						my $binned_frequency = {};
504	0						my $i = 0; # vals iterator
505	0						my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}});
	0
	0
506	0						my $num_vals = scalar @sorted_vals;
507	0						my $bi = 0; # bins iterator
508	0						my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}});
	0
	0
509	0						my $num_bins = scalar @sorted_bins;
510	0						my $freq_sum = 0;
511	0						for(; $bi < $num_bins; $bi++) { # for each bin
512	0						for(; $i < $num_vals; $i++) {
513	0	0					if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) {
514	0						$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]}
515							}
516							else {
517	0						$binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
518	0						$freq_sum = 0;
519	0						last;
520							}
521							}
522
523							# if last element of vals then we save the current freq sum
524							# in the current bin
525	0	0					if($i == $num_vals) {
526	0						$binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
527							}
528							}
529	0						$self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency;
530							}
531	0						return $self->{"stats"}{"Dist"}{"binned"}{"frequency"};
532							}
533
534
535							sub _binned_probability_mass_function {
536	0			0			my $self = shift;
537	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) {
538	0						my $count = $self->count();
539	0						my $bins = $self->bins(); # set $self->{"bins"}
540	0						my $bfd = $self->_binned_frequency_distribution();
541	0						my ($val,$freq, $interval);
542	0						foreach $val(keys %{$bfd}) {
	0
543	0						$freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val};
544	0						$interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"};
545	0	0					if($interval > 0) {
546	0						$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval;
547							}
548							else {
549	0						$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count;
550							}
551							}
552							}
553	0						return $self->{"stats"}{"Dist"}{"binned"}{"pmf"};
554							}
555
556
557							sub _binned_empirical_distribution_function {
558	0			0			my $self = shift;
559	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) {
560	0						my $bpmf = $self->_binned_probability_mass_function();
561	0						my $val;
562	0						my $cumul_prob = 0;
563	0						foreach $val(sort {$a<=>$b} keys %{$bpmf}) {
	0
	0
564	0						$cumul_prob += $bpmf->{$val};
565	0						$self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob;
566							}
567							}
568	0						return $self->{"stats"}{"Dist"}{"binned"}{"cdf"};
569							}
570
571
572							sub _binned_complementary_cumulative_distribution_function {
573	0			0			my $self = shift;
574	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) {
575	0						my $bpmf = $self->_binned_probability_mass_function();
576	0						my $val;
577	0						my $cumul_prob = 0;
578	0						my $old_cumul_prob = 0;
579	0						foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort
	0
	0
580	0						$old_cumul_prob = $cumul_prob;
581	0						$cumul_prob += $bpmf->{$val};
582	0						$self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob;
583							}
584							}
585	0						return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"};
586							}
587
588
589							################## Binned Related Functions ##################
590
591
592							sub set_binning_type {
593	0			0	1		my $self = shift;
594	0						my $bin_type = shift;
595							# check bin-type validity
596	0	0	0				if($bin_type != NO_BINNING && $bin_type != LIN_BINNING &&
			0
597							$bin_type != LOG_BINNING) {
598	0						die "Wrong binning type provided\n";
599							}
600							# if bin type is already set
601	0	0					if($bin_type == $self->{"bin-type"}) {
602	0						return;
603							}
604							# if bin-type is different
605	0						$self->{"bin-type"} = $bin_type;
606	0						delete $self->{"num-bins"};
607	0						delete $self->{"bins"};
608	0						delete $self->{"stats"}{"Dist"}{"binned"};
609							}
610
611
612							sub set_optimal_binning {
613	0			0	1		my $self = shift;
614	0	0					if( $self->{"optimal-binning"} == 0) {
615	0						delete $self->{"bins"};
616	0						delete $self->{"num-bins"};
617	0						delete $self->{"stats"}{"Dist"}{"binned"};
618							}
619	0						$self->{"optimal-binning"} = 1;
620							}
621
622
623							sub set_custom_num_bins {
624	0			0	1		my $self = shift;
625	0						my $nb = shift;
626	0	0					if($self->{"bin-type"} == NO_BINNING) {
627	0						die "Set a binning type before setting the num of bins\n";
628							}
629	0	0					if( $nb < 2 ) {
630	0						die "Wrong number of bins provided: " . $nb . "\n";
631							}
632							# check if there is a change
633	0	0	0				if( $self->{"optimal-binning"} == 0 and
			0
634							defined ($self->{"num-bins"}) and
635							$self->{"num-bins"} == $nb) {
636							# nothing changed -> nothing to do
637							}
638							else {
639							# update binning parameters
640	0						$self->{"optimal-binning"} = 0;
641	0						$self->{"num-bins"} = $nb;
642	0						delete $self->{"stats"}{"Dist"}{"binned"};
643	0						delete $self->{"bins"};
644							}
645							}
646
647
648							sub bins {
649	0			0	1		my $self = shift;
650	0	0					if($self->count() == 0) {
651	0						print "WARNING: empty dataset, returning empty bins. \n";
652	0						return {}; # empty array
653							}
654
655	0	0					if(!defined($self->{"bins"})) {
656							# CASE 1: if no_binning is set we do not
657							# save the binning structure
658	0	0					if( $self->{"bin-type"} == NO_BINNING) {
659	0						my $curbins = {};
660	0						my $v;
661	0						foreach $v(keys %{$self->{"data_frequency"}}) {
	0
662	0						my $ref = $v;
663	0						$curbins->{$v}{"left"} = $v;
664	0						$curbins->{$v}{"right"} = $v;
665							}
666	0						return $curbins;
667							}
668
669							# CASE 2: if a custom number of bins is
670							# provided, we lin-/log- bin the support
671							# accordingly
672	0	0					if($self->{"optimal-binning"} == 0) {
673	0	0					if($self->{"bin-type"} == LIN_BINNING) {
674	0						$self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"});
675							}
676	0	0					if($self->{"bin-type"} == LOG_BINNING) {
677	0						$self->{"bins"} = $self->compute_log_bins($self->{"num-bins"});
678							}
679							}
680
681							# CASE 3: optimal binning
682							else {
683	0						my $res = $self->_optimal_binning();
684	0	0					if($res != 0) {
685	0						print "Optimal binning was not possible on this sample, ";
686	0						print "using the NO_BINNING mode" . "\n";
687	0						my $curbins = {};
688	0						my $v;
689	0						foreach $v(keys %{$self->{"data_frequency"}}) {
	0
690	0						my $ref = $v;
691	0						$curbins->{$v}{"left"} = $v;
692	0						$curbins->{$v}{"right"} = $v;
693							}
694	0						$self->{"bin-type"} = NO_BINNING;
695	0						$self->{"bins"} = $curbins;
696							}
697							}
698							}
699	0						return $self->{"bins"};
700							}
701
702
703							################## Optimal Binning Related Functions ##################
704
705							# Compute the optimal binning
706							sub _optimal_binning {
707	0			0			my $self = shift;
708							# LEGEND:
709							# b_* => bins
710							# f_* => binned frequency distribution
711							# d_* => binned density distribution
712							# s_* => sum of density distribution values
713							#
714							# start from extreme values ( min_num_bins and max_num_bins)
715							# and use the bisection method on the interval
716							# till the optimal is found (i.e. the binning such that
717							# the sum of the density function values is the closest
718							# to 1)
719	0						my $min_num_bins = 2;
720	0						my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins);
721	0						my $max_num_bins = scalar $self->_full_support();
722	0	0					if($max_num_bins == 1) {
723	0						return -1;
724							}
725	0						my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins);
726							# initial check
727	0	0	0				if( $s_min > 1 or $s_max < 1) {
728	0						return -1;
729							}
730	0						my $avg_num_bins;
731	0						my ($b_avg, $f_avg, $d_avg, $s_avg);
732	0						while( ($max_num_bins - $min_num_bins) > 1) {
733	0						$avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2));
734	0						($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins);
735	0	0					if($s_avg < 1) {
736	0						$min_num_bins = $avg_num_bins;
737	0						($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg);
738							}
739							else {
740	0						$max_num_bins = $avg_num_bins;
741	0						($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg);
742							}
743							}
744							# the binning whose density sum is closer to 1 win
745	0	0					if( (1 - $s_min) < ($s_max-1) ) {
746	0						$self->{"bins"} = $b_min;
747	0						$self->{"Dist"}{"binned"}{"frequency"} = $f_min;
748	0						$self->{"Dist"}{"binned"}{"pmf"} = $d_min;
749							}
750							else {
751	0						$self->{"bins"} = $b_max;
752	0						$self->{"Dist"}{"binned"}{"frequency"} = $f_max;
753	0						$self->{"Dist"}{"binned"}{"pmf"} = $d_max;
754							}
755	0						return 0;
756							}
757
758
759							# bin the current data into $num_bins intervals
760							# and return: the bins, the binned frequency distribution
761							# the binned density distribution and the sum of the
762							# density values
763							sub _bin_attempt {
764	0			0			my $self = shift;
765	0						my $num_bins = shift;
766							# output values
767	0						my ($cur_bins, $cur_freq, $cur_density, $cur_sum);
768	0						$cur_sum = 0;
769							# get the bins
770	0	0					if( $self->{"bin-type"} == LIN_BINNING ) {
771	0						$cur_bins = $self->compute_lin_bins($num_bins);
772							}
773	0	0					if( $self->{"bin-type"} == LOG_BINNING ) {
774	0						$cur_bins =$self->compute_log_bins($num_bins);
775							}
776							# assert non binned frequency distribution is computed
777	0						$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
778	0						my $count = $self->count();
779							#
780	0						my $s = 0; # support iterator
781	0						my @sorted_support = (sort {$a<=>$b} $self->_full_support());
	0
782	0						my $sup_size = scalar @sorted_support;
783	0						my $bi = 0; # bins iterator
784	0						my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins});
	0
	0
785	0						my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins)
786	0						my $ref = 0;
787	0						my $freq_sum = 0;
788	0						my $interval = 0;
789	0						for(; $bi < $n_bins; $bi++) { # for each bin
790	0						$ref = $sorted_bins[$bi];
791	0						$interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"};
792	0						for(; $s < $sup_size; $s++) { # for each support value
793							# if the current support value is less than the rightmost value in the bin
794	0	0					if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) {
795	0						$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]};
796							}
797							# else we have to save the information for the current bin
798							else {
799	0						$cur_freq->{$ref} = $freq_sum;
800	0	0					if($interval > 0) {
801	0						$cur_density->{$ref} = ($freq_sum / $count) / $interval;
802							}
803							else {
804	0						$cur_density->{$ref} = $freq_sum / $count;
805							}
806	0						$cur_sum += $cur_density->{$ref};
807	0						$freq_sum = 0;
808	0						last; # we do not increment s
809							}
810							}
811							# if last element of the support has been reached
812							# we have to save the current freq sum in the current bin
813	0	0					if($s == $sup_size) {
814	0						$cur_freq->{$ref} = $freq_sum;
815	0	0					if($interval > 0) {
816	0						$cur_density->{$ref} = ($freq_sum / $count) / $interval;
817							}
818							else {
819	0						$cur_density->{$ref} = $freq_sum / $count;
820							}
821	0						$cur_sum += $cur_density->{$ref};
822							}
823							}
824							#DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n";
825	0						return ($cur_bins, $cur_freq, $cur_density, $cur_sum);
826							}
827
828
829
830							sub _full_support {
831	0			0			my $self = shift;
832	0						my $support = {};
833	0						my $v;
834	0						foreach $v( keys %{$self->{"data_frequency"}}) {
	0
835	0						$support->{$v} = 1;
836							}
837	0						return (keys %{$support});
	0
838							}
839
840
841
842
843							################## Utilities ##################
844
845							# return a structure describing a linear binning
846							# of the current data ($num_bins provided)
847							sub compute_lin_bins {
848	0			0	1		my $self = shift;
849	0						my $num_bins = shift;
850	0						my $min = $self->minimum();
851	0						my $max = $self->maximum();
852	0						my $linbins = {}; # reference to empty hash
853							# last bin includes just the maximum value
854	0						my $delta = abs($max - $min)/($num_bins-1);
855	0						my $i;
856							my $ref;
857	0						for($i=0; $i < $num_bins; $i++) {
858	0						$ref = $min + $i * $delta + $delta/2;
859	0						$linbins->{$ref}{"left"} = $min + $i * $delta;
860	0						$linbins->{$ref}{"right"} = $min + $i * $delta + $delta;
861							}
862	0						return $linbins;
863							}
864
865
866							# return a structure describing a logarithmic binning
867							# of the current data ($num_bins provided)
868							# bins are not set
869							sub compute_log_bins {
870	0			0	1		my $self = shift;
871	0						my $num_bins = shift;
872	0						my $min = $self->minimum();
873	0						my $max = $self->maximum();
874	0	0	0				if($min == 0 or $max == 0 or $min *$max <0) {
			0
875							# if the interval contains zero, then we cannot logbin
876							# we return the linear binning
877	0						print "Logarithmic binning was not possible on this sample, ";
878	0						print "switching to linear binning mode" . "\n";
879	0						$self->{"bin-type"} = LIN_BINNING;
880	0						return $self->compute_lin_bins($num_bins);
881							}
882	0						my $logbins = {}; # reference to empty hash
883	0						my $min_exp = log($min)/log(LOG_BASE);
884	0						my $max_exp = log($max+1)/log(LOG_BASE);
885	0						my $delta_exp = abs($max_exp - $min_exp)/$num_bins;
886	0						my $cur_exp;
887							my $i;
888	0						my $ref;
889	0						for($i=0; $i < $num_bins; $i++) {
890	0						$cur_exp = $min_exp + $i * $delta_exp;
891	0						$ref = LOG_BASE ** ($cur_exp + $delta_exp/2);
892	0						$logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp;
893	0						$logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp);
894							}
895	0						return $logbins;
896							}
897
898
899							1;