File Coverage

lib/Statistics/Discrete.pm

Criterion	Covered	Total	%
statement	109	487	22.3
branch	19	142	13.3
condition	2	27	7.4
subroutine	17	42	40.4
pod	18	22	81.8
total	165	720	22.9

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2							#
3							# Statistics::Discrete
4							#
5							# Chiara Orsini, CAIDA, UC San Diego
6							# chiara@caida.org
7							#
8							# Copyright (C) 2014 The Regents of the University of California.
9							#
10							# Statistics::Discrete is free software: you can redistribute it and/or modify
11							# it under the terms of the GNU General Public License as published by
12							# the Free Software Foundation, either version 3 of the License, or
13							# (at your option) any later version.
14							#
15							# Statistics::Discrete is distributed in the hope that it will be useful,
16							# but WITHOUT ANY WARRANTY; without even the implied warranty of
17							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18							# GNU General Public License for more details.
19							#
20							# You should have received a copy of the GNU General Public License
21							# along with Statistics::Discrete. If not, see .
22							#
23
24							package Statistics::Discrete;
25
26	1			1		25627	use 5.012;
	1					5
27	1			1		7	use strict;
	1					1
	1					24
28	1			1		5	use warnings;
	1					6
	1					33
29	1			1		13	use List::Util;
	1					2
	1					71
30	1			1		1047	use POSIX;
	1					8864
	1					7
31	1			1		6142	use Storable 'dclone';
	1					5043
	1					110
32
33							our $VERSION = '0.05.00';
34
35							# require Exporter;
36							# our @EXPORT_OK = ( 'NO_BINNING', 'LIN_BINNING', 'LOG_BINNING');
37
38							use constant {
39	1					105	NO_BINNING => 0,
40							LIN_BINNING => 1,
41							LOG_BINNING => 2,
42	1			1		8	};
	1					2
43
44	1			1		5	use constant LOG_BASE => 10;
	1					2
	1					45
45	1			1		5	use constant DEFAULT_BINS_NUMBER => 10;
	1					2
	1					5027
46
47
48							# Constructor
49							sub new {
50	1			1	0	469	my $class = shift;
51	1					2	my $self = {};
52	1					3	$self->{"data_frequency"} = {}; # anonymous hash constructor
53							# binning default values
54	1					3	$self->{"bin-type"} = NO_BINNING;
55	1					3	$self->{"optimal-binning"} = 1;
56							# $self->{"bins"} - bins' structure
57							# $self->{"num-bins"} - num bins when optimal binning is off
58							# on-demand stored stats
59							# $self->{"stats"}{"Desc"} - descriptive statistics
60							# $self->{"stats"}{"Dist"} - distributions
61							# $self->{"stats"}{"Dist"}{"binned"} - binned distributions
62							# bind self and class
63	1					2	bless $self, $class;
64	1					4	return $self;
65							}
66
67							################## Methods to load/add data ##################
68
69							sub add_data {
70	1			1	0	401	my $self = shift;
71	1					3	my @data_to_add = @_;
72	1					2	my $data;
73	1					2	foreach $data(@data_to_add) {
74	10	100				28	if(defined($self->{"data_frequency"}{$data})) {
75	4					9	$self->{"data_frequency"}{$data}++;
76							}
77							else {
78	6					17	$self->{"data_frequency"}{$data} = 1 ;
79							}
80							}
81							# data has changed - stats and bins need to be computed again
82	1					2	delete $self->{"stats"};
83	1					3	delete $self->{"bins"};
84	1					3	return;
85							}
86
87							sub add_data_from_file {
88	0			0	0	0	my $self = shift;
89	0					0	my $filename = shift;
90	0					0	my @cols;
91							my $f;
92	0					0	my $line;
93	0	0				0	open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
94	0					0	while($line = <$f>) {
95	0					0	chomp($line);
96	0	0				0	if($line =~ /^\s*#/) {
97	0					0	next;
98							}
99	0					0	@cols = split /\s/ , $line;
100	0	0				0	if(scalar @cols == 1) { # assume list of values
101	0	0				0	if(defined($self->{"data_frequency"}{$cols[0]})) {
102	0					0	$self->{"data_frequency"}{$cols[0]}++;
103							}
104							else {
105	0					0	$self->{"data_frequency"}{$cols[0]} = 1 ;
106							}
107							}
108	0	0				0	if(scalar @cols == 2) { # assume list of id - value pairs
109	0	0				0	if(defined($self->{"data_frequency"}{$cols[1]})) {
110	0					0	$self->{"data_frequency"}{$cols[1]}++;
111							}
112							else {
113	0					0	$self->{"data_frequency"}{$cols[1]} = 1 ;
114							}
115							}
116							# File format not recognized
117	0	0	0			0	if(scalar @cols != 1 and scalar @cols != 2){
118	0					0	print "File format not recognized\n";
119	0					0	close($f);
120	0					0	delete $self->{"stats"};
121	0					0	delete $self->{"bins"};
122	0					0	return;
123							}
124							}
125	0					0	close($f);
126							# data has changed - stats and bins need to be computed again
127	0					0	delete $self->{"stats"};
128	0					0	delete $self->{"bins"};
129	0					0	return;
130							}
131
132							sub add_data_from_frequencyfile {
133	0			0	0	0	my $self = shift;
134	0					0	my $filename = shift;
135	0					0	my @cols;
136							my $f;
137	0					0	my $line;
138	0	0				0	open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
139	0					0	while($line = <$f>) {
140	0					0	chomp($line);
141	0	0				0	if($line =~ /^\s*#/) {
142	0					0	next;
143							}
144	0					0	@cols = split /\s+/ , $line;
145							#DEBUG print $line . "\t" . @cols . "\n";
146	0	0				0	if(scalar @cols == 2) { # assume list of value - count
147	0	0				0	if(defined($self->{"data_frequency"}{$cols[0]})) {
148	0					0	$self->{"data_frequency"}{$cols[0]} += $cols[1];
149							}
150							else {
151	0					0	$self->{"data_frequency"}{$cols[0]} = $cols[1];
152							}
153	0					0	next;
154							}
155							# File format not recognized
156	0	0				0	if(scalar @cols != 2){
157	0					0	print "File format not recognized\n";
158	0					0	close($f);
159	0					0	delete $self->{"stats"};
160	0					0	delete $self->{"bins"};
161	0					0	return;
162							}
163							}
164	0					0	close($f);
165							# data has changed - stats and bins need to be computed again
166	0					0	delete $self->{"stats"};
167	0					0	delete $self->{"bins"};
168	0					0	return;
169							}
170
171
172							################## Descriptive Statistics ##################
173
174							# Minimum value
175							# compute the minimum value, save the statistic, and return it
176							sub minimum {
177	1			1	1	2	my $self = shift;
178	1	50				5	if(!defined($self->{"stats"}{"Desc"}{"min"})) {
179	1					25	my $count = $self->count();
180	1	50				9	if($count > 0) {
181	1					2	$self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys %{$self->{"data_frequency"}});
	1					15
182							}
183							else {
184	0					0	$self->{"stats"}{"Desc"}{"min"} = 0;
185							}
186							}
187	1					5	return $self->{"stats"}{"Desc"}{"min"};
188							}
189
190
191							# Maximum value
192							# compute the minimum value, save the statistic, and return it
193							sub maximum {
194	1			1	1	2	my $self = shift;
195	1	50				5	if(!defined($self->{"stats"}{"Desc"}{"max"})) {
196	1					3	my $count = $self->count();
197	1	50				3	if($count > 0) {
198	1					2	$self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys %{$self->{"data_frequency"}});
	1					8
199							}
200							else {
201	0					0	$self->{"stats"}{"Desc"}{"max"} = 0;
202							}
203							}
204	1					5	return $self->{"stats"}{"Desc"}{"max"};
205							}
206
207
208							# count the number of samples provided
209							# save the statistic, and return it
210							sub count {
211	6			6	1	15	my $self = shift;
212	6	100				19	if(!defined($self->{"stats"}{"Desc"}{"count"})) {
213	1					2	my $count = 0;
214	1					1	my $data;
215	1					2	foreach $data(keys %{$self->{"data_frequency"}}) {
	1					5
216	6					11	$count += $self->{"data_frequency"}{$data};
217							}
218	1					4	$self->{"stats"}{"Desc"}{"count"} = $count;
219							}
220	6					15	return $self->{"stats"}{"Desc"}{"count"};
221							}
222
223
224							# http://en.wikipedia.org/wiki/Expected_value
225							# the weighted average of the possible values,
226							# using their probabilities as their weights
227							sub mean {
228	2			2	1	5	my $self = shift;
229	2	100				7	if(!defined($self->{"stats"}{"Desc"}{"mean"})) {
230	1					2	my $cumul_value = 0;
231	1					4	my $count = $self->count();
232	1					2	my $v;
233	1					2	foreach $v( keys %{$self->{"data_frequency"}}) {
	1					4
234	6					14	$cumul_value += $v * $self->{"data_frequency"}{$v};
235							}
236	1	50				4	if($count > 0) {
237	1					4	$self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count;
238							}
239							else {
240	0					0	$self->{"stats"}{"Desc"}{"mean"} = 0;
241							}
242							}
243	2					8	return $self->{"stats"}{"Desc"}{"mean"};
244							}
245
246
247
248							# http://en.wikipedia.org/wiki/Median
249							# the median is the numerical value separating
250							# the higher half of a data sample, a population,
251							# or a probability distribution, from the lower half.
252							sub median {
253	1			1	1	2	my $self = shift;
254	1	50				6	if(!defined($self->{"stats"}{"Desc"}{"median"})) {
255	1					3	my $count = $self->count();
256	1					3	my $odd = 0;
257	1	50				4	if($count % 2 != 0) {
258	0					0	$odd = 1;
259							}
260	1					15	my $median_index = floor($count/2) + $odd;
261	1					2	my $current_index = 0;
262
263	1					2	my ($v,$i);
264	1					2	foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) {
	10					17
	1					7
265	6					18	for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) {
266	10					11	$current_index++;
267	10	100				22	if($current_index == $median_index) {
268	1					3	$self->{"stats"}{"Desc"}{"median"} = $v;
269							}
270	10	100	66			45	if($current_index == ($median_index+1) and
271							$odd == 0) {
272	1					5	$self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2;
273							}
274
275							}
276							}
277							}
278	1					5	return $self->{"stats"}{"Desc"}{"median"};
279							}
280
281
282							# http://en.wikipedia.org/wiki/Percentile
283							# A percentile (or a centile) is a measure
284							# used in statistics indicating the value
285							# below which a given percentage of observations
286							# in a group of observations fall.
287							sub percentile {
288	0			0	1	0	my $self = shift;
289	0					0	my $perc = shift;
290	0					0	my $max_probability = $perc/100;
291	0					0	my $cdf = $self->empirical_distribution_function();
292	0					0	my $v = 0;
293	0					0	my $previous_v = 0;
294	0					0	my $epsilon = 0;
295	0					0	foreach $v(sort {$a<=>$b} keys %{$cdf}) {
	0					0
	0					0
296	0					0	$epsilon = $v - $previous_v;
297	0	0				0	if($cdf->{$v} > $max_probability) {
298	0					0	return $v;
299							}
300	0					0	$previous_v = $v;
301							}
302							# if the program is here then $perc == 1
303							# then we return $v+epsilon
304	0					0	return $v+$epsilon;
305							}
306
307
308
309							# http://en.wikipedia.org/wiki/Variance
310							# The variance of a random variable X is
311							# its second central moment, the expected
312							# value of the squared deviation from the mean
313							sub variance {
314	1			1	1	2	my $self = shift;
315	1	50				6	if(!defined($self->{"stats"}{"Desc"}{"variance"})) {
316	1					3	my $mean = $self->mean();
317	1					3	my $count = $self->count();
318	1					2	my $cumul_value = 0;
319	1					2	my $square_mean = 0;
320	1					7	my $v;
321	1					3	foreach $v(keys %{$self->{"data_frequency"}}) {
	1					12
322	6					16	$cumul_value += ($v*2) $self->{"data_frequency"}{$v};
323							}
324	1	50				4	if($count > 0) {
325	1					2	$square_mean = $cumul_value / $count;
326							}
327	1					4	$self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2);
328							}
329	1					17	return $self->{"stats"}{"Desc"}{"variance"};
330							}
331
332
333							# http://en.wikipedia.org/wiki/Standard_deviation
334							# The standard deviation of a random variable
335							# is the square root of its variance.
336							sub standard_deviation {
337	0			0	1		my $self = shift;
338	0	0					if(!defined($self->{"stats"}{"Desc"}{"sdev"})) {
339	0						my $variance = $self->variance();
340	0						$self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance);
341							}
342	0						return $self->{"stats"}{"Desc"}{"sdev"};
343							}
344
345
346							################## Distributions Statistics ##################
347
348							# http://en.wikipedia.org/wiki/Frequency_distribution
349							# the frequency distribution is an arrangement of the values
350							# that one or more variables take in a sample. Each entry in
351							# the table contains the frequency or count of the occurrences
352							# of values within a particular group or interval
353							sub frequency_distribution {
354	0			0	1		my $self = shift;
355	0	0					if($self->count() == 0) {
356	0						print "WARNING: empty dataset, returning empty frequency distribution. \n";
357	0						return (); # empty hash
358							}
359	0	0					if($self->{"bin-type"} == NO_BINNING) {
360							# $self->{"stats"}{"Dist"}{"frequency"}
361	0						return $self->_frequency_distribution();
362							}
363							else {
364							# $self->{"stats"}{"Dist"}{"binned"}{"frequency"}
365	0						return $self->_binned_frequency_distribution();
366							}
367							}
368
369
370							# http://en.wikipedia.org/wiki/Probability_mass_function
371							# the probability mass function (pmf) is a function that gives
372							# the probability that a discrete random variable is exactly
373							# equal to some value.
374							sub probability_mass_function {
375	0			0	1		my $self = shift;
376	0	0					if($self->count() == 0) {
377	0						print "WARNING: empty dataset, returning empty probability mass function. \n";
378	0						return (); # empty hash
379							}
380	0	0					if($self->{"bin-type"} == NO_BINNING) {
381							# $self->{"stats"}{"Dist"}{"pmf"}
382	0						return $self->_probability_mass_function();
383							}
384							else {
385							# $self->{"stats"}{"Dist"}{"binned"}{"pmf"}
386	0						return $self->_binned_probability_mass_function();
387							}
388							}
389
390
391							# http://en.wikipedia.org/wiki/Empirical_distribution_function
392							# the empirical distribution function, or empirical cdf, is the
393							# cumulative distribution function associated with the empirical
394							# measure of the sample. This cdf is a step function that jumps up
395							# by 1/n at each of the n data points.
396							sub empirical_distribution_function {
397	0			0	1		my $self = shift;
398	0	0					if($self->count() == 0) {
399	0						print "WARNING: empty dataset, returning empty emptirical distribution function. \n";
400	0						return (); # empty hash
401							}
402	0	0					if($self->{"bin-type"} == NO_BINNING) {
403							# $self->{"stats"}{"Dist"}{"cdf"}
404	0						return $self->_empirical_distribution_function();
405							}
406							else {
407							# $self->{"stats"}{"Dist"}{"binned"}{"cdf"}
408	0						return $self->_binned_empirical_distribution_function();
409							}
410							}
411
412
413							# http://en.wikipedia.org/wiki/CCDF
414							# how often the random variable is above a particular level.
415							sub complementary_cumulative_distribution_function {
416	0			0	1		my $self = shift;
417	0	0					if($self->count() == 0) {
418	0						print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n";
419	0						return (); # empty hash
420							}
421	0	0					if($self->{"bin-type"} == NO_BINNING) {
422							# $self->{"stats"}{"Dist"}{"ccdf"}
423	0						return $self->_complementary_cumulative_distribution_function();
424							}
425							else {
426							# $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}
427	0						return $self->_binned_complementary_cumulative_distribution_function();
428							}
429							}
430
431
432
433							################## Regular Distributions Statistics ##################
434
435
436							sub _frequency_distribution {
437	0			0			my $self = shift;
438	0	0					if(!defined($self->{"stats"}{"Dist"}{"frequency"})) {
439	0						$self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"};
440							}
441	0						return $self->{"stats"}{"Dist"}{"frequency"};
442							}
443
444
445							sub _probability_mass_function {
446	0			0			my $self = shift;
447	0	0					if(!defined($self->{"stats"}{"Dist"}{"pmf"})) {
448	0						my $count = $self->count();
449	0						$self->_frequency_distribution();
450	0						my ($val,$freq);
451	0						foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
452	0						$freq = $self->{"stats"}{"Dist"}{"frequency"}{$val};
453	0						$self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count;
454							}
455							}
456	0						return $self->{"stats"}{"Dist"}{"pmf"};
457							}
458
459
460							sub _empirical_distribution_function {
461	0			0			my $self = shift;
462	0	0					if(!defined($self->{"stats"}{"Dist"}{"cdf"})) {
463	0						my $count = $self->count();
464	0						$self->_frequency_distribution();
465	0						my $val;
466	0						my $cumul_freq = 0;
467	0						foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
	0
468	0						$cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val};
469	0						$self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count;
470							}
471							}
472	0						return $self->{"stats"}{"Dist"}{"cdf"};
473							}
474
475
476							sub _complementary_cumulative_distribution_function {
477	0			0			my $self = shift;
478							# warning, this creates self->{"stats"} if it doesn't exist
479	0	0					if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) {
480	0						my $count = $self->count();
481	0						$self->frequency_distribution();
482	0						my $val;
483	0						my $cumul_freq = $count;
484	0						foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
	0
	0
485	0						$cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val};
486	0						$self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count;
487							}
488							}
489	0						return $self->{"stats"}{"Dist"}{"ccdf"};
490							}
491
492
493
494							################## Binned Distributions Statistics ##################
495
496
497							sub _binned_frequency_distribution {
498	0			0			my $self = shift;
499	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) {
500							# 1. get bins
501	0						$self->bins(); # set $self->{"bins"}
502							# 2. get regular frequency distribution
503	0						$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
504							# 3. compute binned property
505	0						my $binned_frequency = {};
506	0						my $i = 0; # vals iterator
507	0						my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}});
	0
	0
508	0						my $num_vals = scalar @sorted_vals;
509	0						my $bi = 0; # bins iterator
510	0						my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}});
	0
	0
511	0						my $num_bins = scalar @sorted_bins;
512	0						my $freq_sum = 0;
513	0						for(; $bi < $num_bins; $bi++) { # for each bin
514	0						for(; $i < $num_vals; $i++) {
515	0	0					if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) {
516	0						$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]}
517							}
518							else {
519	0						$binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
520	0						$freq_sum = 0;
521	0						last;
522							}
523							}
524
525							# if last element of vals then we save the current freq sum
526							# in the current bin
527	0	0					if($i == $num_vals) {
528	0						$binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
529							}
530							}
531	0						$self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency;
532							}
533	0						return $self->{"stats"}{"Dist"}{"binned"}{"frequency"};
534							}
535
536
537							sub _binned_probability_mass_function {
538	0			0			my $self = shift;
539	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) {
540	0						my $count = $self->count();
541	0						my $bins = $self->bins(); # set $self->{"bins"}
542	0						my $bfd = $self->_binned_frequency_distribution();
543	0						my ($val,$freq, $interval);
544	0						foreach $val(keys %{$bfd}) {
	0
545	0						$freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val};
546	0						$interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"};
547	0	0					if($interval > 0) {
548	0						$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval;
549							}
550							else {
551	0						$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count;
552							}
553							}
554							}
555	0						return $self->{"stats"}{"Dist"}{"binned"}{"pmf"};
556							}
557
558
559							sub _binned_empirical_distribution_function {
560	0			0			my $self = shift;
561	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) {
562	0						my $bpmf = $self->_binned_probability_mass_function();
563	0						my $val;
564	0						my $cumul_prob = 0;
565	0						foreach $val(sort {$a<=>$b} keys %{$bpmf}) {
	0
	0
566	0						$cumul_prob += $bpmf->{$val};
567	0						$self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob;
568							}
569							}
570	0						return $self->{"stats"}{"Dist"}{"binned"}{"cdf"};
571							}
572
573
574							sub _binned_complementary_cumulative_distribution_function {
575	0			0			my $self = shift;
576	0	0					if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) {
577	0						my $bpmf = $self->_binned_probability_mass_function();
578	0						my $val;
579	0						my $cumul_prob = 0;
580	0						my $old_cumul_prob = 0;
581	0						foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort
	0
	0
582	0						$old_cumul_prob = $cumul_prob;
583	0						$cumul_prob += $bpmf->{$val};
584	0						$self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob;
585							}
586							}
587	0						return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"};
588							}
589
590
591							################## Binned Related Functions ##################
592
593
594							sub set_binning_type {
595	0			0	1		my $self = shift;
596	0						my $bin_type = shift;
597							# check bin-type validity
598	0	0	0				if($bin_type != NO_BINNING && $bin_type != LIN_BINNING &&
			0
599							$bin_type != LOG_BINNING) {
600	0						die "Wrong binning type provided\n";
601							}
602							# if bin type is already set
603	0	0					if($bin_type == $self->{"bin-type"}) {
604	0						return;
605							}
606							# if bin-type is different
607	0						$self->{"bin-type"} = $bin_type;
608	0						delete $self->{"num-bins"};
609	0						delete $self->{"bins"};
610	0						delete $self->{"stats"}{"Dist"}{"binned"};
611							}
612
613
614							sub set_optimal_binning {
615	0			0	1		my $self = shift;
616	0	0					if( $self->{"optimal-binning"} == 0) {
617	0						delete $self->{"bins"};
618	0						delete $self->{"num-bins"};
619	0						delete $self->{"stats"}{"Dist"}{"binned"};
620							}
621	0						$self->{"optimal-binning"} = 1;
622							}
623
624
625							sub set_custom_num_bins {
626	0			0	1		my $self = shift;
627	0						my $nb = shift;
628	0	0					if($self->{"bin-type"} == NO_BINNING) {
629	0						die "Set a binning type before setting the num of bins\n";
630							}
631	0	0					if( $nb < 2 ) {
632	0						die "Wrong number of bins provided: " . $nb . "\n";
633							}
634							# check if there is a change
635	0	0	0				if( $self->{"optimal-binning"} == 0 and
			0
636							defined ($self->{"num-bins"}) and
637							$self->{"num-bins"} == $nb) {
638							# nothing changed -> nothing to do
639							}
640							else {
641							# update binning parameters
642	0						$self->{"optimal-binning"} = 0;
643	0						$self->{"num-bins"} = $nb;
644	0						delete $self->{"stats"}{"Dist"}{"binned"};
645	0						delete $self->{"bins"};
646							}
647							}
648
649
650							sub bins {
651	0			0	1		my $self = shift;
652	0	0					if($self->count() == 0) {
653	0						print "WARNING: empty dataset, returning empty bins. \n";
654	0						return {}; # empty array
655							}
656
657	0	0					if(!defined($self->{"bins"})) {
658							# CASE 1: if no_binning is set we do not
659							# save the binning structure
660	0	0					if( $self->{"bin-type"} == NO_BINNING) {
661	0						my $curbins = {};
662	0						my $v;
663	0						foreach $v(keys %{$self->{"data_frequency"}}) {
	0
664	0						my $ref = $v;
665	0						$curbins->{$v}{"left"} = $v;
666	0						$curbins->{$v}{"right"} = $v;
667							}
668	0						return $curbins;
669							}
670
671							# CASE 2: if a custom number of bins is
672							# provided, we lin-/log- bin the support
673							# accordingly
674	0	0					if($self->{"optimal-binning"} == 0) {
675	0	0					if($self->{"bin-type"} == LIN_BINNING) {
676	0						$self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"});
677							}
678	0	0					if($self->{"bin-type"} == LOG_BINNING) {
679	0						$self->{"bins"} = $self->compute_log_bins($self->{"num-bins"});
680							}
681							}
682
683							# CASE 3: optimal binning
684							else {
685	0						my $res = $self->_optimal_binning();
686	0	0					if($res != 0) {
687	0						print "Optimal binning was not possible on this sample, ";
688	0						print "using the NO_BINNING mode" . "\n";
689	0						my $curbins = {};
690	0						my $v;
691	0						foreach $v(keys %{$self->{"data_frequency"}}) {
	0
692	0						my $ref = $v;
693	0						$curbins->{$v}{"left"} = $v;
694	0						$curbins->{$v}{"right"} = $v;
695							}
696	0						$self->{"bin-type"} = NO_BINNING;
697	0						$self->{"bins"} = $curbins;
698							}
699							}
700							}
701	0						return $self->{"bins"};
702							}
703
704
705							################## Optimal Binning Related Functions ##################
706
707							# Compute the optimal binning
708							sub _optimal_binning {
709	0			0			my $self = shift;
710							# LEGEND:
711							# b_* => bins
712							# f_* => binned frequency distribution
713							# d_* => binned density distribution
714							# s_* => sum of density distribution values
715							#
716							# start from extreme values ( min_num_bins and max_num_bins)
717							# and use the bisection method on the interval
718							# till the optimal is found (i.e. the binning such that
719							# the sum of the density function values is the closest
720							# to 1)
721	0						my $min_num_bins = 2;
722	0						my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins);
723	0						my $max_num_bins = scalar $self->_full_support();
724	0	0					if($max_num_bins == 1) {
725	0						return -1;
726							}
727	0						my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins);
728							# initial check
729	0	0	0				if( $s_min > 1 or $s_max < 1) {
730	0						return -1;
731							}
732	0						my $avg_num_bins;
733	0						my ($b_avg, $f_avg, $d_avg, $s_avg);
734	0						while( ($max_num_bins - $min_num_bins) > 1) {
735	0						$avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2));
736	0						($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins);
737	0	0					if($s_avg < 1) {
738	0						$min_num_bins = $avg_num_bins;
739	0						($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg);
740							}
741							else {
742	0						$max_num_bins = $avg_num_bins;
743	0						($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg);
744							}
745							}
746							# the binning whose density sum is closer to 1 win
747	0	0					if( (1 - $s_min) < ($s_max-1) ) {
748	0						$self->{"bins"} = $b_min;
749	0						$self->{"Dist"}{"binned"}{"frequency"} = $f_min;
750	0						$self->{"Dist"}{"binned"}{"pmf"} = $d_min;
751							}
752							else {
753	0						$self->{"bins"} = $b_max;
754	0						$self->{"Dist"}{"binned"}{"frequency"} = $f_max;
755	0						$self->{"Dist"}{"binned"}{"pmf"} = $d_max;
756							}
757	0						return 0;
758							}
759
760
761							# bin the current data into $num_bins intervals
762							# and return: the bins, the binned frequency distribution
763							# the binned density distribution and the sum of the
764							# density values
765							sub _bin_attempt {
766	0			0			my $self = shift;
767	0						my $num_bins = shift;
768							# output values
769	0						my ($cur_bins, $cur_freq, $cur_density, $cur_sum);
770	0						$cur_sum = 0;
771							# get the bins
772	0	0					if( $self->{"bin-type"} == LIN_BINNING ) {
773	0						$cur_bins = $self->compute_lin_bins($num_bins);
774							}
775	0	0					if( $self->{"bin-type"} == LOG_BINNING ) {
776	0						$cur_bins =$self->compute_log_bins($num_bins);
777							}
778							# assert non binned frequency distribution is computed
779	0						$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
780	0						my $count = $self->count();
781							#
782	0						my $s = 0; # support iterator
783	0						my @sorted_support = (sort {$a<=>$b} $self->_full_support());
	0
784	0						my $sup_size = scalar @sorted_support;
785	0						my $bi = 0; # bins iterator
786	0						my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins});
	0
	0
787	0						my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins)
788	0						my $ref = 0;
789	0						my $freq_sum = 0;
790	0						my $interval = 0;
791	0						for(; $bi < $n_bins; $bi++) { # for each bin
792	0						$ref = $sorted_bins[$bi];
793	0						$interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"};
794	0						for(; $s < $sup_size; $s++) { # for each support value
795							# if the current support value is less than the rightmost value in the bin
796	0	0					if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) {
797	0						$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]};
798							}
799							# else we have to save the information for the current bin
800							else {
801	0						$cur_freq->{$ref} = $freq_sum;
802	0	0					if($interval > 0) {
803	0						$cur_density->{$ref} = ($freq_sum / $count) / $interval;
804							}
805							else {
806	0						$cur_density->{$ref} = $freq_sum / $count;
807							}
808	0						$cur_sum += $cur_density->{$ref};
809	0						$freq_sum = 0;
810	0						last; # we do not increment s
811							}
812							}
813							# if last element of the support has been reached
814							# we have to save the current freq sum in the current bin
815	0	0					if($s == $sup_size) {
816	0						$cur_freq->{$ref} = $freq_sum;
817	0	0					if($interval > 0) {
818	0						$cur_density->{$ref} = ($freq_sum / $count) / $interval;
819							}
820							else {
821	0						$cur_density->{$ref} = $freq_sum / $count;
822							}
823	0						$cur_sum += $cur_density->{$ref};
824							}
825							}
826							#DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n";
827	0						return ($cur_bins, $cur_freq, $cur_density, $cur_sum);
828							}
829
830
831
832							sub _full_support {
833	0			0			my $self = shift;
834	0						my $support = {};
835	0						my $v;
836	0						foreach $v( keys %{$self->{"data_frequency"}}) {
	0
837	0						$support->{$v} = 1;
838							}
839	0						return (keys %{$support});
	0
840							}
841
842
843
844
845							################## Utilities ##################
846
847							# return a structure describing a linear binning
848							# of the current data ($num_bins provided)
849							sub compute_lin_bins {
850	0			0	1		my $self = shift;
851	0						my $num_bins = shift;
852	0						my $min = $self->minimum();
853	0						my $max = $self->maximum();
854	0						my $linbins = {}; # reference to empty hash
855							# last bin includes just the maximum value
856	0						my $delta = abs($max - $min)/($num_bins-1);
857	0						my $i;
858							my $ref;
859	0						for($i=0; $i < $num_bins; $i++) {
860	0						$ref = $min + $i * $delta + $delta/2;
861	0						$linbins->{$ref}{"left"} = $min + $i * $delta;
862	0						$linbins->{$ref}{"right"} = $min + $i * $delta + $delta;
863							}
864	0						return $linbins;
865							}
866
867
868							# return a structure describing a logarithmic binning
869							# of the current data ($num_bins provided)
870							# bins are not set
871							sub compute_log_bins {
872	0			0	1		my $self = shift;
873	0						my $num_bins = shift;
874	0						my $min = $self->minimum();
875	0						my $max = $self->maximum();
876	0	0	0				if($min == 0 or $max == 0 or $min *$max <0) {
			0
877							# if the interval contains zero, then we cannot logbin
878							# we return the linear binning
879	0						print "Logarithmic binning was not possible on this sample, ";
880	0						print "switching to linear binning mode" . "\n";
881	0						$self->{"bin-type"} = LIN_BINNING;
882	0						return $self->compute_lin_bins($num_bins);
883							}
884	0						my $logbins = {}; # reference to empty hash
885	0						my $min_exp = log($min)/log(LOG_BASE);
886	0						my $max_exp = log($max+1)/log(LOG_BASE);
887	0						my $delta_exp = abs($max_exp - $min_exp)/$num_bins;
888	0						my $cur_exp;
889							my $i;
890	0						my $ref;
891	0						for($i=0; $i < $num_bins; $i++) {
892	0						$cur_exp = $min_exp + $i * $delta_exp;
893	0						$ref = LOG_BASE ** ($cur_exp + $delta_exp/2);
894	0						$logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp;
895	0						$logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp);
896							}
897	0						return $logbins;
898							}
899
900
901							1;