File Coverage

blib/lib/Statistics/Discrete.pm
Criterion Covered Total %
statement 113 491 23.0
branch 19 142 13.3
condition 2 27 7.4
subroutine 19 44 43.1
pod 18 22 81.8
total 171 726 23.5


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             #
3             # Statistics::Discrete
4             #
5             # Chiara Orsini, CAIDA, UC San Diego
6             # chiara@caida.org
7             #
8             # Copyright (C) 2014 The Regents of the University of California.
9             #
10             # Statistics::Discrete is free software: you can redistribute it and/or modify
11             # it under the terms of the GNU General Public License as published by
12             # the Free Software Foundation, either version 3 of the License, or
13             # (at your option) any later version.
14             #
15             # Statistics::Discrete is distributed in the hope that it will be useful,
16             # but WITHOUT ANY WARRANTY; without even the implied warranty of
17             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18             # GNU General Public License for more details.
19             #
20             # You should have received a copy of the GNU General Public License
21             # along with Statistics::Discrete. If not, see .
22             #
23              
24             package Statistics::Discrete;
25              
26              
27 1     1   33479 use 5.014002;
  1         4  
28 1     1   4 use strict;
  1         2  
  1         23  
29 1     1   4 use warnings;
  1         5  
  1         34  
30 1     1   5 use List::Util;
  1         1  
  1         70  
31 1     1   754 use POSIX;
  1         7200  
  1         4  
32 1     1   4496 use Storable 'dclone';
  1         6036  
  1         100  
33              
34              
35              
36             our $VERSION = '0.05';
37              
38 1     1   8 use constant NO_BINNING => 0;
  1         2  
  1         83  
39 1     1   5 use constant LIN_BINNING => 1;
  1         2  
  1         39  
40 1     1   4 use constant LOG_BINNING => 2;
  1         3  
  1         42  
41              
42 1     1   5 use constant LOG_BASE => 10;
  1         2  
  1         36  
43 1     1   4 use constant DEFAULT_BINS_NUMBER => 10;
  1         2  
  1         5092  
44              
45              
46             # Constructor
47             sub new {
48 1     1 0 420 my $class = shift;
49 1         2 my $self = {};
50 1         4 $self->{"data_frequency"} = {}; # anonymous hash constructor
51             # binning default values
52 1         3 $self->{"bin-type"} = NO_BINNING;
53 1         2 $self->{"optimal-binning"} = 1;
54             # $self->{"bins"} - bins' structure
55             # $self->{"num-bins"} - num bins when optimal binning is off
56             # on-demand stored stats
57             # $self->{"stats"}{"Desc"} - descriptive statistics
58             # $self->{"stats"}{"Dist"} - distributions
59             # $self->{"stats"}{"Dist"}{"binned"} - binned distributions
60             # bind self and class
61 1         2 bless $self, $class;
62 1         3 return $self;
63             }
64              
65             ################## Methods to load/add data ##################
66              
67             sub add_data {
68 1     1 0 685 my $self = shift;
69 1         4 my @data_to_add = @_;
70 1         1 my $data;
71 1         3 foreach $data(@data_to_add) {
72 10 100       28 if(defined($self->{"data_frequency"}{$data})) {
73 4         6 $self->{"data_frequency"}{$data}++;
74             }
75             else {
76 6         20 $self->{"data_frequency"}{$data} = 1 ;
77             }
78             }
79             # data has changed - stats and bins need to be computed again
80 1         3 delete $self->{"stats"};
81 1         2 delete $self->{"bins"};
82 1         3 return;
83             }
84              
85             sub add_data_from_file {
86 0     0 0 0 my $self = shift;
87 0         0 my $filename = shift;
88 0         0 my @cols;
89             my $f;
90 0         0 my $line;
91 0 0       0 open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
92 0         0 while($line = <$f>) {
93 0         0 chomp($line);
94 0 0       0 if($line =~ /^\s*#/) {
95 0         0 next;
96             }
97 0         0 @cols = split /\s/ , $line;
98 0 0       0 if(scalar @cols == 1) { # assume list of values
99 0 0       0 if(defined($self->{"data_frequency"}{$cols[0]})) {
100 0         0 $self->{"data_frequency"}{$cols[0]}++;
101             }
102             else {
103 0         0 $self->{"data_frequency"}{$cols[0]} = 1 ;
104             }
105             }
106 0 0       0 if(scalar @cols == 2) { # assume list of id - value pairs
107 0 0       0 if(defined($self->{"data_frequency"}{$cols[1]})) {
108 0         0 $self->{"data_frequency"}{$cols[1]}++;
109             }
110             else {
111 0         0 $self->{"data_frequency"}{$cols[1]} = 1 ;
112             }
113             }
114             # File format not recognized
115 0 0 0     0 if(scalar @cols != 1 and scalar @cols != 2){
116 0         0 print "File format not recognized\n";
117 0         0 close($f);
118 0         0 delete $self->{"stats"};
119 0         0 delete $self->{"bins"};
120 0         0 return;
121             }
122             }
123 0         0 close($f);
124             # data has changed - stats and bins need to be computed again
125 0         0 delete $self->{"stats"};
126 0         0 delete $self->{"bins"};
127 0         0 return;
128             }
129              
130             sub add_data_from_frequencyfile {
131 0     0 0 0 my $self = shift;
132 0         0 my $filename = shift;
133 0         0 my @cols;
134             my $f;
135 0         0 my $line;
136 0 0       0 open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
137 0         0 while($line = <$f>) {
138 0         0 chomp($line);
139 0 0       0 if($line =~ /^\s*#/) {
140 0         0 next;
141             }
142 0         0 @cols = split /\s+/ , $line;
143             #DEBUG print $line . "\t" . @cols . "\n";
144 0 0       0 if(scalar @cols == 2) { # assume list of value - count
145 0 0       0 if(defined($self->{"data_frequency"}{$cols[0]})) {
146 0         0 $self->{"data_frequency"}{$cols[0]} += $cols[1];
147             }
148             else {
149 0         0 $self->{"data_frequency"}{$cols[0]} = $cols[1];
150             }
151 0         0 next;
152             }
153             # File format not recognized
154 0 0       0 if(scalar @cols != 2){
155 0         0 print "File format not recognized\n";
156 0         0 close($f);
157 0         0 delete $self->{"stats"};
158 0         0 delete $self->{"bins"};
159 0         0 return;
160             }
161             }
162 0         0 close($f);
163             # data has changed - stats and bins need to be computed again
164 0         0 delete $self->{"stats"};
165 0         0 delete $self->{"bins"};
166 0         0 return;
167             }
168              
169              
170             ################## Descriptive Statistics ##################
171              
172             # Minimum value
173             # compute the minimum value, save the statistic, and return it
174             sub minimum {
175 1     1 1 9 my $self = shift;
176 1 50       6 if(!defined($self->{"stats"}{"Desc"}{"min"})) {
177 1         3 my $count = $self->count();
178 1 50       5 if($count > 0) {
179 1         15 $self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys $self->{"data_frequency"});
180             }
181             else {
182 0         0 $self->{"stats"}{"Desc"}{"min"} = 0;
183             }
184             }
185 1         6 return $self->{"stats"}{"Desc"}{"min"};
186             }
187              
188              
189             # Maximum value
190             # compute the minimum value, save the statistic, and return it
191             sub maximum {
192 1     1 1 2 my $self = shift;
193 1 50       6 if(!defined($self->{"stats"}{"Desc"}{"max"})) {
194 1         2 my $count = $self->count();
195 1 50       4 if($count > 0) {
196 1         10 $self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys $self->{"data_frequency"});
197             }
198             else {
199 0         0 $self->{"stats"}{"Desc"}{"max"} = 0;
200             }
201             }
202 1         5 return $self->{"stats"}{"Desc"}{"max"};
203             }
204              
205              
206             # count the number of samples provided
207             # save the statistic, and return it
208             sub count {
209 6     6 1 12 my $self = shift;
210 6 100       18 if(!defined($self->{"stats"}{"Desc"}{"count"})) {
211 1         3 my $count = 0;
212 1         1 my $data;
213 1         2 foreach $data(keys %{$self->{"data_frequency"}}) {
  1         7  
214 6         10 $count += $self->{"data_frequency"}{$data};
215             }
216 1         4 $self->{"stats"}{"Desc"}{"count"} = $count;
217             }
218 6         15 return $self->{"stats"}{"Desc"}{"count"};
219             }
220              
221              
222             # http://en.wikipedia.org/wiki/Expected_value
223             # the weighted average of the possible values,
224             # using their probabilities as their weights
225             sub mean {
226 2     2 1 5 my $self = shift;
227 2 100       8 if(!defined($self->{"stats"}{"Desc"}{"mean"})) {
228 1         1 my $cumul_value = 0;
229 1         3 my $count = $self->count();
230 1         2 my $v;
231 1         2 foreach $v( keys %{$self->{"data_frequency"}}) {
  1         15  
232 6         13 $cumul_value += $v * $self->{"data_frequency"}{$v};
233             }
234 1 50       20 if($count > 0) {
235 1         4 $self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count;
236             }
237             else {
238 0         0 $self->{"stats"}{"Desc"}{"mean"} = 0;
239             }
240             }
241 2         7 return $self->{"stats"}{"Desc"}{"mean"};
242             }
243              
244              
245              
246             # http://en.wikipedia.org/wiki/Median
247             # the median is the numerical value separating
248             # the higher half of a data sample, a population,
249             # or a probability distribution, from the lower half.
250             sub median {
251 1     1 1 3 my $self = shift;
252 1 50       10 if(!defined($self->{"stats"}{"Desc"}{"median"})) {
253 1         4 my $count = $self->count();
254 1         2 my $odd = 0;
255 1 50       6 if($count % 2 != 0) {
256 0         0 $odd = 1;
257             }
258 1         15 my $median_index = floor($count/2) + $odd;
259 1         2 my $current_index = 0;
260              
261 1         1 my ($v,$i);
262 1         2 foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) {
  8         13  
  1         9  
263 6         14 for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) {
264 10         9 $current_index++;
265 10 100       22 if($current_index == $median_index) {
266 1         3 $self->{"stats"}{"Desc"}{"median"} = $v;
267             }
268 10 100 66     44 if($current_index == ($median_index+1) and
269             $odd == 0) {
270 1         5 $self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2;
271             }
272              
273             }
274             }
275             }
276 1         5 return $self->{"stats"}{"Desc"}{"median"};
277             }
278              
279              
280             # http://en.wikipedia.org/wiki/Percentile
281             # A percentile (or a centile) is a measure
282             # used in statistics indicating the value
283             # below which a given percentage of observations
284             # in a group of observations fall.
285             sub percentile {
286 0     0 1 0 my $self = shift;
287 0         0 my $perc = shift;
288 0         0 my $max_probability = $perc/100;
289 0         0 my $cdf = $self->empirical_distribution_function();
290 0         0 my $v = 0;
291 0         0 my $previous_v = 0;
292 0         0 my $epsilon = 0;
293 0         0 foreach $v(sort {$a<=>$b} keys %{$cdf}) {
  0         0  
  0         0  
294 0         0 $epsilon = $v - $previous_v;
295 0 0       0 if($cdf->{$v} > $max_probability) {
296 0         0 return $v;
297             }
298 0         0 $previous_v = $v;
299             }
300             # if the program is here then $perc == 1
301             # then we return $v+epsilon
302 0         0 return $v+$epsilon;
303             }
304              
305              
306              
307             # http://en.wikipedia.org/wiki/Variance
308             # The variance of a random variable X is
309             # its second central moment, the expected
310             # value of the squared deviation from the mean
311             sub variance {
312 1     1 1 1 my $self = shift;
313 1 50       5 if(!defined($self->{"stats"}{"Desc"}{"variance"})) {
314 1         3 my $mean = $self->mean();
315 1         3 my $count = $self->count();
316 1         2 my $cumul_value = 0;
317 1         2 my $square_mean = 0;
318 1         1 my $v;
319 1         7 foreach $v(keys %{$self->{"data_frequency"}}) {
  1         4  
320 6         13 $cumul_value += ($v**2) * $self->{"data_frequency"}{$v};
321             }
322 1 50       4 if($count > 0) {
323 1         2 $square_mean = $cumul_value / $count;
324             }
325 1         4 $self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2);
326             }
327 1         19 return $self->{"stats"}{"Desc"}{"variance"};
328             }
329              
330              
331             # http://en.wikipedia.org/wiki/Standard_deviation
332             # The standard deviation of a random variable
333             # is the square root of its variance.
334             sub standard_deviation {
335 0     0 1   my $self = shift;
336 0 0         if(!defined($self->{"stats"}{"Desc"}{"sdev"})) {
337 0           my $variance = $self->variance();
338 0           $self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance);
339             }
340 0           return $self->{"stats"}{"Desc"}{"sdev"};
341             }
342              
343              
344             ################## Distributions Statistics ##################
345              
346             # http://en.wikipedia.org/wiki/Frequency_distribution
347             # the frequency distribution is an arrangement of the values
348             # that one or more variables take in a sample. Each entry in
349             # the table contains the frequency or count of the occurrences
350             # of values within a particular group or interval
351             sub frequency_distribution {
352 0     0 1   my $self = shift;
353 0 0         if($self->count() == 0) {
354 0           print "WARNING: empty dataset, returning empty frequency distribution. \n";
355 0           return (); # empty hash
356             }
357 0 0         if($self->{"bin-type"} == NO_BINNING) {
358             # $self->{"stats"}{"Dist"}{"frequency"}
359 0           return $self->_frequency_distribution();
360             }
361             else {
362             # $self->{"stats"}{"Dist"}{"binned"}{"frequency"}
363 0           return $self->_binned_frequency_distribution();
364             }
365             }
366              
367              
368             # http://en.wikipedia.org/wiki/Probability_mass_function
369             # the probability mass function (pmf) is a function that gives
370             # the probability that a discrete random variable is exactly
371             # equal to some value.
372             sub probability_mass_function {
373 0     0 1   my $self = shift;
374 0 0         if($self->count() == 0) {
375 0           print "WARNING: empty dataset, returning empty probability mass function. \n";
376 0           return (); # empty hash
377             }
378 0 0         if($self->{"bin-type"} == NO_BINNING) {
379             # $self->{"stats"}{"Dist"}{"pmf"}
380 0           return $self->_probability_mass_function();
381             }
382             else {
383             # $self->{"stats"}{"Dist"}{"binned"}{"pmf"}
384 0           return $self->_binned_probability_mass_function();
385             }
386             }
387              
388              
389             # http://en.wikipedia.org/wiki/Empirical_distribution_function
390             # the empirical distribution function, or empirical cdf, is the
391             # cumulative distribution function associated with the empirical
392             # measure of the sample. This cdf is a step function that jumps up
393             # by 1/n at each of the n data points.
394             sub empirical_distribution_function {
395 0     0 1   my $self = shift;
396 0 0         if($self->count() == 0) {
397 0           print "WARNING: empty dataset, returning empty emptirical distribution function. \n";
398 0           return (); # empty hash
399             }
400 0 0         if($self->{"bin-type"} == NO_BINNING) {
401             # $self->{"stats"}{"Dist"}{"cdf"}
402 0           return $self->_empirical_distribution_function();
403             }
404             else {
405             # $self->{"stats"}{"Dist"}{"binned"}{"cdf"}
406 0           return $self->_binned_empirical_distribution_function();
407             }
408             }
409              
410              
411             # http://en.wikipedia.org/wiki/CCDF
412             # how often the random variable is above a particular level.
413             sub complementary_cumulative_distribution_function {
414 0     0 1   my $self = shift;
415 0 0         if($self->count() == 0) {
416 0           print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n";
417 0           return (); # empty hash
418             }
419 0 0         if($self->{"bin-type"} == NO_BINNING) {
420             # $self->{"stats"}{"Dist"}{"ccdf"}
421 0           return $self->_complementary_cumulative_distribution_function();
422             }
423             else {
424             # $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}
425 0           return $self->_binned_complementary_cumulative_distribution_function();
426             }
427             }
428              
429              
430              
431             ################## Regular Distributions Statistics ##################
432              
433              
434             sub _frequency_distribution {
435 0     0     my $self = shift;
436 0 0         if(!defined($self->{"stats"}{"Dist"}{"frequency"})) {
437 0           $self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"};
438             }
439 0           return $self->{"stats"}{"Dist"}{"frequency"};
440             }
441              
442              
443             sub _probability_mass_function {
444 0     0     my $self = shift;
445 0 0         if(!defined($self->{"stats"}{"Dist"}{"pmf"})) {
446 0           my $count = $self->count();
447 0           $self->_frequency_distribution();
448 0           my ($val,$freq);
449 0           foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
450 0           $freq = $self->{"stats"}{"Dist"}{"frequency"}{$val};
451 0           $self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count;
452             }
453             }
454 0           return $self->{"stats"}{"Dist"}{"pmf"};
455             }
456              
457              
458             sub _empirical_distribution_function {
459 0     0     my $self = shift;
460 0 0         if(!defined($self->{"stats"}{"Dist"}{"cdf"})) {
461 0           my $count = $self->count();
462 0           $self->_frequency_distribution();
463 0           my $val;
464 0           my $cumul_freq = 0;
465 0           foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
  0            
466 0           $cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val};
467 0           $self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count;
468             }
469             }
470 0           return $self->{"stats"}{"Dist"}{"cdf"};
471             }
472              
473              
474             sub _complementary_cumulative_distribution_function {
475 0     0     my $self = shift;
476             # warning, this creates self->{"stats"} if it doesn't exist
477 0 0         if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) {
478 0           my $count = $self->count();
479 0           $self->frequency_distribution();
480 0           my $val;
481 0           my $cumul_freq = $count;
482 0           foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
  0            
483 0           $cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val};
484 0           $self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count;
485             }
486             }
487 0           return $self->{"stats"}{"Dist"}{"ccdf"};
488             }
489              
490              
491              
492             ################## Binned Distributions Statistics ##################
493              
494              
495             sub _binned_frequency_distribution {
496 0     0     my $self = shift;
497 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) {
498             # 1. get bins
499 0           $self->bins(); # set $self->{"bins"}
500             # 2. get regular frequency distribution
501 0           $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
502             # 3. compute binned property
503 0           my $binned_frequency = {};
504 0           my $i = 0; # vals iterator
505 0           my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}});
  0            
  0            
506 0           my $num_vals = scalar @sorted_vals;
507 0           my $bi = 0; # bins iterator
508 0           my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}});
  0            
  0            
509 0           my $num_bins = scalar @sorted_bins;
510 0           my $freq_sum = 0;
511 0           for(; $bi < $num_bins; $bi++) { # for each bin
512 0           for(; $i < $num_vals; $i++) {
513 0 0         if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) {
514 0           $freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]}
515             }
516             else {
517 0           $binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
518 0           $freq_sum = 0;
519 0           last;
520             }
521             }
522              
523             # if last element of vals then we save the current freq sum
524             # in the current bin
525 0 0         if($i == $num_vals) {
526 0           $binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
527             }
528             }
529 0           $self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency;
530             }
531 0           return $self->{"stats"}{"Dist"}{"binned"}{"frequency"};
532             }
533              
534              
535             sub _binned_probability_mass_function {
536 0     0     my $self = shift;
537 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) {
538 0           my $count = $self->count();
539 0           my $bins = $self->bins(); # set $self->{"bins"}
540 0           my $bfd = $self->_binned_frequency_distribution();
541 0           my ($val,$freq, $interval);
542 0           foreach $val(keys %{$bfd}) {
  0            
543 0           $freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val};
544 0           $interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"};
545 0 0         if($interval > 0) {
546 0           $self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval;
547             }
548             else {
549 0           $self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count;
550             }
551             }
552             }
553 0           return $self->{"stats"}{"Dist"}{"binned"}{"pmf"};
554             }
555              
556              
557             sub _binned_empirical_distribution_function {
558 0     0     my $self = shift;
559 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) {
560 0           my $bpmf = $self->_binned_probability_mass_function();
561 0           my $val;
562 0           my $cumul_prob = 0;
563 0           foreach $val(sort {$a<=>$b} keys %{$bpmf}) {
  0            
  0            
564 0           $cumul_prob += $bpmf->{$val};
565 0           $self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob;
566             }
567             }
568 0           return $self->{"stats"}{"Dist"}{"binned"}{"cdf"};
569             }
570              
571              
572             sub _binned_complementary_cumulative_distribution_function {
573 0     0     my $self = shift;
574 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) {
575 0           my $bpmf = $self->_binned_probability_mass_function();
576 0           my $val;
577 0           my $cumul_prob = 0;
578 0           my $old_cumul_prob = 0;
579 0           foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort
  0            
  0            
580 0           $old_cumul_prob = $cumul_prob;
581 0           $cumul_prob += $bpmf->{$val};
582 0           $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob;
583             }
584             }
585 0           return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"};
586             }
587              
588              
589             ################## Binned Related Functions ##################
590              
591              
592             sub set_binning_type {
593 0     0 1   my $self = shift;
594 0           my $bin_type = shift;
595             # check bin-type validity
596 0 0 0       if($bin_type != NO_BINNING && $bin_type != LIN_BINNING &&
      0        
597             $bin_type != LOG_BINNING) {
598 0           die "Wrong binning type provided\n";
599             }
600             # if bin type is already set
601 0 0         if($bin_type == $self->{"bin-type"}) {
602 0           return;
603             }
604             # if bin-type is different
605 0           $self->{"bin-type"} = $bin_type;
606 0           delete $self->{"num-bins"};
607 0           delete $self->{"bins"};
608 0           delete $self->{"stats"}{"Dist"}{"binned"};
609             }
610              
611              
612             sub set_optimal_binning {
613 0     0 1   my $self = shift;
614 0 0         if( $self->{"optimal-binning"} == 0) {
615 0           delete $self->{"bins"};
616 0           delete $self->{"num-bins"};
617 0           delete $self->{"stats"}{"Dist"}{"binned"};
618             }
619 0           $self->{"optimal-binning"} = 1;
620             }
621              
622              
623             sub set_custom_num_bins {
624 0     0 1   my $self = shift;
625 0           my $nb = shift;
626 0 0         if($self->{"bin-type"} == NO_BINNING) {
627 0           die "Set a binning type before setting the num of bins\n";
628             }
629 0 0         if( $nb < 2 ) {
630 0           die "Wrong number of bins provided: " . $nb . "\n";
631             }
632             # check if there is a change
633 0 0 0       if( $self->{"optimal-binning"} == 0 and
      0        
634             defined ($self->{"num-bins"}) and
635             $self->{"num-bins"} == $nb) {
636             # nothing changed -> nothing to do
637             }
638             else {
639             # update binning parameters
640 0           $self->{"optimal-binning"} = 0;
641 0           $self->{"num-bins"} = $nb;
642 0           delete $self->{"stats"}{"Dist"}{"binned"};
643 0           delete $self->{"bins"};
644             }
645             }
646              
647              
648             sub bins {
649 0     0 1   my $self = shift;
650 0 0         if($self->count() == 0) {
651 0           print "WARNING: empty dataset, returning empty bins. \n";
652 0           return {}; # empty array
653             }
654              
655 0 0         if(!defined($self->{"bins"})) {
656             # CASE 1: if no_binning is set we do not
657             # save the binning structure
658 0 0         if( $self->{"bin-type"} == NO_BINNING) {
659 0           my $curbins = {};
660 0           my $v;
661 0           foreach $v(keys %{$self->{"data_frequency"}}) {
  0            
662 0           my $ref = $v;
663 0           $curbins->{$v}{"left"} = $v;
664 0           $curbins->{$v}{"right"} = $v;
665             }
666 0           return $curbins;
667             }
668            
669             # CASE 2: if a custom number of bins is
670             # provided, we lin-/log- bin the support
671             # accordingly
672 0 0         if($self->{"optimal-binning"} == 0) {
673 0 0         if($self->{"bin-type"} == LIN_BINNING) {
674 0           $self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"});
675             }
676 0 0         if($self->{"bin-type"} == LOG_BINNING) {
677 0           $self->{"bins"} = $self->compute_log_bins($self->{"num-bins"});
678             }
679             }
680              
681             # CASE 3: optimal binning
682             else {
683 0           my $res = $self->_optimal_binning();
684 0 0         if($res != 0) {
685 0           print "Optimal binning was not possible on this sample, ";
686 0           print "using the NO_BINNING mode" . "\n";
687 0           my $curbins = {};
688 0           my $v;
689 0           foreach $v(keys %{$self->{"data_frequency"}}) {
  0            
690 0           my $ref = $v;
691 0           $curbins->{$v}{"left"} = $v;
692 0           $curbins->{$v}{"right"} = $v;
693             }
694 0           $self->{"bin-type"} = NO_BINNING;
695 0           $self->{"bins"} = $curbins;
696             }
697             }
698             }
699 0           return $self->{"bins"};
700             }
701              
702              
703             ################## Optimal Binning Related Functions ##################
704              
705             # Compute the optimal binning
706             sub _optimal_binning {
707 0     0     my $self = shift;
708             # LEGEND:
709             # b_* => bins
710             # f_* => binned frequency distribution
711             # d_* => binned density distribution
712             # s_* => sum of density distribution values
713             #
714             # start from extreme values ( min_num_bins and max_num_bins)
715             # and use the bisection method on the interval
716             # till the optimal is found (i.e. the binning such that
717             # the sum of the density function values is the closest
718             # to 1)
719 0           my $min_num_bins = 2;
720 0           my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins);
721 0           my $max_num_bins = scalar $self->_full_support();
722 0 0         if($max_num_bins == 1) {
723 0           return -1;
724             }
725 0           my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins);
726             # initial check
727 0 0 0       if( $s_min > 1 or $s_max < 1) {
728 0           return -1;
729             }
730 0           my $avg_num_bins;
731 0           my ($b_avg, $f_avg, $d_avg, $s_avg);
732 0           while( ($max_num_bins - $min_num_bins) > 1) {
733 0           $avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2));
734 0           ($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins);
735 0 0         if($s_avg < 1) {
736 0           $min_num_bins = $avg_num_bins;
737 0           ($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg);
738             }
739             else {
740 0           $max_num_bins = $avg_num_bins;
741 0           ($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg);
742             }
743             }
744             # the binning whose density sum is closer to 1 win
745 0 0         if( (1 - $s_min) < ($s_max-1) ) {
746 0           $self->{"bins"} = $b_min;
747 0           $self->{"Dist"}{"binned"}{"frequency"} = $f_min;
748 0           $self->{"Dist"}{"binned"}{"pmf"} = $d_min;
749             }
750             else {
751 0           $self->{"bins"} = $b_max;
752 0           $self->{"Dist"}{"binned"}{"frequency"} = $f_max;
753 0           $self->{"Dist"}{"binned"}{"pmf"} = $d_max;
754             }
755 0           return 0;
756             }
757              
758              
759             # bin the current data into $num_bins intervals
760             # and return: the bins, the binned frequency distribution
761             # the binned density distribution and the sum of the
762             # density values
763             sub _bin_attempt {
764 0     0     my $self = shift;
765 0           my $num_bins = shift;
766             # output values
767 0           my ($cur_bins, $cur_freq, $cur_density, $cur_sum);
768 0           $cur_sum = 0;
769             # get the bins
770 0 0         if( $self->{"bin-type"} == LIN_BINNING ) {
771 0           $cur_bins = $self->compute_lin_bins($num_bins);
772             }
773 0 0         if( $self->{"bin-type"} == LOG_BINNING ) {
774 0           $cur_bins =$self->compute_log_bins($num_bins);
775             }
776             # assert non binned frequency distribution is computed
777 0           $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
778 0           my $count = $self->count();
779             #
780 0           my $s = 0; # support iterator
781 0           my @sorted_support = (sort {$a<=>$b} $self->_full_support());
  0            
782 0           my $sup_size = scalar @sorted_support;
783 0           my $bi = 0; # bins iterator
784 0           my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins});
  0            
  0            
785 0           my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins)
786 0           my $ref = 0;
787 0           my $freq_sum = 0;
788 0           my $interval = 0;
789 0           for(; $bi < $n_bins; $bi++) { # for each bin
790 0           $ref = $sorted_bins[$bi];
791 0           $interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"};
792 0           for(; $s < $sup_size; $s++) { # for each support value
793             # if the current support value is less than the rightmost value in the bin
794 0 0         if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) {
795 0           $freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]};
796             }
797             # else we have to save the information for the current bin
798             else {
799 0           $cur_freq->{$ref} = $freq_sum;
800 0 0         if($interval > 0) {
801 0           $cur_density->{$ref} = ($freq_sum / $count) / $interval;
802             }
803             else {
804 0           $cur_density->{$ref} = $freq_sum / $count;
805             }
806 0           $cur_sum += $cur_density->{$ref};
807 0           $freq_sum = 0;
808 0           last; # we do not increment s
809             }
810             }
811             # if last element of the support has been reached
812             # we have to save the current freq sum in the current bin
813 0 0         if($s == $sup_size) {
814 0           $cur_freq->{$ref} = $freq_sum;
815 0 0         if($interval > 0) {
816 0           $cur_density->{$ref} = ($freq_sum / $count) / $interval;
817             }
818             else {
819 0           $cur_density->{$ref} = $freq_sum / $count;
820             }
821 0           $cur_sum += $cur_density->{$ref};
822             }
823             }
824             #DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n";
825 0           return ($cur_bins, $cur_freq, $cur_density, $cur_sum);
826             }
827              
828              
829              
830             sub _full_support {
831 0     0     my $self = shift;
832 0           my $support = {};
833 0           my $v;
834 0           foreach $v( keys %{$self->{"data_frequency"}}) {
  0            
835 0           $support->{$v} = 1;
836             }
837 0           return (keys %{$support});
  0            
838             }
839              
840              
841              
842              
843             ################## Utilities ##################
844              
845             # return a structure describing a linear binning
846             # of the current data ($num_bins provided)
847             sub compute_lin_bins {
848 0     0 1   my $self = shift;
849 0           my $num_bins = shift;
850 0           my $min = $self->minimum();
851 0           my $max = $self->maximum();
852 0           my $linbins = {}; # reference to empty hash
853             # last bin includes just the maximum value
854 0           my $delta = abs($max - $min)/($num_bins-1);
855 0           my $i;
856             my $ref;
857 0           for($i=0; $i < $num_bins; $i++) {
858 0           $ref = $min + $i * $delta + $delta/2;
859 0           $linbins->{$ref}{"left"} = $min + $i * $delta;
860 0           $linbins->{$ref}{"right"} = $min + $i * $delta + $delta;
861             }
862 0           return $linbins;
863             }
864              
865              
866             # return a structure describing a logarithmic binning
867             # of the current data ($num_bins provided)
868             # bins are not set
869             sub compute_log_bins {
870 0     0 1   my $self = shift;
871 0           my $num_bins = shift;
872 0           my $min = $self->minimum();
873 0           my $max = $self->maximum();
874 0 0 0       if($min == 0 or $max == 0 or $min *$max <0) {
      0        
875             # if the interval contains zero, then we cannot logbin
876             # we return the linear binning
877 0           print "Logarithmic binning was not possible on this sample, ";
878 0           print "switching to linear binning mode" . "\n";
879 0           $self->{"bin-type"} = LIN_BINNING;
880 0           return $self->compute_lin_bins($num_bins);
881             }
882 0           my $logbins = {}; # reference to empty hash
883 0           my $min_exp = log($min)/log(LOG_BASE);
884 0           my $max_exp = log($max+1)/log(LOG_BASE);
885 0           my $delta_exp = abs($max_exp - $min_exp)/$num_bins;
886 0           my $cur_exp;
887             my $i;
888 0           my $ref;
889 0           for($i=0; $i < $num_bins; $i++) {
890 0           $cur_exp = $min_exp + $i * $delta_exp;
891 0           $ref = LOG_BASE ** ($cur_exp + $delta_exp/2);
892 0           $logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp;
893 0           $logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp);
894             }
895 0           return $logbins;
896             }
897              
898              
899             1;