File Coverage

lib/Statistics/Discrete.pm
Criterion Covered Total %
statement 109 487 22.3
branch 19 142 13.3
condition 2 27 7.4
subroutine 17 42 40.4
pod 18 22 81.8
total 165 720 22.9


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             #
3             # Statistics::Discrete
4             #
5             # Chiara Orsini, CAIDA, UC San Diego
6             # chiara@caida.org
7             #
8             # Copyright (C) 2014 The Regents of the University of California.
9             #
10             # Statistics::Discrete is free software: you can redistribute it and/or modify
11             # it under the terms of the GNU General Public License as published by
12             # the Free Software Foundation, either version 3 of the License, or
13             # (at your option) any later version.
14             #
15             # Statistics::Discrete is distributed in the hope that it will be useful,
16             # but WITHOUT ANY WARRANTY; without even the implied warranty of
17             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18             # GNU General Public License for more details.
19             #
20             # You should have received a copy of the GNU General Public License
21             # along with Statistics::Discrete. If not, see .
22             #
23              
24             package Statistics::Discrete;
25              
26 1     1   25627 use 5.012;
  1         5  
27 1     1   7 use strict;
  1         1  
  1         24  
28 1     1   5 use warnings;
  1         6  
  1         33  
29 1     1   13 use List::Util;
  1         2  
  1         71  
30 1     1   1047 use POSIX;
  1         8864  
  1         7  
31 1     1   6142 use Storable 'dclone';
  1         5043  
  1         110  
32              
33             our $VERSION = '0.05.00';
34              
35             # require Exporter;
36             # our @EXPORT_OK = ( 'NO_BINNING', 'LIN_BINNING', 'LOG_BINNING');
37              
38             use constant {
39 1         105 NO_BINNING => 0,
40             LIN_BINNING => 1,
41             LOG_BINNING => 2,
42 1     1   8 };
  1         2  
43              
44 1     1   5 use constant LOG_BASE => 10;
  1         2  
  1         45  
45 1     1   5 use constant DEFAULT_BINS_NUMBER => 10;
  1         2  
  1         5027  
46              
47              
48             # Constructor
49             sub new {
50 1     1 0 469 my $class = shift;
51 1         2 my $self = {};
52 1         3 $self->{"data_frequency"} = {}; # anonymous hash constructor
53             # binning default values
54 1         3 $self->{"bin-type"} = NO_BINNING;
55 1         3 $self->{"optimal-binning"} = 1;
56             # $self->{"bins"} - bins' structure
57             # $self->{"num-bins"} - num bins when optimal binning is off
58             # on-demand stored stats
59             # $self->{"stats"}{"Desc"} - descriptive statistics
60             # $self->{"stats"}{"Dist"} - distributions
61             # $self->{"stats"}{"Dist"}{"binned"} - binned distributions
62             # bind self and class
63 1         2 bless $self, $class;
64 1         4 return $self;
65             }
66              
67             ################## Methods to load/add data ##################
68              
69             sub add_data {
70 1     1 0 401 my $self = shift;
71 1         3 my @data_to_add = @_;
72 1         2 my $data;
73 1         2 foreach $data(@data_to_add) {
74 10 100       28 if(defined($self->{"data_frequency"}{$data})) {
75 4         9 $self->{"data_frequency"}{$data}++;
76             }
77             else {
78 6         17 $self->{"data_frequency"}{$data} = 1 ;
79             }
80             }
81             # data has changed - stats and bins need to be computed again
82 1         2 delete $self->{"stats"};
83 1         3 delete $self->{"bins"};
84 1         3 return;
85             }
86              
87             sub add_data_from_file {
88 0     0 0 0 my $self = shift;
89 0         0 my $filename = shift;
90 0         0 my @cols;
91             my $f;
92 0         0 my $line;
93 0 0       0 open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
94 0         0 while($line = <$f>) {
95 0         0 chomp($line);
96 0 0       0 if($line =~ /^\s*#/) {
97 0         0 next;
98             }
99 0         0 @cols = split /\s/ , $line;
100 0 0       0 if(scalar @cols == 1) { # assume list of values
101 0 0       0 if(defined($self->{"data_frequency"}{$cols[0]})) {
102 0         0 $self->{"data_frequency"}{$cols[0]}++;
103             }
104             else {
105 0         0 $self->{"data_frequency"}{$cols[0]} = 1 ;
106             }
107             }
108 0 0       0 if(scalar @cols == 2) { # assume list of id - value pairs
109 0 0       0 if(defined($self->{"data_frequency"}{$cols[1]})) {
110 0         0 $self->{"data_frequency"}{$cols[1]}++;
111             }
112             else {
113 0         0 $self->{"data_frequency"}{$cols[1]} = 1 ;
114             }
115             }
116             # File format not recognized
117 0 0 0     0 if(scalar @cols != 1 and scalar @cols != 2){
118 0         0 print "File format not recognized\n";
119 0         0 close($f);
120 0         0 delete $self->{"stats"};
121 0         0 delete $self->{"bins"};
122 0         0 return;
123             }
124             }
125 0         0 close($f);
126             # data has changed - stats and bins need to be computed again
127 0         0 delete $self->{"stats"};
128 0         0 delete $self->{"bins"};
129 0         0 return;
130             }
131              
132             sub add_data_from_frequencyfile {
133 0     0 0 0 my $self = shift;
134 0         0 my $filename = shift;
135 0         0 my @cols;
136             my $f;
137 0         0 my $line;
138 0 0       0 open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
139 0         0 while($line = <$f>) {
140 0         0 chomp($line);
141 0 0       0 if($line =~ /^\s*#/) {
142 0         0 next;
143             }
144 0         0 @cols = split /\s+/ , $line;
145             #DEBUG print $line . "\t" . @cols . "\n";
146 0 0       0 if(scalar @cols == 2) { # assume list of value - count
147 0 0       0 if(defined($self->{"data_frequency"}{$cols[0]})) {
148 0         0 $self->{"data_frequency"}{$cols[0]} += $cols[1];
149             }
150             else {
151 0         0 $self->{"data_frequency"}{$cols[0]} = $cols[1];
152             }
153 0         0 next;
154             }
155             # File format not recognized
156 0 0       0 if(scalar @cols != 2){
157 0         0 print "File format not recognized\n";
158 0         0 close($f);
159 0         0 delete $self->{"stats"};
160 0         0 delete $self->{"bins"};
161 0         0 return;
162             }
163             }
164 0         0 close($f);
165             # data has changed - stats and bins need to be computed again
166 0         0 delete $self->{"stats"};
167 0         0 delete $self->{"bins"};
168 0         0 return;
169             }
170              
171              
172             ################## Descriptive Statistics ##################
173              
174             # Minimum value
175             # compute the minimum value, save the statistic, and return it
176             sub minimum {
177 1     1 1 2 my $self = shift;
178 1 50       5 if(!defined($self->{"stats"}{"Desc"}{"min"})) {
179 1         25 my $count = $self->count();
180 1 50       9 if($count > 0) {
181 1         2 $self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys %{$self->{"data_frequency"}});
  1         15  
182             }
183             else {
184 0         0 $self->{"stats"}{"Desc"}{"min"} = 0;
185             }
186             }
187 1         5 return $self->{"stats"}{"Desc"}{"min"};
188             }
189              
190              
191             # Maximum value
192             # compute the minimum value, save the statistic, and return it
193             sub maximum {
194 1     1 1 2 my $self = shift;
195 1 50       5 if(!defined($self->{"stats"}{"Desc"}{"max"})) {
196 1         3 my $count = $self->count();
197 1 50       3 if($count > 0) {
198 1         2 $self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys %{$self->{"data_frequency"}});
  1         8  
199             }
200             else {
201 0         0 $self->{"stats"}{"Desc"}{"max"} = 0;
202             }
203             }
204 1         5 return $self->{"stats"}{"Desc"}{"max"};
205             }
206              
207              
208             # count the number of samples provided
209             # save the statistic, and return it
210             sub count {
211 6     6 1 15 my $self = shift;
212 6 100       19 if(!defined($self->{"stats"}{"Desc"}{"count"})) {
213 1         2 my $count = 0;
214 1         1 my $data;
215 1         2 foreach $data(keys %{$self->{"data_frequency"}}) {
  1         5  
216 6         11 $count += $self->{"data_frequency"}{$data};
217             }
218 1         4 $self->{"stats"}{"Desc"}{"count"} = $count;
219             }
220 6         15 return $self->{"stats"}{"Desc"}{"count"};
221             }
222              
223              
224             # http://en.wikipedia.org/wiki/Expected_value
225             # the weighted average of the possible values,
226             # using their probabilities as their weights
227             sub mean {
228 2     2 1 5 my $self = shift;
229 2 100       7 if(!defined($self->{"stats"}{"Desc"}{"mean"})) {
230 1         2 my $cumul_value = 0;
231 1         4 my $count = $self->count();
232 1         2 my $v;
233 1         2 foreach $v( keys %{$self->{"data_frequency"}}) {
  1         4  
234 6         14 $cumul_value += $v * $self->{"data_frequency"}{$v};
235             }
236 1 50       4 if($count > 0) {
237 1         4 $self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count;
238             }
239             else {
240 0         0 $self->{"stats"}{"Desc"}{"mean"} = 0;
241             }
242             }
243 2         8 return $self->{"stats"}{"Desc"}{"mean"};
244             }
245              
246              
247              
248             # http://en.wikipedia.org/wiki/Median
249             # the median is the numerical value separating
250             # the higher half of a data sample, a population,
251             # or a probability distribution, from the lower half.
252             sub median {
253 1     1 1 2 my $self = shift;
254 1 50       6 if(!defined($self->{"stats"}{"Desc"}{"median"})) {
255 1         3 my $count = $self->count();
256 1         3 my $odd = 0;
257 1 50       4 if($count % 2 != 0) {
258 0         0 $odd = 1;
259             }
260 1         15 my $median_index = floor($count/2) + $odd;
261 1         2 my $current_index = 0;
262              
263 1         2 my ($v,$i);
264 1         2 foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) {
  10         17  
  1         7  
265 6         18 for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) {
266 10         11 $current_index++;
267 10 100       22 if($current_index == $median_index) {
268 1         3 $self->{"stats"}{"Desc"}{"median"} = $v;
269             }
270 10 100 66     45 if($current_index == ($median_index+1) and
271             $odd == 0) {
272 1         5 $self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2;
273             }
274              
275             }
276             }
277             }
278 1         5 return $self->{"stats"}{"Desc"}{"median"};
279             }
280              
281              
282             # http://en.wikipedia.org/wiki/Percentile
283             # A percentile (or a centile) is a measure
284             # used in statistics indicating the value
285             # below which a given percentage of observations
286             # in a group of observations fall.
287             sub percentile {
288 0     0 1 0 my $self = shift;
289 0         0 my $perc = shift;
290 0         0 my $max_probability = $perc/100;
291 0         0 my $cdf = $self->empirical_distribution_function();
292 0         0 my $v = 0;
293 0         0 my $previous_v = 0;
294 0         0 my $epsilon = 0;
295 0         0 foreach $v(sort {$a<=>$b} keys %{$cdf}) {
  0         0  
  0         0  
296 0         0 $epsilon = $v - $previous_v;
297 0 0       0 if($cdf->{$v} > $max_probability) {
298 0         0 return $v;
299             }
300 0         0 $previous_v = $v;
301             }
302             # if the program is here then $perc == 1
303             # then we return $v+epsilon
304 0         0 return $v+$epsilon;
305             }
306              
307              
308              
309             # http://en.wikipedia.org/wiki/Variance
310             # The variance of a random variable X is
311             # its second central moment, the expected
312             # value of the squared deviation from the mean
313             sub variance {
314 1     1 1 2 my $self = shift;
315 1 50       6 if(!defined($self->{"stats"}{"Desc"}{"variance"})) {
316 1         3 my $mean = $self->mean();
317 1         3 my $count = $self->count();
318 1         2 my $cumul_value = 0;
319 1         2 my $square_mean = 0;
320 1         7 my $v;
321 1         3 foreach $v(keys %{$self->{"data_frequency"}}) {
  1         12  
322 6         16 $cumul_value += ($v**2) * $self->{"data_frequency"}{$v};
323             }
324 1 50       4 if($count > 0) {
325 1         2 $square_mean = $cumul_value / $count;
326             }
327 1         4 $self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2);
328             }
329 1         17 return $self->{"stats"}{"Desc"}{"variance"};
330             }
331              
332              
333             # http://en.wikipedia.org/wiki/Standard_deviation
334             # The standard deviation of a random variable
335             # is the square root of its variance.
336             sub standard_deviation {
337 0     0 1   my $self = shift;
338 0 0         if(!defined($self->{"stats"}{"Desc"}{"sdev"})) {
339 0           my $variance = $self->variance();
340 0           $self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance);
341             }
342 0           return $self->{"stats"}{"Desc"}{"sdev"};
343             }
344              
345              
346             ################## Distributions Statistics ##################
347              
348             # http://en.wikipedia.org/wiki/Frequency_distribution
349             # the frequency distribution is an arrangement of the values
350             # that one or more variables take in a sample. Each entry in
351             # the table contains the frequency or count of the occurrences
352             # of values within a particular group or interval
353             sub frequency_distribution {
354 0     0 1   my $self = shift;
355 0 0         if($self->count() == 0) {
356 0           print "WARNING: empty dataset, returning empty frequency distribution. \n";
357 0           return (); # empty hash
358             }
359 0 0         if($self->{"bin-type"} == NO_BINNING) {
360             # $self->{"stats"}{"Dist"}{"frequency"}
361 0           return $self->_frequency_distribution();
362             }
363             else {
364             # $self->{"stats"}{"Dist"}{"binned"}{"frequency"}
365 0           return $self->_binned_frequency_distribution();
366             }
367             }
368              
369              
370             # http://en.wikipedia.org/wiki/Probability_mass_function
371             # the probability mass function (pmf) is a function that gives
372             # the probability that a discrete random variable is exactly
373             # equal to some value.
374             sub probability_mass_function {
375 0     0 1   my $self = shift;
376 0 0         if($self->count() == 0) {
377 0           print "WARNING: empty dataset, returning empty probability mass function. \n";
378 0           return (); # empty hash
379             }
380 0 0         if($self->{"bin-type"} == NO_BINNING) {
381             # $self->{"stats"}{"Dist"}{"pmf"}
382 0           return $self->_probability_mass_function();
383             }
384             else {
385             # $self->{"stats"}{"Dist"}{"binned"}{"pmf"}
386 0           return $self->_binned_probability_mass_function();
387             }
388             }
389              
390              
391             # http://en.wikipedia.org/wiki/Empirical_distribution_function
392             # the empirical distribution function, or empirical cdf, is the
393             # cumulative distribution function associated with the empirical
394             # measure of the sample. This cdf is a step function that jumps up
395             # by 1/n at each of the n data points.
396             sub empirical_distribution_function {
397 0     0 1   my $self = shift;
398 0 0         if($self->count() == 0) {
399 0           print "WARNING: empty dataset, returning empty emptirical distribution function. \n";
400 0           return (); # empty hash
401             }
402 0 0         if($self->{"bin-type"} == NO_BINNING) {
403             # $self->{"stats"}{"Dist"}{"cdf"}
404 0           return $self->_empirical_distribution_function();
405             }
406             else {
407             # $self->{"stats"}{"Dist"}{"binned"}{"cdf"}
408 0           return $self->_binned_empirical_distribution_function();
409             }
410             }
411              
412              
413             # http://en.wikipedia.org/wiki/CCDF
414             # how often the random variable is above a particular level.
415             sub complementary_cumulative_distribution_function {
416 0     0 1   my $self = shift;
417 0 0         if($self->count() == 0) {
418 0           print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n";
419 0           return (); # empty hash
420             }
421 0 0         if($self->{"bin-type"} == NO_BINNING) {
422             # $self->{"stats"}{"Dist"}{"ccdf"}
423 0           return $self->_complementary_cumulative_distribution_function();
424             }
425             else {
426             # $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}
427 0           return $self->_binned_complementary_cumulative_distribution_function();
428             }
429             }
430              
431              
432              
433             ################## Regular Distributions Statistics ##################
434              
435              
436             sub _frequency_distribution {
437 0     0     my $self = shift;
438 0 0         if(!defined($self->{"stats"}{"Dist"}{"frequency"})) {
439 0           $self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"};
440             }
441 0           return $self->{"stats"}{"Dist"}{"frequency"};
442             }
443              
444              
445             sub _probability_mass_function {
446 0     0     my $self = shift;
447 0 0         if(!defined($self->{"stats"}{"Dist"}{"pmf"})) {
448 0           my $count = $self->count();
449 0           $self->_frequency_distribution();
450 0           my ($val,$freq);
451 0           foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
452 0           $freq = $self->{"stats"}{"Dist"}{"frequency"}{$val};
453 0           $self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count;
454             }
455             }
456 0           return $self->{"stats"}{"Dist"}{"pmf"};
457             }
458              
459              
460             sub _empirical_distribution_function {
461 0     0     my $self = shift;
462 0 0         if(!defined($self->{"stats"}{"Dist"}{"cdf"})) {
463 0           my $count = $self->count();
464 0           $self->_frequency_distribution();
465 0           my $val;
466 0           my $cumul_freq = 0;
467 0           foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
  0            
468 0           $cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val};
469 0           $self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count;
470             }
471             }
472 0           return $self->{"stats"}{"Dist"}{"cdf"};
473             }
474              
475              
476             sub _complementary_cumulative_distribution_function {
477 0     0     my $self = shift;
478             # warning, this creates self->{"stats"} if it doesn't exist
479 0 0         if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) {
480 0           my $count = $self->count();
481 0           $self->frequency_distribution();
482 0           my $val;
483 0           my $cumul_freq = $count;
484 0           foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
  0            
  0            
485 0           $cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val};
486 0           $self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count;
487             }
488             }
489 0           return $self->{"stats"}{"Dist"}{"ccdf"};
490             }
491              
492              
493              
494             ################## Binned Distributions Statistics ##################
495              
496              
497             sub _binned_frequency_distribution {
498 0     0     my $self = shift;
499 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) {
500             # 1. get bins
501 0           $self->bins(); # set $self->{"bins"}
502             # 2. get regular frequency distribution
503 0           $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
504             # 3. compute binned property
505 0           my $binned_frequency = {};
506 0           my $i = 0; # vals iterator
507 0           my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}});
  0            
  0            
508 0           my $num_vals = scalar @sorted_vals;
509 0           my $bi = 0; # bins iterator
510 0           my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}});
  0            
  0            
511 0           my $num_bins = scalar @sorted_bins;
512 0           my $freq_sum = 0;
513 0           for(; $bi < $num_bins; $bi++) { # for each bin
514 0           for(; $i < $num_vals; $i++) {
515 0 0         if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) {
516 0           $freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]}
517             }
518             else {
519 0           $binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
520 0           $freq_sum = 0;
521 0           last;
522             }
523             }
524              
525             # if last element of vals then we save the current freq sum
526             # in the current bin
527 0 0         if($i == $num_vals) {
528 0           $binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
529             }
530             }
531 0           $self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency;
532             }
533 0           return $self->{"stats"}{"Dist"}{"binned"}{"frequency"};
534             }
535              
536              
537             sub _binned_probability_mass_function {
538 0     0     my $self = shift;
539 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) {
540 0           my $count = $self->count();
541 0           my $bins = $self->bins(); # set $self->{"bins"}
542 0           my $bfd = $self->_binned_frequency_distribution();
543 0           my ($val,$freq, $interval);
544 0           foreach $val(keys %{$bfd}) {
  0            
545 0           $freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val};
546 0           $interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"};
547 0 0         if($interval > 0) {
548 0           $self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval;
549             }
550             else {
551 0           $self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count;
552             }
553             }
554             }
555 0           return $self->{"stats"}{"Dist"}{"binned"}{"pmf"};
556             }
557              
558              
559             sub _binned_empirical_distribution_function {
560 0     0     my $self = shift;
561 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) {
562 0           my $bpmf = $self->_binned_probability_mass_function();
563 0           my $val;
564 0           my $cumul_prob = 0;
565 0           foreach $val(sort {$a<=>$b} keys %{$bpmf}) {
  0            
  0            
566 0           $cumul_prob += $bpmf->{$val};
567 0           $self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob;
568             }
569             }
570 0           return $self->{"stats"}{"Dist"}{"binned"}{"cdf"};
571             }
572              
573              
574             sub _binned_complementary_cumulative_distribution_function {
575 0     0     my $self = shift;
576 0 0         if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) {
577 0           my $bpmf = $self->_binned_probability_mass_function();
578 0           my $val;
579 0           my $cumul_prob = 0;
580 0           my $old_cumul_prob = 0;
581 0           foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort
  0            
  0            
582 0           $old_cumul_prob = $cumul_prob;
583 0           $cumul_prob += $bpmf->{$val};
584 0           $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob;
585             }
586             }
587 0           return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"};
588             }
589              
590              
591             ################## Binned Related Functions ##################
592              
593              
594             sub set_binning_type {
595 0     0 1   my $self = shift;
596 0           my $bin_type = shift;
597             # check bin-type validity
598 0 0 0       if($bin_type != NO_BINNING && $bin_type != LIN_BINNING &&
      0        
599             $bin_type != LOG_BINNING) {
600 0           die "Wrong binning type provided\n";
601             }
602             # if bin type is already set
603 0 0         if($bin_type == $self->{"bin-type"}) {
604 0           return;
605             }
606             # if bin-type is different
607 0           $self->{"bin-type"} = $bin_type;
608 0           delete $self->{"num-bins"};
609 0           delete $self->{"bins"};
610 0           delete $self->{"stats"}{"Dist"}{"binned"};
611             }
612              
613              
614             sub set_optimal_binning {
615 0     0 1   my $self = shift;
616 0 0         if( $self->{"optimal-binning"} == 0) {
617 0           delete $self->{"bins"};
618 0           delete $self->{"num-bins"};
619 0           delete $self->{"stats"}{"Dist"}{"binned"};
620             }
621 0           $self->{"optimal-binning"} = 1;
622             }
623              
624              
625             sub set_custom_num_bins {
626 0     0 1   my $self = shift;
627 0           my $nb = shift;
628 0 0         if($self->{"bin-type"} == NO_BINNING) {
629 0           die "Set a binning type before setting the num of bins\n";
630             }
631 0 0         if( $nb < 2 ) {
632 0           die "Wrong number of bins provided: " . $nb . "\n";
633             }
634             # check if there is a change
635 0 0 0       if( $self->{"optimal-binning"} == 0 and
      0        
636             defined ($self->{"num-bins"}) and
637             $self->{"num-bins"} == $nb) {
638             # nothing changed -> nothing to do
639             }
640             else {
641             # update binning parameters
642 0           $self->{"optimal-binning"} = 0;
643 0           $self->{"num-bins"} = $nb;
644 0           delete $self->{"stats"}{"Dist"}{"binned"};
645 0           delete $self->{"bins"};
646             }
647             }
648              
649              
650             sub bins {
651 0     0 1   my $self = shift;
652 0 0         if($self->count() == 0) {
653 0           print "WARNING: empty dataset, returning empty bins. \n";
654 0           return {}; # empty array
655             }
656              
657 0 0         if(!defined($self->{"bins"})) {
658             # CASE 1: if no_binning is set we do not
659             # save the binning structure
660 0 0         if( $self->{"bin-type"} == NO_BINNING) {
661 0           my $curbins = {};
662 0           my $v;
663 0           foreach $v(keys %{$self->{"data_frequency"}}) {
  0            
664 0           my $ref = $v;
665 0           $curbins->{$v}{"left"} = $v;
666 0           $curbins->{$v}{"right"} = $v;
667             }
668 0           return $curbins;
669             }
670            
671             # CASE 2: if a custom number of bins is
672             # provided, we lin-/log- bin the support
673             # accordingly
674 0 0         if($self->{"optimal-binning"} == 0) {
675 0 0         if($self->{"bin-type"} == LIN_BINNING) {
676 0           $self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"});
677             }
678 0 0         if($self->{"bin-type"} == LOG_BINNING) {
679 0           $self->{"bins"} = $self->compute_log_bins($self->{"num-bins"});
680             }
681             }
682              
683             # CASE 3: optimal binning
684             else {
685 0           my $res = $self->_optimal_binning();
686 0 0         if($res != 0) {
687 0           print "Optimal binning was not possible on this sample, ";
688 0           print "using the NO_BINNING mode" . "\n";
689 0           my $curbins = {};
690 0           my $v;
691 0           foreach $v(keys %{$self->{"data_frequency"}}) {
  0            
692 0           my $ref = $v;
693 0           $curbins->{$v}{"left"} = $v;
694 0           $curbins->{$v}{"right"} = $v;
695             }
696 0           $self->{"bin-type"} = NO_BINNING;
697 0           $self->{"bins"} = $curbins;
698             }
699             }
700             }
701 0           return $self->{"bins"};
702             }
703              
704              
705             ################## Optimal Binning Related Functions ##################
706              
707             # Compute the optimal binning
708             sub _optimal_binning {
709 0     0     my $self = shift;
710             # LEGEND:
711             # b_* => bins
712             # f_* => binned frequency distribution
713             # d_* => binned density distribution
714             # s_* => sum of density distribution values
715             #
716             # start from extreme values ( min_num_bins and max_num_bins)
717             # and use the bisection method on the interval
718             # till the optimal is found (i.e. the binning such that
719             # the sum of the density function values is the closest
720             # to 1)
721 0           my $min_num_bins = 2;
722 0           my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins);
723 0           my $max_num_bins = scalar $self->_full_support();
724 0 0         if($max_num_bins == 1) {
725 0           return -1;
726             }
727 0           my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins);
728             # initial check
729 0 0 0       if( $s_min > 1 or $s_max < 1) {
730 0           return -1;
731             }
732 0           my $avg_num_bins;
733 0           my ($b_avg, $f_avg, $d_avg, $s_avg);
734 0           while( ($max_num_bins - $min_num_bins) > 1) {
735 0           $avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2));
736 0           ($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins);
737 0 0         if($s_avg < 1) {
738 0           $min_num_bins = $avg_num_bins;
739 0           ($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg);
740             }
741             else {
742 0           $max_num_bins = $avg_num_bins;
743 0           ($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg);
744             }
745             }
746             # the binning whose density sum is closer to 1 win
747 0 0         if( (1 - $s_min) < ($s_max-1) ) {
748 0           $self->{"bins"} = $b_min;
749 0           $self->{"Dist"}{"binned"}{"frequency"} = $f_min;
750 0           $self->{"Dist"}{"binned"}{"pmf"} = $d_min;
751             }
752             else {
753 0           $self->{"bins"} = $b_max;
754 0           $self->{"Dist"}{"binned"}{"frequency"} = $f_max;
755 0           $self->{"Dist"}{"binned"}{"pmf"} = $d_max;
756             }
757 0           return 0;
758             }
759              
760              
761             # bin the current data into $num_bins intervals
762             # and return: the bins, the binned frequency distribution
763             # the binned density distribution and the sum of the
764             # density values
765             sub _bin_attempt {
766 0     0     my $self = shift;
767 0           my $num_bins = shift;
768             # output values
769 0           my ($cur_bins, $cur_freq, $cur_density, $cur_sum);
770 0           $cur_sum = 0;
771             # get the bins
772 0 0         if( $self->{"bin-type"} == LIN_BINNING ) {
773 0           $cur_bins = $self->compute_lin_bins($num_bins);
774             }
775 0 0         if( $self->{"bin-type"} == LOG_BINNING ) {
776 0           $cur_bins =$self->compute_log_bins($num_bins);
777             }
778             # assert non binned frequency distribution is computed
779 0           $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
780 0           my $count = $self->count();
781             #
782 0           my $s = 0; # support iterator
783 0           my @sorted_support = (sort {$a<=>$b} $self->_full_support());
  0            
784 0           my $sup_size = scalar @sorted_support;
785 0           my $bi = 0; # bins iterator
786 0           my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins});
  0            
  0            
787 0           my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins)
788 0           my $ref = 0;
789 0           my $freq_sum = 0;
790 0           my $interval = 0;
791 0           for(; $bi < $n_bins; $bi++) { # for each bin
792 0           $ref = $sorted_bins[$bi];
793 0           $interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"};
794 0           for(; $s < $sup_size; $s++) { # for each support value
795             # if the current support value is less than the rightmost value in the bin
796 0 0         if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) {
797 0           $freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]};
798             }
799             # else we have to save the information for the current bin
800             else {
801 0           $cur_freq->{$ref} = $freq_sum;
802 0 0         if($interval > 0) {
803 0           $cur_density->{$ref} = ($freq_sum / $count) / $interval;
804             }
805             else {
806 0           $cur_density->{$ref} = $freq_sum / $count;
807             }
808 0           $cur_sum += $cur_density->{$ref};
809 0           $freq_sum = 0;
810 0           last; # we do not increment s
811             }
812             }
813             # if last element of the support has been reached
814             # we have to save the current freq sum in the current bin
815 0 0         if($s == $sup_size) {
816 0           $cur_freq->{$ref} = $freq_sum;
817 0 0         if($interval > 0) {
818 0           $cur_density->{$ref} = ($freq_sum / $count) / $interval;
819             }
820             else {
821 0           $cur_density->{$ref} = $freq_sum / $count;
822             }
823 0           $cur_sum += $cur_density->{$ref};
824             }
825             }
826             #DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n";
827 0           return ($cur_bins, $cur_freq, $cur_density, $cur_sum);
828             }
829              
830              
831              
832             sub _full_support {
833 0     0     my $self = shift;
834 0           my $support = {};
835 0           my $v;
836 0           foreach $v( keys %{$self->{"data_frequency"}}) {
  0            
837 0           $support->{$v} = 1;
838             }
839 0           return (keys %{$support});
  0            
840             }
841              
842              
843              
844              
845             ################## Utilities ##################
846              
847             # return a structure describing a linear binning
848             # of the current data ($num_bins provided)
849             sub compute_lin_bins {
850 0     0 1   my $self = shift;
851 0           my $num_bins = shift;
852 0           my $min = $self->minimum();
853 0           my $max = $self->maximum();
854 0           my $linbins = {}; # reference to empty hash
855             # last bin includes just the maximum value
856 0           my $delta = abs($max - $min)/($num_bins-1);
857 0           my $i;
858             my $ref;
859 0           for($i=0; $i < $num_bins; $i++) {
860 0           $ref = $min + $i * $delta + $delta/2;
861 0           $linbins->{$ref}{"left"} = $min + $i * $delta;
862 0           $linbins->{$ref}{"right"} = $min + $i * $delta + $delta;
863             }
864 0           return $linbins;
865             }
866              
867              
868             # return a structure describing a logarithmic binning
869             # of the current data ($num_bins provided)
870             # bins are not set
871             sub compute_log_bins {
872 0     0 1   my $self = shift;
873 0           my $num_bins = shift;
874 0           my $min = $self->minimum();
875 0           my $max = $self->maximum();
876 0 0 0       if($min == 0 or $max == 0 or $min *$max <0) {
      0        
877             # if the interval contains zero, then we cannot logbin
878             # we return the linear binning
879 0           print "Logarithmic binning was not possible on this sample, ";
880 0           print "switching to linear binning mode" . "\n";
881 0           $self->{"bin-type"} = LIN_BINNING;
882 0           return $self->compute_lin_bins($num_bins);
883             }
884 0           my $logbins = {}; # reference to empty hash
885 0           my $min_exp = log($min)/log(LOG_BASE);
886 0           my $max_exp = log($max+1)/log(LOG_BASE);
887 0           my $delta_exp = abs($max_exp - $min_exp)/$num_bins;
888 0           my $cur_exp;
889             my $i;
890 0           my $ref;
891 0           for($i=0; $i < $num_bins; $i++) {
892 0           $cur_exp = $min_exp + $i * $delta_exp;
893 0           $ref = LOG_BASE ** ($cur_exp + $delta_exp/2);
894 0           $logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp;
895 0           $logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp);
896             }
897 0           return $logbins;
898             }
899              
900              
901             1;