line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Statistics::Discrete |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Chiara Orsini, CAIDA, UC San Diego |
6
|
|
|
|
|
|
|
# chiara@caida.org |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright (C) 2014 The Regents of the University of California. |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Statistics::Discrete is free software: you can redistribute it and/or modify |
11
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
12
|
|
|
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
13
|
|
|
|
|
|
|
# (at your option) any later version. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# Statistics::Discrete is distributed in the hope that it will be useful, |
16
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
17
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18
|
|
|
|
|
|
|
# GNU General Public License for more details. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
21
|
|
|
|
|
|
|
# along with Statistics::Discrete. If not, see . |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package Statistics::Discrete; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
33479
|
use 5.014002; |
|
1
|
|
|
|
|
4
|
|
28
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
29
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
34
|
|
30
|
1
|
|
|
1
|
|
5
|
use List::Util; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
70
|
|
31
|
1
|
|
|
1
|
|
754
|
use POSIX; |
|
1
|
|
|
|
|
7200
|
|
|
1
|
|
|
|
|
4
|
|
32
|
1
|
|
|
1
|
|
4496
|
use Storable 'dclone'; |
|
1
|
|
|
|
|
6036
|
|
|
1
|
|
|
|
|
100
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
our $VERSION = '0.05'; |
37
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
|
8
|
use constant NO_BINNING => 0; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
83
|
|
39
|
1
|
|
|
1
|
|
5
|
use constant LIN_BINNING => 1; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
40
|
1
|
|
|
1
|
|
4
|
use constant LOG_BINNING => 2; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
42
|
|
41
|
|
|
|
|
|
|
|
42
|
1
|
|
|
1
|
|
5
|
use constant LOG_BASE => 10; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
43
|
1
|
|
|
1
|
|
4
|
use constant DEFAULT_BINS_NUMBER => 10; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
5092
|
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Constructor |
47
|
|
|
|
|
|
|
sub new { |
48
|
1
|
|
|
1
|
0
|
420
|
my $class = shift; |
49
|
1
|
|
|
|
|
2
|
my $self = {}; |
50
|
1
|
|
|
|
|
4
|
$self->{"data_frequency"} = {}; # anonymous hash constructor |
51
|
|
|
|
|
|
|
# binning default values |
52
|
1
|
|
|
|
|
3
|
$self->{"bin-type"} = NO_BINNING; |
53
|
1
|
|
|
|
|
2
|
$self->{"optimal-binning"} = 1; |
54
|
|
|
|
|
|
|
# $self->{"bins"} - bins' structure |
55
|
|
|
|
|
|
|
# $self->{"num-bins"} - num bins when optimal binning is off |
56
|
|
|
|
|
|
|
# on-demand stored stats |
57
|
|
|
|
|
|
|
# $self->{"stats"}{"Desc"} - descriptive statistics |
58
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"} - distributions |
59
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"} - binned distributions |
60
|
|
|
|
|
|
|
# bind self and class |
61
|
1
|
|
|
|
|
2
|
bless $self, $class; |
62
|
1
|
|
|
|
|
3
|
return $self; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
################## Methods to load/add data ################## |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
sub add_data { |
68
|
1
|
|
|
1
|
0
|
685
|
my $self = shift; |
69
|
1
|
|
|
|
|
4
|
my @data_to_add = @_; |
70
|
1
|
|
|
|
|
1
|
my $data; |
71
|
1
|
|
|
|
|
3
|
foreach $data(@data_to_add) { |
72
|
10
|
100
|
|
|
|
28
|
if(defined($self->{"data_frequency"}{$data})) { |
73
|
4
|
|
|
|
|
6
|
$self->{"data_frequency"}{$data}++; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
else { |
76
|
6
|
|
|
|
|
20
|
$self->{"data_frequency"}{$data} = 1 ; |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
80
|
1
|
|
|
|
|
3
|
delete $self->{"stats"}; |
81
|
1
|
|
|
|
|
2
|
delete $self->{"bins"}; |
82
|
1
|
|
|
|
|
3
|
return; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub add_data_from_file { |
86
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
87
|
0
|
|
|
|
|
0
|
my $filename = shift; |
88
|
0
|
|
|
|
|
0
|
my @cols; |
89
|
|
|
|
|
|
|
my $f; |
90
|
0
|
|
|
|
|
0
|
my $line; |
91
|
0
|
0
|
|
|
|
0
|
open($f, "<", $filename) or die "Cannot open " .$filename . ": $!"; |
92
|
0
|
|
|
|
|
0
|
while($line = <$f>) { |
93
|
0
|
|
|
|
|
0
|
chomp($line); |
94
|
0
|
0
|
|
|
|
0
|
if($line =~ /^\s*#/) { |
95
|
0
|
|
|
|
|
0
|
next; |
96
|
|
|
|
|
|
|
} |
97
|
0
|
|
|
|
|
0
|
@cols = split /\s/ , $line; |
98
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 1) { # assume list of values |
99
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[0]})) { |
100
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]}++; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
else { |
103
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} = 1 ; |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} |
106
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 2) { # assume list of id - value pairs |
107
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[1]})) { |
108
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[1]}++; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
else { |
111
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[1]} = 1 ; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
# File format not recognized |
115
|
0
|
0
|
0
|
|
|
0
|
if(scalar @cols != 1 and scalar @cols != 2){ |
116
|
0
|
|
|
|
|
0
|
print "File format not recognized\n"; |
117
|
0
|
|
|
|
|
0
|
close($f); |
118
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
119
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
120
|
0
|
|
|
|
|
0
|
return; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
} |
123
|
0
|
|
|
|
|
0
|
close($f); |
124
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
125
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
126
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
127
|
0
|
|
|
|
|
0
|
return; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub add_data_from_frequencyfile { |
131
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
132
|
0
|
|
|
|
|
0
|
my $filename = shift; |
133
|
0
|
|
|
|
|
0
|
my @cols; |
134
|
|
|
|
|
|
|
my $f; |
135
|
0
|
|
|
|
|
0
|
my $line; |
136
|
0
|
0
|
|
|
|
0
|
open($f, "<", $filename) or die "Cannot open " .$filename . ": $!"; |
137
|
0
|
|
|
|
|
0
|
while($line = <$f>) { |
138
|
0
|
|
|
|
|
0
|
chomp($line); |
139
|
0
|
0
|
|
|
|
0
|
if($line =~ /^\s*#/) { |
140
|
0
|
|
|
|
|
0
|
next; |
141
|
|
|
|
|
|
|
} |
142
|
0
|
|
|
|
|
0
|
@cols = split /\s+/ , $line; |
143
|
|
|
|
|
|
|
#DEBUG print $line . "\t" . @cols . "\n"; |
144
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 2) { # assume list of value - count |
145
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[0]})) { |
146
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} += $cols[1]; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
else { |
149
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} = $cols[1]; |
150
|
|
|
|
|
|
|
} |
151
|
0
|
|
|
|
|
0
|
next; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
# File format not recognized |
154
|
0
|
0
|
|
|
|
0
|
if(scalar @cols != 2){ |
155
|
0
|
|
|
|
|
0
|
print "File format not recognized\n"; |
156
|
0
|
|
|
|
|
0
|
close($f); |
157
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
158
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
159
|
0
|
|
|
|
|
0
|
return; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
} |
162
|
0
|
|
|
|
|
0
|
close($f); |
163
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
164
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
165
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
166
|
0
|
|
|
|
|
0
|
return; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
################## Descriptive Statistics ################## |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# Minimum value |
173
|
|
|
|
|
|
|
# compute the minimum value, save the statistic, and return it |
174
|
|
|
|
|
|
|
sub minimum { |
175
|
1
|
|
|
1
|
1
|
9
|
my $self = shift; |
176
|
1
|
50
|
|
|
|
6
|
if(!defined($self->{"stats"}{"Desc"}{"min"})) { |
177
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
178
|
1
|
50
|
|
|
|
5
|
if($count > 0) { |
179
|
1
|
|
|
|
|
15
|
$self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys $self->{"data_frequency"}); |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
else { |
182
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"min"} = 0; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
} |
185
|
1
|
|
|
|
|
6
|
return $self->{"stats"}{"Desc"}{"min"}; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Maximum value |
190
|
|
|
|
|
|
|
# compute the minimum value, save the statistic, and return it |
191
|
|
|
|
|
|
|
sub maximum { |
192
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
193
|
1
|
50
|
|
|
|
6
|
if(!defined($self->{"stats"}{"Desc"}{"max"})) { |
194
|
1
|
|
|
|
|
2
|
my $count = $self->count(); |
195
|
1
|
50
|
|
|
|
4
|
if($count > 0) { |
196
|
1
|
|
|
|
|
10
|
$self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys $self->{"data_frequency"}); |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
else { |
199
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"max"} = 0; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
} |
202
|
1
|
|
|
|
|
5
|
return $self->{"stats"}{"Desc"}{"max"}; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# count the number of samples provided |
207
|
|
|
|
|
|
|
# save the statistic, and return it |
208
|
|
|
|
|
|
|
sub count { |
209
|
6
|
|
|
6
|
1
|
12
|
my $self = shift; |
210
|
6
|
100
|
|
|
|
18
|
if(!defined($self->{"stats"}{"Desc"}{"count"})) { |
211
|
1
|
|
|
|
|
3
|
my $count = 0; |
212
|
1
|
|
|
|
|
1
|
my $data; |
213
|
1
|
|
|
|
|
2
|
foreach $data(keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
7
|
|
214
|
6
|
|
|
|
|
10
|
$count += $self->{"data_frequency"}{$data}; |
215
|
|
|
|
|
|
|
} |
216
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"count"} = $count; |
217
|
|
|
|
|
|
|
} |
218
|
6
|
|
|
|
|
15
|
return $self->{"stats"}{"Desc"}{"count"}; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Expected_value |
223
|
|
|
|
|
|
|
# the weighted average of the possible values, |
224
|
|
|
|
|
|
|
# using their probabilities as their weights |
225
|
|
|
|
|
|
|
sub mean { |
226
|
2
|
|
|
2
|
1
|
5
|
my $self = shift; |
227
|
2
|
100
|
|
|
|
8
|
if(!defined($self->{"stats"}{"Desc"}{"mean"})) { |
228
|
1
|
|
|
|
|
1
|
my $cumul_value = 0; |
229
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
230
|
1
|
|
|
|
|
2
|
my $v; |
231
|
1
|
|
|
|
|
2
|
foreach $v( keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
15
|
|
232
|
6
|
|
|
|
|
13
|
$cumul_value += $v * $self->{"data_frequency"}{$v}; |
233
|
|
|
|
|
|
|
} |
234
|
1
|
50
|
|
|
|
20
|
if($count > 0) { |
235
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
else { |
238
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"mean"} = 0; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
} |
241
|
2
|
|
|
|
|
7
|
return $self->{"stats"}{"Desc"}{"mean"}; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Median |
247
|
|
|
|
|
|
|
# the median is the numerical value separating |
248
|
|
|
|
|
|
|
# the higher half of a data sample, a population, |
249
|
|
|
|
|
|
|
# or a probability distribution, from the lower half. |
250
|
|
|
|
|
|
|
sub median { |
251
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
252
|
1
|
50
|
|
|
|
10
|
if(!defined($self->{"stats"}{"Desc"}{"median"})) { |
253
|
1
|
|
|
|
|
4
|
my $count = $self->count(); |
254
|
1
|
|
|
|
|
2
|
my $odd = 0; |
255
|
1
|
50
|
|
|
|
6
|
if($count % 2 != 0) { |
256
|
0
|
|
|
|
|
0
|
$odd = 1; |
257
|
|
|
|
|
|
|
} |
258
|
1
|
|
|
|
|
15
|
my $median_index = floor($count/2) + $odd; |
259
|
1
|
|
|
|
|
2
|
my $current_index = 0; |
260
|
|
|
|
|
|
|
|
261
|
1
|
|
|
|
|
1
|
my ($v,$i); |
262
|
1
|
|
|
|
|
2
|
foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) { |
|
8
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
9
|
|
263
|
6
|
|
|
|
|
14
|
for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) { |
264
|
10
|
|
|
|
|
9
|
$current_index++; |
265
|
10
|
100
|
|
|
|
22
|
if($current_index == $median_index) { |
266
|
1
|
|
|
|
|
3
|
$self->{"stats"}{"Desc"}{"median"} = $v; |
267
|
|
|
|
|
|
|
} |
268
|
10
|
100
|
66
|
|
|
44
|
if($current_index == ($median_index+1) and |
269
|
|
|
|
|
|
|
$odd == 0) { |
270
|
1
|
|
|
|
|
5
|
$self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
} |
276
|
1
|
|
|
|
|
5
|
return $self->{"stats"}{"Desc"}{"median"}; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Percentile |
281
|
|
|
|
|
|
|
# A percentile (or a centile) is a measure |
282
|
|
|
|
|
|
|
# used in statistics indicating the value |
283
|
|
|
|
|
|
|
# below which a given percentage of observations |
284
|
|
|
|
|
|
|
# in a group of observations fall. |
285
|
|
|
|
|
|
|
sub percentile { |
286
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
287
|
0
|
|
|
|
|
0
|
my $perc = shift; |
288
|
0
|
|
|
|
|
0
|
my $max_probability = $perc/100; |
289
|
0
|
|
|
|
|
0
|
my $cdf = $self->empirical_distribution_function(); |
290
|
0
|
|
|
|
|
0
|
my $v = 0; |
291
|
0
|
|
|
|
|
0
|
my $previous_v = 0; |
292
|
0
|
|
|
|
|
0
|
my $epsilon = 0; |
293
|
0
|
|
|
|
|
0
|
foreach $v(sort {$a<=>$b} keys %{$cdf}) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
294
|
0
|
|
|
|
|
0
|
$epsilon = $v - $previous_v; |
295
|
0
|
0
|
|
|
|
0
|
if($cdf->{$v} > $max_probability) { |
296
|
0
|
|
|
|
|
0
|
return $v; |
297
|
|
|
|
|
|
|
} |
298
|
0
|
|
|
|
|
0
|
$previous_v = $v; |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
# if the program is here then $perc == 1 |
301
|
|
|
|
|
|
|
# then we return $v+epsilon |
302
|
0
|
|
|
|
|
0
|
return $v+$epsilon; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Variance |
308
|
|
|
|
|
|
|
# The variance of a random variable X is |
309
|
|
|
|
|
|
|
# its second central moment, the expected |
310
|
|
|
|
|
|
|
# value of the squared deviation from the mean |
311
|
|
|
|
|
|
|
sub variance { |
312
|
1
|
|
|
1
|
1
|
1
|
my $self = shift; |
313
|
1
|
50
|
|
|
|
5
|
if(!defined($self->{"stats"}{"Desc"}{"variance"})) { |
314
|
1
|
|
|
|
|
3
|
my $mean = $self->mean(); |
315
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
316
|
1
|
|
|
|
|
2
|
my $cumul_value = 0; |
317
|
1
|
|
|
|
|
2
|
my $square_mean = 0; |
318
|
1
|
|
|
|
|
1
|
my $v; |
319
|
1
|
|
|
|
|
7
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
4
|
|
320
|
6
|
|
|
|
|
13
|
$cumul_value += ($v**2) * $self->{"data_frequency"}{$v}; |
321
|
|
|
|
|
|
|
} |
322
|
1
|
50
|
|
|
|
4
|
if($count > 0) { |
323
|
1
|
|
|
|
|
2
|
$square_mean = $cumul_value / $count; |
324
|
|
|
|
|
|
|
} |
325
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2); |
326
|
|
|
|
|
|
|
} |
327
|
1
|
|
|
|
|
19
|
return $self->{"stats"}{"Desc"}{"variance"}; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Standard_deviation |
332
|
|
|
|
|
|
|
# The standard deviation of a random variable |
333
|
|
|
|
|
|
|
# is the square root of its variance. |
334
|
|
|
|
|
|
|
sub standard_deviation { |
335
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
336
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Desc"}{"sdev"})) { |
337
|
0
|
|
|
|
|
|
my $variance = $self->variance(); |
338
|
0
|
|
|
|
|
|
$self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance); |
339
|
|
|
|
|
|
|
} |
340
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Desc"}{"sdev"}; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
################## Distributions Statistics ################## |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Frequency_distribution |
347
|
|
|
|
|
|
|
# the frequency distribution is an arrangement of the values |
348
|
|
|
|
|
|
|
# that one or more variables take in a sample. Each entry in |
349
|
|
|
|
|
|
|
# the table contains the frequency or count of the occurrences |
350
|
|
|
|
|
|
|
# of values within a particular group or interval |
351
|
|
|
|
|
|
|
sub frequency_distribution { |
352
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
353
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
354
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty frequency distribution. \n"; |
355
|
0
|
|
|
|
|
|
return (); # empty hash |
356
|
|
|
|
|
|
|
} |
357
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
358
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"frequency"} |
359
|
0
|
|
|
|
|
|
return $self->_frequency_distribution(); |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
else { |
362
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"frequency"} |
363
|
0
|
|
|
|
|
|
return $self->_binned_frequency_distribution(); |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Probability_mass_function |
369
|
|
|
|
|
|
|
# the probability mass function (pmf) is a function that gives |
370
|
|
|
|
|
|
|
# the probability that a discrete random variable is exactly |
371
|
|
|
|
|
|
|
# equal to some value. |
372
|
|
|
|
|
|
|
sub probability_mass_function { |
373
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
374
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
375
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty probability mass function. \n"; |
376
|
0
|
|
|
|
|
|
return (); # empty hash |
377
|
|
|
|
|
|
|
} |
378
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
379
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"pmf"} |
380
|
0
|
|
|
|
|
|
return $self->_probability_mass_function(); |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
else { |
383
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"pmf"} |
384
|
0
|
|
|
|
|
|
return $self->_binned_probability_mass_function(); |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Empirical_distribution_function |
390
|
|
|
|
|
|
|
# the empirical distribution function, or empirical cdf, is the |
391
|
|
|
|
|
|
|
# cumulative distribution function associated with the empirical |
392
|
|
|
|
|
|
|
# measure of the sample. This cdf is a step function that jumps up |
393
|
|
|
|
|
|
|
# by 1/n at each of the n data points. |
394
|
|
|
|
|
|
|
sub empirical_distribution_function { |
395
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
396
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
397
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty emptirical distribution function. \n"; |
398
|
0
|
|
|
|
|
|
return (); # empty hash |
399
|
|
|
|
|
|
|
} |
400
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
401
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"cdf"} |
402
|
0
|
|
|
|
|
|
return $self->_empirical_distribution_function(); |
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
else { |
405
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"cdf"} |
406
|
0
|
|
|
|
|
|
return $self->_binned_empirical_distribution_function(); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/CCDF |
412
|
|
|
|
|
|
|
# how often the random variable is above a particular level. |
413
|
|
|
|
|
|
|
sub complementary_cumulative_distribution_function { |
414
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
415
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
416
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n"; |
417
|
0
|
|
|
|
|
|
return (); # empty hash |
418
|
|
|
|
|
|
|
} |
419
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
420
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"ccdf"} |
421
|
0
|
|
|
|
|
|
return $self->_complementary_cumulative_distribution_function(); |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
else { |
424
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"ccdf"} |
425
|
0
|
|
|
|
|
|
return $self->_binned_complementary_cumulative_distribution_function(); |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
################## Regular Distributions Statistics ################## |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
sub _frequency_distribution { |
435
|
0
|
|
|
0
|
|
|
my $self = shift; |
436
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"frequency"})) { |
437
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"}; |
438
|
|
|
|
|
|
|
} |
439
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"frequency"}; |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
sub _probability_mass_function { |
444
|
0
|
|
|
0
|
|
|
my $self = shift; |
445
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"pmf"})) { |
446
|
0
|
|
|
|
|
|
my $count = $self->count(); |
447
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); |
448
|
0
|
|
|
|
|
|
my ($val,$freq); |
449
|
0
|
|
|
|
|
|
foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
450
|
0
|
|
|
|
|
|
$freq = $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
451
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count; |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
} |
454
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"pmf"}; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
sub _empirical_distribution_function { |
459
|
0
|
|
|
0
|
|
|
my $self = shift; |
460
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"cdf"})) { |
461
|
0
|
|
|
|
|
|
my $count = $self->count(); |
462
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); |
463
|
0
|
|
|
|
|
|
my $val; |
464
|
0
|
|
|
|
|
|
my $cumul_freq = 0; |
465
|
0
|
|
|
|
|
|
foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
466
|
0
|
|
|
|
|
|
$cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
467
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
} |
470
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"cdf"}; |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
sub _complementary_cumulative_distribution_function { |
475
|
0
|
|
|
0
|
|
|
my $self = shift; |
476
|
|
|
|
|
|
|
# warning, this creates self->{"stats"} if it doesn't exist |
477
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) { |
478
|
0
|
|
|
|
|
|
my $count = $self->count(); |
479
|
0
|
|
|
|
|
|
$self->frequency_distribution(); |
480
|
0
|
|
|
|
|
|
my $val; |
481
|
0
|
|
|
|
|
|
my $cumul_freq = $count; |
482
|
0
|
|
|
|
|
|
foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
483
|
0
|
|
|
|
|
|
$cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
484
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
} |
487
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"ccdf"}; |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
################## Binned Distributions Statistics ################## |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
sub _binned_frequency_distribution { |
496
|
0
|
|
|
0
|
|
|
my $self = shift; |
497
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) { |
498
|
|
|
|
|
|
|
# 1. get bins |
499
|
0
|
|
|
|
|
|
$self->bins(); # set $self->{"bins"} |
500
|
|
|
|
|
|
|
# 2. get regular frequency distribution |
501
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"} |
502
|
|
|
|
|
|
|
# 3. compute binned property |
503
|
0
|
|
|
|
|
|
my $binned_frequency = {}; |
504
|
0
|
|
|
|
|
|
my $i = 0; # vals iterator |
505
|
0
|
|
|
|
|
|
my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
506
|
0
|
|
|
|
|
|
my $num_vals = scalar @sorted_vals; |
507
|
0
|
|
|
|
|
|
my $bi = 0; # bins iterator |
508
|
0
|
|
|
|
|
|
my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
509
|
0
|
|
|
|
|
|
my $num_bins = scalar @sorted_bins; |
510
|
0
|
|
|
|
|
|
my $freq_sum = 0; |
511
|
0
|
|
|
|
|
|
for(; $bi < $num_bins; $bi++) { # for each bin |
512
|
0
|
|
|
|
|
|
for(; $i < $num_vals; $i++) { |
513
|
0
|
0
|
|
|
|
|
if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) { |
514
|
0
|
|
|
|
|
|
$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]} |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
else { |
517
|
0
|
|
|
|
|
|
$binned_frequency->{$sorted_bins[$bi]} = $freq_sum; |
518
|
0
|
|
|
|
|
|
$freq_sum = 0; |
519
|
0
|
|
|
|
|
|
last; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
} |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# if last element of vals then we save the current freq sum |
524
|
|
|
|
|
|
|
# in the current bin |
525
|
0
|
0
|
|
|
|
|
if($i == $num_vals) { |
526
|
0
|
|
|
|
|
|
$binned_frequency->{$sorted_bins[$bi]} = $freq_sum; |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
} |
529
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency; |
530
|
|
|
|
|
|
|
} |
531
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"frequency"}; |
532
|
|
|
|
|
|
|
} |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
sub _binned_probability_mass_function { |
536
|
0
|
|
|
0
|
|
|
my $self = shift; |
537
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) { |
538
|
0
|
|
|
|
|
|
my $count = $self->count(); |
539
|
0
|
|
|
|
|
|
my $bins = $self->bins(); # set $self->{"bins"} |
540
|
0
|
|
|
|
|
|
my $bfd = $self->_binned_frequency_distribution(); |
541
|
0
|
|
|
|
|
|
my ($val,$freq, $interval); |
542
|
0
|
|
|
|
|
|
foreach $val(keys %{$bfd}) { |
|
0
|
|
|
|
|
|
|
543
|
0
|
|
|
|
|
|
$freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val}; |
544
|
0
|
|
|
|
|
|
$interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"}; |
545
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
546
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval; |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
else { |
549
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count; |
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
} |
553
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"pmf"}; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
sub _binned_empirical_distribution_function { |
558
|
0
|
|
|
0
|
|
|
my $self = shift; |
559
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) { |
560
|
0
|
|
|
|
|
|
my $bpmf = $self->_binned_probability_mass_function(); |
561
|
0
|
|
|
|
|
|
my $val; |
562
|
0
|
|
|
|
|
|
my $cumul_prob = 0; |
563
|
0
|
|
|
|
|
|
foreach $val(sort {$a<=>$b} keys %{$bpmf}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
564
|
0
|
|
|
|
|
|
$cumul_prob += $bpmf->{$val}; |
565
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob; |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
} |
568
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"cdf"}; |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
sub _binned_complementary_cumulative_distribution_function { |
573
|
0
|
|
|
0
|
|
|
my $self = shift; |
574
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) { |
575
|
0
|
|
|
|
|
|
my $bpmf = $self->_binned_probability_mass_function(); |
576
|
0
|
|
|
|
|
|
my $val; |
577
|
0
|
|
|
|
|
|
my $cumul_prob = 0; |
578
|
0
|
|
|
|
|
|
my $old_cumul_prob = 0; |
579
|
0
|
|
|
|
|
|
foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
580
|
0
|
|
|
|
|
|
$old_cumul_prob = $cumul_prob; |
581
|
0
|
|
|
|
|
|
$cumul_prob += $bpmf->{$val}; |
582
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob; |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
} |
585
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
################## Binned Related Functions ################## |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub set_binning_type { |
593
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
594
|
0
|
|
|
|
|
|
my $bin_type = shift; |
595
|
|
|
|
|
|
|
# check bin-type validity |
596
|
0
|
0
|
0
|
|
|
|
if($bin_type != NO_BINNING && $bin_type != LIN_BINNING && |
|
|
|
0
|
|
|
|
|
597
|
|
|
|
|
|
|
$bin_type != LOG_BINNING) { |
598
|
0
|
|
|
|
|
|
die "Wrong binning type provided\n"; |
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
# if bin type is already set |
601
|
0
|
0
|
|
|
|
|
if($bin_type == $self->{"bin-type"}) { |
602
|
0
|
|
|
|
|
|
return; |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
# if bin-type is different |
605
|
0
|
|
|
|
|
|
$self->{"bin-type"} = $bin_type; |
606
|
0
|
|
|
|
|
|
delete $self->{"num-bins"}; |
607
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
608
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
sub set_optimal_binning { |
613
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
614
|
0
|
0
|
|
|
|
|
if( $self->{"optimal-binning"} == 0) { |
615
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
616
|
0
|
|
|
|
|
|
delete $self->{"num-bins"}; |
617
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
618
|
|
|
|
|
|
|
} |
619
|
0
|
|
|
|
|
|
$self->{"optimal-binning"} = 1; |
620
|
|
|
|
|
|
|
} |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
sub set_custom_num_bins { |
624
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
625
|
0
|
|
|
|
|
|
my $nb = shift; |
626
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
627
|
0
|
|
|
|
|
|
die "Set a binning type before setting the num of bins\n"; |
628
|
|
|
|
|
|
|
} |
629
|
0
|
0
|
|
|
|
|
if( $nb < 2 ) { |
630
|
0
|
|
|
|
|
|
die "Wrong number of bins provided: " . $nb . "\n"; |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
# check if there is a change |
633
|
0
|
0
|
0
|
|
|
|
if( $self->{"optimal-binning"} == 0 and |
|
|
|
0
|
|
|
|
|
634
|
|
|
|
|
|
|
defined ($self->{"num-bins"}) and |
635
|
|
|
|
|
|
|
$self->{"num-bins"} == $nb) { |
636
|
|
|
|
|
|
|
# nothing changed -> nothing to do |
637
|
|
|
|
|
|
|
} |
638
|
|
|
|
|
|
|
else { |
639
|
|
|
|
|
|
|
# update binning parameters |
640
|
0
|
|
|
|
|
|
$self->{"optimal-binning"} = 0; |
641
|
0
|
|
|
|
|
|
$self->{"num-bins"} = $nb; |
642
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
643
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
sub bins { |
649
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
650
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
651
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty bins. \n"; |
652
|
0
|
|
|
|
|
|
return {}; # empty array |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
|
655
|
0
|
0
|
|
|
|
|
if(!defined($self->{"bins"})) { |
656
|
|
|
|
|
|
|
# CASE 1: if no_binning is set we do not |
657
|
|
|
|
|
|
|
# save the binning structure |
658
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == NO_BINNING) { |
659
|
0
|
|
|
|
|
|
my $curbins = {}; |
660
|
0
|
|
|
|
|
|
my $v; |
661
|
0
|
|
|
|
|
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
662
|
0
|
|
|
|
|
|
my $ref = $v; |
663
|
0
|
|
|
|
|
|
$curbins->{$v}{"left"} = $v; |
664
|
0
|
|
|
|
|
|
$curbins->{$v}{"right"} = $v; |
665
|
|
|
|
|
|
|
} |
666
|
0
|
|
|
|
|
|
return $curbins; |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
# CASE 2: if a custom number of bins is |
670
|
|
|
|
|
|
|
# provided, we lin-/log- bin the support |
671
|
|
|
|
|
|
|
# accordingly |
672
|
0
|
0
|
|
|
|
|
if($self->{"optimal-binning"} == 0) { |
673
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == LIN_BINNING) { |
674
|
0
|
|
|
|
|
|
$self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"}); |
675
|
|
|
|
|
|
|
} |
676
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == LOG_BINNING) { |
677
|
0
|
|
|
|
|
|
$self->{"bins"} = $self->compute_log_bins($self->{"num-bins"}); |
678
|
|
|
|
|
|
|
} |
679
|
|
|
|
|
|
|
} |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# CASE 3: optimal binning |
682
|
|
|
|
|
|
|
else { |
683
|
0
|
|
|
|
|
|
my $res = $self->_optimal_binning(); |
684
|
0
|
0
|
|
|
|
|
if($res != 0) { |
685
|
0
|
|
|
|
|
|
print "Optimal binning was not possible on this sample, "; |
686
|
0
|
|
|
|
|
|
print "using the NO_BINNING mode" . "\n"; |
687
|
0
|
|
|
|
|
|
my $curbins = {}; |
688
|
0
|
|
|
|
|
|
my $v; |
689
|
0
|
|
|
|
|
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
690
|
0
|
|
|
|
|
|
my $ref = $v; |
691
|
0
|
|
|
|
|
|
$curbins->{$v}{"left"} = $v; |
692
|
0
|
|
|
|
|
|
$curbins->{$v}{"right"} = $v; |
693
|
|
|
|
|
|
|
} |
694
|
0
|
|
|
|
|
|
$self->{"bin-type"} = NO_BINNING; |
695
|
0
|
|
|
|
|
|
$self->{"bins"} = $curbins; |
696
|
|
|
|
|
|
|
} |
697
|
|
|
|
|
|
|
} |
698
|
|
|
|
|
|
|
} |
699
|
0
|
|
|
|
|
|
return $self->{"bins"}; |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
################## Optimal Binning Related Functions ################## |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
# Compute the optimal binning |
706
|
|
|
|
|
|
|
sub _optimal_binning { |
707
|
0
|
|
|
0
|
|
|
my $self = shift; |
708
|
|
|
|
|
|
|
# LEGEND: |
709
|
|
|
|
|
|
|
# b_* => bins |
710
|
|
|
|
|
|
|
# f_* => binned frequency distribution |
711
|
|
|
|
|
|
|
# d_* => binned density distribution |
712
|
|
|
|
|
|
|
# s_* => sum of density distribution values |
713
|
|
|
|
|
|
|
# |
714
|
|
|
|
|
|
|
# start from extreme values ( min_num_bins and max_num_bins) |
715
|
|
|
|
|
|
|
# and use the bisection method on the interval |
716
|
|
|
|
|
|
|
# till the optimal is found (i.e. the binning such that |
717
|
|
|
|
|
|
|
# the sum of the density function values is the closest |
718
|
|
|
|
|
|
|
# to 1) |
719
|
0
|
|
|
|
|
|
my $min_num_bins = 2; |
720
|
0
|
|
|
|
|
|
my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins); |
721
|
0
|
|
|
|
|
|
my $max_num_bins = scalar $self->_full_support(); |
722
|
0
|
0
|
|
|
|
|
if($max_num_bins == 1) { |
723
|
0
|
|
|
|
|
|
return -1; |
724
|
|
|
|
|
|
|
} |
725
|
0
|
|
|
|
|
|
my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins); |
726
|
|
|
|
|
|
|
# initial check |
727
|
0
|
0
|
0
|
|
|
|
if( $s_min > 1 or $s_max < 1) { |
728
|
0
|
|
|
|
|
|
return -1; |
729
|
|
|
|
|
|
|
} |
730
|
0
|
|
|
|
|
|
my $avg_num_bins; |
731
|
0
|
|
|
|
|
|
my ($b_avg, $f_avg, $d_avg, $s_avg); |
732
|
0
|
|
|
|
|
|
while( ($max_num_bins - $min_num_bins) > 1) { |
733
|
0
|
|
|
|
|
|
$avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2)); |
734
|
0
|
|
|
|
|
|
($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins); |
735
|
0
|
0
|
|
|
|
|
if($s_avg < 1) { |
736
|
0
|
|
|
|
|
|
$min_num_bins = $avg_num_bins; |
737
|
0
|
|
|
|
|
|
($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg); |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
else { |
740
|
0
|
|
|
|
|
|
$max_num_bins = $avg_num_bins; |
741
|
0
|
|
|
|
|
|
($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg); |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
# the binning whose density sum is closer to 1 win |
745
|
0
|
0
|
|
|
|
|
if( (1 - $s_min) < ($s_max-1) ) { |
746
|
0
|
|
|
|
|
|
$self->{"bins"} = $b_min; |
747
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"frequency"} = $f_min; |
748
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"pmf"} = $d_min; |
749
|
|
|
|
|
|
|
} |
750
|
|
|
|
|
|
|
else { |
751
|
0
|
|
|
|
|
|
$self->{"bins"} = $b_max; |
752
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"frequency"} = $f_max; |
753
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"pmf"} = $d_max; |
754
|
|
|
|
|
|
|
} |
755
|
0
|
|
|
|
|
|
return 0; |
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# bin the current data into $num_bins intervals |
760
|
|
|
|
|
|
|
# and return: the bins, the binned frequency distribution |
761
|
|
|
|
|
|
|
# the binned density distribution and the sum of the |
762
|
|
|
|
|
|
|
# density values |
763
|
|
|
|
|
|
|
sub _bin_attempt { |
764
|
0
|
|
|
0
|
|
|
my $self = shift; |
765
|
0
|
|
|
|
|
|
my $num_bins = shift; |
766
|
|
|
|
|
|
|
# output values |
767
|
0
|
|
|
|
|
|
my ($cur_bins, $cur_freq, $cur_density, $cur_sum); |
768
|
0
|
|
|
|
|
|
$cur_sum = 0; |
769
|
|
|
|
|
|
|
# get the bins |
770
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == LIN_BINNING ) { |
771
|
0
|
|
|
|
|
|
$cur_bins = $self->compute_lin_bins($num_bins); |
772
|
|
|
|
|
|
|
} |
773
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == LOG_BINNING ) { |
774
|
0
|
|
|
|
|
|
$cur_bins =$self->compute_log_bins($num_bins); |
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
# assert non binned frequency distribution is computed |
777
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"} |
778
|
0
|
|
|
|
|
|
my $count = $self->count(); |
779
|
|
|
|
|
|
|
# |
780
|
0
|
|
|
|
|
|
my $s = 0; # support iterator |
781
|
0
|
|
|
|
|
|
my @sorted_support = (sort {$a<=>$b} $self->_full_support()); |
|
0
|
|
|
|
|
|
|
782
|
0
|
|
|
|
|
|
my $sup_size = scalar @sorted_support; |
783
|
0
|
|
|
|
|
|
my $bi = 0; # bins iterator |
784
|
0
|
|
|
|
|
|
my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
785
|
0
|
|
|
|
|
|
my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins) |
786
|
0
|
|
|
|
|
|
my $ref = 0; |
787
|
0
|
|
|
|
|
|
my $freq_sum = 0; |
788
|
0
|
|
|
|
|
|
my $interval = 0; |
789
|
0
|
|
|
|
|
|
for(; $bi < $n_bins; $bi++) { # for each bin |
790
|
0
|
|
|
|
|
|
$ref = $sorted_bins[$bi]; |
791
|
0
|
|
|
|
|
|
$interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"}; |
792
|
0
|
|
|
|
|
|
for(; $s < $sup_size; $s++) { # for each support value |
793
|
|
|
|
|
|
|
# if the current support value is less than the rightmost value in the bin |
794
|
0
|
0
|
|
|
|
|
if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) { |
795
|
0
|
|
|
|
|
|
$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]}; |
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
# else we have to save the information for the current bin |
798
|
|
|
|
|
|
|
else { |
799
|
0
|
|
|
|
|
|
$cur_freq->{$ref} = $freq_sum; |
800
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
801
|
0
|
|
|
|
|
|
$cur_density->{$ref} = ($freq_sum / $count) / $interval; |
802
|
|
|
|
|
|
|
} |
803
|
|
|
|
|
|
|
else { |
804
|
0
|
|
|
|
|
|
$cur_density->{$ref} = $freq_sum / $count; |
805
|
|
|
|
|
|
|
} |
806
|
0
|
|
|
|
|
|
$cur_sum += $cur_density->{$ref}; |
807
|
0
|
|
|
|
|
|
$freq_sum = 0; |
808
|
0
|
|
|
|
|
|
last; # we do not increment s |
809
|
|
|
|
|
|
|
} |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
# if last element of the support has been reached |
812
|
|
|
|
|
|
|
# we have to save the current freq sum in the current bin |
813
|
0
|
0
|
|
|
|
|
if($s == $sup_size) { |
814
|
0
|
|
|
|
|
|
$cur_freq->{$ref} = $freq_sum; |
815
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
816
|
0
|
|
|
|
|
|
$cur_density->{$ref} = ($freq_sum / $count) / $interval; |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
else { |
819
|
0
|
|
|
|
|
|
$cur_density->{$ref} = $freq_sum / $count; |
820
|
|
|
|
|
|
|
} |
821
|
0
|
|
|
|
|
|
$cur_sum += $cur_density->{$ref}; |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
} |
824
|
|
|
|
|
|
|
#DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n"; |
825
|
0
|
|
|
|
|
|
return ($cur_bins, $cur_freq, $cur_density, $cur_sum); |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
sub _full_support { |
831
|
0
|
|
|
0
|
|
|
my $self = shift; |
832
|
0
|
|
|
|
|
|
my $support = {}; |
833
|
0
|
|
|
|
|
|
my $v; |
834
|
0
|
|
|
|
|
|
foreach $v( keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
835
|
0
|
|
|
|
|
|
$support->{$v} = 1; |
836
|
|
|
|
|
|
|
} |
837
|
0
|
|
|
|
|
|
return (keys %{$support}); |
|
0
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
} |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
################## Utilities ################## |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
# return a structure describing a linear binning |
846
|
|
|
|
|
|
|
# of the current data ($num_bins provided) |
847
|
|
|
|
|
|
|
sub compute_lin_bins { |
848
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
849
|
0
|
|
|
|
|
|
my $num_bins = shift; |
850
|
0
|
|
|
|
|
|
my $min = $self->minimum(); |
851
|
0
|
|
|
|
|
|
my $max = $self->maximum(); |
852
|
0
|
|
|
|
|
|
my $linbins = {}; # reference to empty hash |
853
|
|
|
|
|
|
|
# last bin includes just the maximum value |
854
|
0
|
|
|
|
|
|
my $delta = abs($max - $min)/($num_bins-1); |
855
|
0
|
|
|
|
|
|
my $i; |
856
|
|
|
|
|
|
|
my $ref; |
857
|
0
|
|
|
|
|
|
for($i=0; $i < $num_bins; $i++) { |
858
|
0
|
|
|
|
|
|
$ref = $min + $i * $delta + $delta/2; |
859
|
0
|
|
|
|
|
|
$linbins->{$ref}{"left"} = $min + $i * $delta; |
860
|
0
|
|
|
|
|
|
$linbins->{$ref}{"right"} = $min + $i * $delta + $delta; |
861
|
|
|
|
|
|
|
} |
862
|
0
|
|
|
|
|
|
return $linbins; |
863
|
|
|
|
|
|
|
} |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
# return a structure describing a logarithmic binning |
867
|
|
|
|
|
|
|
# of the current data ($num_bins provided) |
868
|
|
|
|
|
|
|
# bins are not set |
869
|
|
|
|
|
|
|
sub compute_log_bins { |
870
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
871
|
0
|
|
|
|
|
|
my $num_bins = shift; |
872
|
0
|
|
|
|
|
|
my $min = $self->minimum(); |
873
|
0
|
|
|
|
|
|
my $max = $self->maximum(); |
874
|
0
|
0
|
0
|
|
|
|
if($min == 0 or $max == 0 or $min *$max <0) { |
|
|
|
0
|
|
|
|
|
875
|
|
|
|
|
|
|
# if the interval contains zero, then we cannot logbin |
876
|
|
|
|
|
|
|
# we return the linear binning |
877
|
0
|
|
|
|
|
|
print "Logarithmic binning was not possible on this sample, "; |
878
|
0
|
|
|
|
|
|
print "switching to linear binning mode" . "\n"; |
879
|
0
|
|
|
|
|
|
$self->{"bin-type"} = LIN_BINNING; |
880
|
0
|
|
|
|
|
|
return $self->compute_lin_bins($num_bins); |
881
|
|
|
|
|
|
|
} |
882
|
0
|
|
|
|
|
|
my $logbins = {}; # reference to empty hash |
883
|
0
|
|
|
|
|
|
my $min_exp = log($min)/log(LOG_BASE); |
884
|
0
|
|
|
|
|
|
my $max_exp = log($max+1)/log(LOG_BASE); |
885
|
0
|
|
|
|
|
|
my $delta_exp = abs($max_exp - $min_exp)/$num_bins; |
886
|
0
|
|
|
|
|
|
my $cur_exp; |
887
|
|
|
|
|
|
|
my $i; |
888
|
0
|
|
|
|
|
|
my $ref; |
889
|
0
|
|
|
|
|
|
for($i=0; $i < $num_bins; $i++) { |
890
|
0
|
|
|
|
|
|
$cur_exp = $min_exp + $i * $delta_exp; |
891
|
0
|
|
|
|
|
|
$ref = LOG_BASE ** ($cur_exp + $delta_exp/2); |
892
|
0
|
|
|
|
|
|
$logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp; |
893
|
0
|
|
|
|
|
|
$logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp); |
894
|
|
|
|
|
|
|
} |
895
|
0
|
|
|
|
|
|
return $logbins; |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
1; |