line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
23
|
|
|
23
|
|
1406756
|
use 5.006; |
|
23
|
|
|
|
|
334
|
|
2
|
23
|
|
|
23
|
|
147
|
use strict; |
|
23
|
|
|
|
|
52
|
|
|
23
|
|
|
|
|
623
|
|
3
|
23
|
|
|
23
|
|
125
|
use warnings; |
|
23
|
|
|
|
|
61
|
|
|
23
|
|
|
|
|
1692
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
package Statistics::Descriptive::LogScale; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale - Memory-efficient approximate univariate |
10
|
|
|
|
|
|
|
descriptive statistics class. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.10 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = 0.10; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head2 Basic usage |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
The basic usage is roughly the same as that of L. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
27
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
while(<>) { |
30
|
|
|
|
|
|
|
chomp; |
31
|
|
|
|
|
|
|
$stat->add_data($_); |
32
|
|
|
|
|
|
|
}; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# This can also be done in O(1) memory, precisely |
35
|
|
|
|
|
|
|
printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation; |
36
|
|
|
|
|
|
|
# This requires storing actual data, or approximating |
37
|
|
|
|
|
|
|
printf "25%% : %f\n", $stat->percentile(25); |
38
|
|
|
|
|
|
|
printf "Median: %f\n", $stat->median; |
39
|
|
|
|
|
|
|
printf "75%% : %f\n", $stat->percentile(75); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 Save/load |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
This is not present in L. |
44
|
|
|
|
|
|
|
The save/load interface is designed compatible with JSON::XS. |
45
|
|
|
|
|
|
|
However, any other serializer can be used. |
46
|
|
|
|
|
|
|
The C method is I to return unblessed hashref |
47
|
|
|
|
|
|
|
with enough information to restore the original object. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
50
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# ..... much later |
53
|
|
|
|
|
|
|
# Save |
54
|
|
|
|
|
|
|
print $fd encoder_of_choice( $stat->TO_JSON ) |
55
|
|
|
|
|
|
|
or die "Failed to save: $!"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# ..... and even later |
58
|
|
|
|
|
|
|
# Load |
59
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $raw_data ); |
60
|
|
|
|
|
|
|
my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash ); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Import into existing LogScale instance |
63
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $more_raw_data ); |
64
|
|
|
|
|
|
|
$copy_of_stat->add_data_hash( $plain_hash->{data} ); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 Histograms |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Both L and L |
69
|
|
|
|
|
|
|
offer C method for querying data point counts. |
70
|
|
|
|
|
|
|
However, there's also C method for making pretty pictures. |
71
|
|
|
|
|
|
|
Here's a simple text-based histogram. |
72
|
|
|
|
|
|
|
A proper GD example was too long to fit into this margin. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
use strict; |
75
|
|
|
|
|
|
|
use warnings; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
78
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# collect/load data ... |
81
|
|
|
|
|
|
|
my $re_float = qr([-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?); |
82
|
|
|
|
|
|
|
while (<>) { |
83
|
|
|
|
|
|
|
$stat->add_data($_) for m/($re_float)/g; |
84
|
|
|
|
|
|
|
}; |
85
|
|
|
|
|
|
|
die "Empty set" |
86
|
|
|
|
|
|
|
unless $stat->count; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# get data in [ count, lower_bound, upper_bound ] format as arrayref |
89
|
|
|
|
|
|
|
my $hist = $stat->histogram( count => 20 ); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# find maximum value to use as a scale factor |
92
|
|
|
|
|
|
|
my $scale = $hist->[0][0]; |
93
|
|
|
|
|
|
|
$scale < $_->[0] and $scale = $_->[0] for @$hist; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
foreach (@$hist) { |
96
|
|
|
|
|
|
|
printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale); |
97
|
|
|
|
|
|
|
}; |
98
|
|
|
|
|
|
|
printf "%10f\n", $hist->[-1][2]; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 DESCRIPTION |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
This module aims at providing some advanced statistical functions without |
103
|
|
|
|
|
|
|
storing all data in memory, at the cost of certain (predictable) precision loss. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Data is represented by a set of bins that only store counts of fitting |
106
|
|
|
|
|
|
|
data points. |
107
|
|
|
|
|
|
|
Most bins are logarithmic, i.e. lower end / upper end ratio is constant. |
108
|
|
|
|
|
|
|
However, around zero linear approximation may be user instead |
109
|
|
|
|
|
|
|
(see "linear_width" and "linear_thresh" parameters in new()). |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
All operations are then performed on the bins, introducing relative error |
112
|
|
|
|
|
|
|
which does not, however, exceed the bins' relative width ("base"). |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 METHODS |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=cut |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
######################################################################## |
119
|
|
|
|
|
|
|
# == HOW IT WORKS == |
120
|
|
|
|
|
|
|
# Buckets are stored in a hash: { $value => $count, ... } |
121
|
|
|
|
|
|
|
# {base} is bin width, {logbase} == log {base} (cache) |
122
|
|
|
|
|
|
|
# {linear_thresh} is where we switch to equal bin approximation |
123
|
|
|
|
|
|
|
# {linear_width} is width of bin around zero (==linear_thresh if not given) |
124
|
|
|
|
|
|
|
# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor} |
125
|
|
|
|
|
|
|
# Nearly all meaningful subs have to scan all the bins, which is bad, |
126
|
|
|
|
|
|
|
# but anyway better than scanning full sample. |
127
|
|
|
|
|
|
|
|
128
|
23
|
|
|
23
|
|
160
|
use Carp; |
|
23
|
|
|
|
|
56
|
|
|
23
|
|
|
|
|
1504
|
|
129
|
23
|
|
|
23
|
|
7634
|
use POSIX qw(floor ceil); |
|
23
|
|
|
|
|
136190
|
|
|
23
|
|
|
|
|
165
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Fields are NOT used internally for now, so this is just a declaration |
132
|
23
|
|
|
|
|
104
|
use fields qw( |
133
|
|
|
|
|
|
|
data count |
134
|
|
|
|
|
|
|
linear_width base logbase floor linear_thresh logfloor |
135
|
|
|
|
|
|
|
cache |
136
|
23
|
|
|
23
|
|
41360
|
); |
|
23
|
|
|
|
|
34367
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Some internal constants |
139
|
|
|
|
|
|
|
# This is for infinite portability^W^W portable infinity |
140
|
|
|
|
|
|
|
my $INF = 9**9**9; |
141
|
|
|
|
|
|
|
my $re_num = qr/(?:[-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?)/; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 new( %options ) |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
%options may include: |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=over |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives |
150
|
|
|
|
|
|
|
1% precision and exact decimal powers. |
151
|
|
|
|
|
|
|
This value represents acceptable relative error in analysis results. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
B Actual value may be slightly less than requested one. |
154
|
|
|
|
|
|
|
This is done so to avoid troubles with future rounding in (de)serialization. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item * linear_width - width of linear bins around zero. |
157
|
|
|
|
|
|
|
This value represents precision of incoming data. |
158
|
|
|
|
|
|
|
Default is zero, i.e. we assume that the measurement is precise. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
161
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=item * linear_thresh - where to switch to linear approximation. |
164
|
|
|
|
|
|
|
If only one of C and C is given, |
165
|
|
|
|
|
|
|
the other will be calculated. |
166
|
|
|
|
|
|
|
However, user may want to specify both in some cases. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
169
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item * data - hashref with C<{ value => weight }> for initializing data. |
172
|
|
|
|
|
|
|
Used for cloning. |
173
|
|
|
|
|
|
|
See C. |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item * zero_thresh - absolute value threshold below which everything is |
176
|
|
|
|
|
|
|
considered zero. |
177
|
|
|
|
|
|
|
DEPRECATED, C and C override this if given. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=back |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
my @new_keys = qw( base linear_thresh linear_width zero_thresh data ); |
184
|
|
|
|
|
|
|
# TODO Throw if extra options given? |
185
|
|
|
|
|
|
|
sub new { |
186
|
42
|
|
|
42
|
1
|
24214
|
my $class = shift; |
187
|
42
|
|
|
|
|
187
|
my %opt = @_; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# base for logarithmic bins, sane default: +-1%, exact decimal powers |
190
|
|
|
|
|
|
|
# UGLY HACK number->string->number to avoid |
191
|
|
|
|
|
|
|
# future serialization inconsistencies |
192
|
42
|
|
100
|
|
|
275
|
$opt{base} ||= 10**(1/232); |
193
|
42
|
|
|
|
|
376
|
$opt{base} = 0 + "$opt{base}"; |
194
|
42
|
50
|
|
|
|
170
|
$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1"; |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# calculate where to switch to linear approximation |
197
|
|
|
|
|
|
|
# the condition is: linear bin( thresh ) ~~ log bin( thresh ) |
198
|
|
|
|
|
|
|
# i.e. thresh * base - thresh ~~ absolute error * 2 |
199
|
|
|
|
|
|
|
# i.e. thresh ~~ absolute_error * 2 / (base - 1) |
200
|
|
|
|
|
|
|
# also support legacy API (zero_thresh) |
201
|
42
|
100
|
|
|
|
150
|
if (defined $opt{linear_thresh} ) { |
202
|
13
|
|
100
|
|
|
61
|
$opt{linear_width} ||= $opt{linear_thresh} * ($opt{base}-1); |
203
|
|
|
|
|
|
|
} else { |
204
|
29
|
|
|
|
|
83
|
$opt{linear_thresh} = $opt{zero_thresh}; |
205
|
|
|
|
|
|
|
}; |
206
|
|
|
|
|
|
|
$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1) |
207
|
42
|
100
|
100
|
|
|
217
|
if $opt{linear_width} and !$opt{linear_thresh}; |
208
|
42
|
|
100
|
|
|
206
|
$opt{linear_thresh} ||= 0; |
209
|
42
|
50
|
|
|
|
172
|
$opt{linear_thresh} >= 0 |
210
|
|
|
|
|
|
|
or croak __PACKAGE__.": new(): linear_thresh must be >= 0"; |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# Can't use fields::new anymore |
213
|
|
|
|
|
|
|
# due to JSON::XS incompatibility with restricted hashes |
214
|
42
|
|
|
|
|
135
|
my $self = bless {}, $class; |
215
|
|
|
|
|
|
|
|
216
|
42
|
|
|
|
|
198
|
$self->{base} = $opt{base}; |
217
|
|
|
|
|
|
|
# cache values to ease calculations |
218
|
|
|
|
|
|
|
# floor = (lower end of bin) / (center of bin) |
219
|
42
|
|
|
|
|
187
|
$self->{floor} = 2/(1+$opt{base}); |
220
|
42
|
|
|
|
|
186
|
$self->{logbase} = log $opt{base}; |
221
|
42
|
|
|
|
|
127
|
$self->{logfloor} = log $self->{floor}; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# bootstrap linear_thresh - make it fit bin edge |
224
|
42
|
|
|
|
|
109
|
$self->{linear_width} = $self->{linear_thresh} = 0; |
225
|
42
|
|
|
|
|
192
|
$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} ); |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# divide anything below linear_thresh into odd number of bins |
228
|
|
|
|
|
|
|
# not exceeding requested linear_width |
229
|
42
|
100
|
|
|
|
146
|
if ($self->{linear_thresh}) { |
230
|
15
|
|
66
|
|
|
64
|
my $linear_width = $opt{linear_width} || 2 * $self->{linear_thresh}; |
231
|
15
|
|
|
|
|
76
|
my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width)); |
232
|
15
|
100
|
|
|
|
63
|
$n_linear++ unless $n_linear % 2; |
233
|
15
|
|
|
|
|
48
|
$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear); |
234
|
|
|
|
|
|
|
}; |
235
|
|
|
|
|
|
|
|
236
|
42
|
|
|
|
|
177
|
$self->clear; |
237
|
42
|
100
|
|
|
|
139
|
if ($opt{data}) { |
238
|
10
|
|
|
|
|
37
|
$self->add_data_hash($opt{data}); |
239
|
|
|
|
|
|
|
}; |
240
|
42
|
|
|
|
|
206
|
return $self; |
241
|
|
|
|
|
|
|
}; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=head1 General statistical methods |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
These methods are used to query the distribution properties. They generally |
246
|
|
|
|
|
|
|
follow the interface of L and co, |
247
|
|
|
|
|
|
|
with minor additions. |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
All methods return C on empty data set, except for |
250
|
|
|
|
|
|
|
C, C, C, C and C |
251
|
|
|
|
|
|
|
which all return 0. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
B This module caches whatever it calculates very agressively. |
254
|
|
|
|
|
|
|
Don't hesitate to use statistical functions (except for sum_of/mean_of) |
255
|
|
|
|
|
|
|
more than once. The cache is deleted upon data entry. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 clear |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Destroy all stored data. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub clear { |
264
|
56
|
|
|
56
|
1
|
8319
|
my $self = shift; |
265
|
56
|
|
|
|
|
2266
|
$self->{data} = {}; |
266
|
56
|
|
|
|
|
181
|
$self->{count} = 0; |
267
|
56
|
|
|
|
|
150
|
delete $self->{cache}; |
268
|
56
|
|
|
|
|
115
|
return $self; |
269
|
|
|
|
|
|
|
}; |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=head2 add_data( @data ) |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Add numbers to the data pool. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef), |
278
|
|
|
|
|
|
|
an exception is thrown and only partial data gets inserted. |
279
|
|
|
|
|
|
|
The state of object is guaranteed to remain consistent in such case. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
B Cache is reset, even if no data was actually inserted. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
284
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
285
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=cut |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
sub add_data { |
290
|
51213
|
|
|
51213
|
1
|
162300
|
my $self = shift; |
291
|
|
|
|
|
|
|
|
292
|
51213
|
|
|
|
|
71115
|
delete $self->{cache}; |
293
|
51213
|
|
|
|
|
80032
|
foreach (@_) { |
294
|
90141
|
|
|
|
|
152375
|
$self->{data}{ $self->_round($_) }++; |
295
|
90138
|
|
|
|
|
167178
|
$self->{count}++; |
296
|
|
|
|
|
|
|
}; |
297
|
51210
|
|
|
|
|
92938
|
$self; |
298
|
|
|
|
|
|
|
}; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head2 count |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
Returns number of data points. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=cut |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
sub count { |
307
|
69
|
|
|
69
|
1
|
10188
|
my $self = shift; |
308
|
69
|
|
|
|
|
284
|
return $self->{count}; |
309
|
|
|
|
|
|
|
}; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=head2 min |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=head2 max |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Values of minimal and maximal bins. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
NOTE: Due to rounding, some of the actual inserted values may fall outside |
318
|
|
|
|
|
|
|
of the min..max range. This may change in the future. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub min { |
323
|
|
|
|
|
|
|
my $self = shift; |
324
|
|
|
|
|
|
|
return $self->_sort->[0]; |
325
|
|
|
|
|
|
|
}; |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub max { |
328
|
|
|
|
|
|
|
my $self = shift; |
329
|
|
|
|
|
|
|
return $self->_sort->[-1]; |
330
|
|
|
|
|
|
|
}; |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head2 sample_range |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Return sample range of the dataset, i.e. max() - min(). |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=cut |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
sub sample_range { |
339
|
5
|
|
|
5
|
1
|
739
|
my $self = shift; |
340
|
5
|
100
|
|
|
|
16
|
return $self->count ? $self->max - $self->min : undef; |
341
|
|
|
|
|
|
|
}; |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 sum |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Return sum of all data points. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=cut |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
sub sum { |
350
|
|
|
|
|
|
|
my $self = shift; |
351
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] }); |
352
|
|
|
|
|
|
|
}; |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head2 sumsq |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Return sum of squares of all datapoints. |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=cut |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub sumsq { |
361
|
|
|
|
|
|
|
my $self = shift; |
362
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] * $_[0] }); |
363
|
|
|
|
|
|
|
}; |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head2 mean |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Return mean, or average value, i.e. sum()/count(). |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub mean { |
372
|
|
|
|
|
|
|
my $self = shift; |
373
|
|
|
|
|
|
|
return $self->{count} ? $self->sum / $self->{count} : undef; |
374
|
|
|
|
|
|
|
}; |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 variance |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=head2 variance( $correction ) |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Return data variance, i.e. E((x - E(x)) ** 2). |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
383
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
386
|
|
|
|
|
|
|
This is not yet accounted for. |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=cut |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub variance { |
391
|
|
|
|
|
|
|
my $self = shift; |
392
|
|
|
|
|
|
|
my $correction = shift; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
# in fact we'll receive correction='' because of how cache works |
395
|
|
|
|
|
|
|
$correction = 1 unless defined $correction and length $correction; |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
return 0 if ($self->{count} < 1 + $correction); |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
my $var = $self->sumsq - $self->sum**2 / $self->{count}; |
400
|
|
|
|
|
|
|
return $var <= 0 ? 0 : $var / ( $self->{count} - $correction ); |
401
|
|
|
|
|
|
|
}; |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head2 standard_deviation |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head2 standard_deviation( $correction ) |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
=head2 std_dev |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
Return standard deviation, i.e. square root of variance. |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
412
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
415
|
|
|
|
|
|
|
This is not yet accounted for. |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
=cut |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub standard_deviation { |
420
|
|
|
|
|
|
|
my $self = shift; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
return sqrt($self->variance(@_)); |
423
|
|
|
|
|
|
|
}; |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head2 cdf ($x) |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
Cumulative distribution function. Returns estimated probability of |
428
|
|
|
|
|
|
|
random data point from the sample being less than C<$x>. |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
As a special case, C accounts for I of zeroth bin count (if any). |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full, but appears in |
433
|
|
|
|
|
|
|
L. |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=head2 cdf ($x, $y) |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
Returns probability of a value being between C<$x> and C<$y> ($x <= $y). |
438
|
|
|
|
|
|
|
This is essentially C. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=cut |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
sub cdf { |
443
|
4
|
|
|
4
|
1
|
9
|
my $self = shift; |
444
|
4
|
50
|
|
|
|
11
|
return unless $self->{count}; |
445
|
4
|
|
|
|
|
10
|
return $self->_count(@_) / $self->{count}; |
446
|
|
|
|
|
|
|
}; |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=head2 percentile( $n ) |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Find $n-th percentile, i.e. a value below which lies $n % of the data. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
0-th percentile is by definition -inf and is returned as undef |
453
|
|
|
|
|
|
|
(see Statistics::Descriptive). |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
$n is a real number, not necessarily integer. |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=cut |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub percentile { |
460
|
48
|
|
|
48
|
1
|
3068
|
my $self = shift; |
461
|
48
|
|
|
|
|
80
|
my $x = shift; |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# assert 0<=$x<=100 |
464
|
48
|
50
|
33
|
|
|
218
|
croak __PACKAGE__.": percentile() argument must be between 0 and 100" |
465
|
|
|
|
|
|
|
unless 0<= $x and $x <= 100; |
466
|
|
|
|
|
|
|
|
467
|
48
|
|
|
|
|
124
|
my $need = $x * $self->{count} / 100; |
468
|
48
|
100
|
|
|
|
137
|
return if $need < 1; |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
# dichotomize |
471
|
|
|
|
|
|
|
# $i is lowest value >= needed |
472
|
|
|
|
|
|
|
# $need doesnt exceed last bin! |
473
|
42
|
|
|
|
|
99
|
my $i = _bin_search_ge( $self->_probability, $need ); |
474
|
42
|
|
|
|
|
94
|
return $self->_sort->[ $i ]; |
475
|
|
|
|
|
|
|
}; |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head2 quantile( 0..4 ) |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
From Statistics::Descriptive manual: |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
0 => zero quartile (Q0) : minimal value |
482
|
|
|
|
|
|
|
1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile |
483
|
|
|
|
|
|
|
2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile |
484
|
|
|
|
|
|
|
3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile |
485
|
|
|
|
|
|
|
4 => fourth quartile (Q4) : maximal value |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=cut |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub quantile { |
490
|
|
|
|
|
|
|
my $self = shift; |
491
|
|
|
|
|
|
|
my $t = shift; |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
croak (__PACKAGE__.": quantile() argument must be one of 0..4") |
494
|
|
|
|
|
|
|
unless $t =~ /^[0-4]$/; |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
$t or return $self->min; |
497
|
|
|
|
|
|
|
return $self->percentile($t * 100 / 4); |
498
|
|
|
|
|
|
|
}; |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=head2 median |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
Return median of data, a value that divides the sample in half. |
503
|
|
|
|
|
|
|
Same as percentile(50). |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=cut |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
sub median { |
508
|
16
|
|
|
16
|
1
|
1171
|
my $self = shift; |
509
|
16
|
|
|
|
|
46
|
return $self->percentile(50); |
510
|
|
|
|
|
|
|
}; |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head2 trimmed_mean( $ltrim, [ $utrim ] ) |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
Return mean of sample with $ltrim and $utrim fraction of data points |
515
|
|
|
|
|
|
|
remover from lower and upper ends respectively. |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
ltrim defaults to 0, and rtrim to ltrim. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=cut |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
sub trimmed_mean { |
522
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
523
|
1
|
|
|
|
|
2
|
my ($lower, $upper) = @_; |
524
|
1
|
|
50
|
|
|
3
|
$lower ||= 0; |
525
|
1
|
50
|
|
|
|
4
|
$upper = $lower unless defined $upper; |
526
|
|
|
|
|
|
|
|
527
|
1
|
|
|
|
|
4
|
my $min = $self->percentile($lower * 100); |
528
|
1
|
|
|
|
|
4
|
my $max = $self->percentile(100 - $upper * 100); |
529
|
|
|
|
|
|
|
|
530
|
1
|
50
|
|
|
|
4
|
return unless $min < $max; |
531
|
|
|
|
|
|
|
|
532
|
1
|
|
|
5
|
|
6
|
return $self->mean_of(sub{$_[0]}, $min, $max); |
|
5
|
|
|
|
|
12
|
|
533
|
|
|
|
|
|
|
}; |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=head2 harmonic_mean |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
Return harmonic mean of the data, i.e. 1/E(1/x). |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Return undef if division by zero occurs (see Statistics::Descriptive). |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=cut |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
sub harmonic_mean { |
544
|
2
|
|
|
2
|
1
|
708
|
my $self = shift; |
545
|
|
|
|
|
|
|
|
546
|
2
|
|
|
|
|
4
|
my $ret; |
547
|
2
|
|
|
|
|
4
|
eval { |
548
|
2
|
|
|
5
|
|
6
|
$ret = $self->count / $self->sum_of(sub { 1/$_[0] }); |
|
5
|
|
|
|
|
22
|
|
549
|
|
|
|
|
|
|
}; |
550
|
2
|
50
|
66
|
|
|
23
|
if ($@ and $@ !~ /division.*zero/) { |
551
|
0
|
|
|
|
|
0
|
die $@; # rethrow ALL BUT 1/0 which yields undef |
552
|
|
|
|
|
|
|
}; |
553
|
2
|
|
|
|
|
12
|
return $ret; |
554
|
|
|
|
|
|
|
}; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head2 geometric_mean |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Return geometric mean of the data, that is, exp(E(log x)). |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
Dies unless all data points are of the same sign. |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=cut |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
sub geometric_mean { |
565
|
2
|
|
|
2
|
1
|
719
|
my $self = shift; |
566
|
|
|
|
|
|
|
|
567
|
2
|
100
|
|
|
|
28
|
return unless $self->count; |
568
|
1
|
50
|
|
|
|
3
|
croak __PACKAGE__.": geometric_mean() called on mixed sign sample" |
569
|
|
|
|
|
|
|
if $self->min * $self->max < 0; |
570
|
|
|
|
|
|
|
|
571
|
1
|
50
|
|
|
|
3
|
return 0 if $self->{data}{0}; |
572
|
|
|
|
|
|
|
# this must be dog slow, but we already log() too much at this point. |
573
|
1
|
|
|
5
|
|
6
|
my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} ); |
|
5
|
|
|
|
|
13
|
|
574
|
1
|
50
|
|
|
|
5
|
return $self->min < 0 ? -$ret : $ret; |
575
|
|
|
|
|
|
|
}; |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=head2 skewness |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
Return skewness of the distribution, calculated as |
580
|
|
|
|
|
|
|
n/(n-1)(n-2) * E((x-E(x))**3)/std_dev**3 (this is consistent with Excel). |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=cut |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
sub skewness { |
585
|
20
|
|
|
20
|
1
|
1567
|
my $self = shift; |
586
|
|
|
|
|
|
|
|
587
|
20
|
|
|
|
|
50
|
my $n = $self->{count}; |
588
|
20
|
100
|
|
|
|
73
|
return unless $n > 2; |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
591
|
18
|
|
|
|
|
59
|
my $skew = $n * $self->std_moment(3); |
592
|
18
|
|
|
|
|
49
|
my $correction = $n / ( ($n-1) * ($n-2) ); |
593
|
18
|
|
|
|
|
95
|
return $correction * $skew; |
594
|
|
|
|
|
|
|
}; |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=head2 kurtosis |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
Return kurtosis of the distribution, that is 4-th standardized moment - 3. |
599
|
|
|
|
|
|
|
The exact formula used here is consistent with that of Excel and |
600
|
|
|
|
|
|
|
Statistics::Descriptive. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=cut |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub kurtosis { |
605
|
22
|
|
|
22
|
1
|
805
|
my $self = shift; |
606
|
|
|
|
|
|
|
|
607
|
22
|
|
|
|
|
56
|
my $n = $self->{count}; |
608
|
22
|
100
|
|
|
|
85
|
return unless $n > 3; |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
611
|
20
|
|
|
|
|
53
|
my $kurt = $n * $self->std_moment(4); |
612
|
20
|
|
|
|
|
73
|
my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) ); |
613
|
20
|
|
|
|
|
58
|
my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) ); |
614
|
|
|
|
|
|
|
|
615
|
20
|
|
|
|
|
99
|
return $correction1 * $kurt - $correction2; |
616
|
|
|
|
|
|
|
}; |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=head2 central_moment( $n ) |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
Return $n-th central moment, that is, E((x - E(x))^$n). |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=cut |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
sub central_moment { |
627
|
|
|
|
|
|
|
my $self = shift; |
628
|
|
|
|
|
|
|
my $n = shift; |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
my $mean = $self->mean; |
631
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count}; |
632
|
|
|
|
|
|
|
}; |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
=head2 std_moment( $n ) |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
Return $n-th standardized moment, that is, |
637
|
|
|
|
|
|
|
E((x - E(x))**$n) / std_dev(x)**$n. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=cut |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
sub std_moment { |
644
|
|
|
|
|
|
|
my $self = shift; |
645
|
|
|
|
|
|
|
my $n = shift; |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
my $mean = $self->mean; |
648
|
|
|
|
|
|
|
my $dev = $self->std_dev; |
649
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) |
650
|
|
|
|
|
|
|
/ ( $dev**$n * $self->{count} ); |
651
|
|
|
|
|
|
|
}; |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
=head2 mode |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
Mode of a distribution is the most common value for a discrete distribution, |
656
|
|
|
|
|
|
|
or maximum of probability density for continuous one. |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
For now we assume that the distribution IS discrete, and return the bin with |
659
|
|
|
|
|
|
|
the biggest hit count. |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
NOTE A better algorithm is still wanted. Experimental. |
662
|
|
|
|
|
|
|
Behavior may change in the future. |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=cut |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
# Naive implementation |
667
|
|
|
|
|
|
|
# Find bin w/greatest count and return it |
668
|
|
|
|
|
|
|
sub mode { |
669
|
|
|
|
|
|
|
my $self = shift; |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
return if !$self->count; |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
my $index = $self->_sort; |
674
|
|
|
|
|
|
|
return $index->[0] if @$index == 1; |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
my @count = map { $self->{data}{$_} } @$index; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
my $max_index; |
679
|
|
|
|
|
|
|
my $max_growth = 0; |
680
|
|
|
|
|
|
|
for (my $i = 0; $i<@count; $i++) { |
681
|
|
|
|
|
|
|
$count[$i] > $max_growth or next; |
682
|
|
|
|
|
|
|
$max_index = $i; |
683
|
|
|
|
|
|
|
$max_growth = $count[$i]; |
684
|
|
|
|
|
|
|
}; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
return $index->[$max_index]; |
687
|
|
|
|
|
|
|
}; |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( \@index ) |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( $n ) |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=head2 frequency_distribution_ref |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
Return numbers of data point counts below each number in @index as hashref. |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
If a number is given instead of arrayref, @index is created |
698
|
|
|
|
|
|
|
by dividing [min, max] into $n intervals. |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
If no parameters are given, return previous result, if any. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=cut |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
sub frequency_distribution_ref { |
705
|
3
|
|
|
3
|
1
|
1106
|
my $self = shift; |
706
|
3
|
|
|
|
|
5
|
my $index = shift; |
707
|
|
|
|
|
|
|
|
708
|
3
|
100
|
|
|
|
9
|
return unless $self->count; |
709
|
|
|
|
|
|
|
# ah, compatibility - return last value |
710
|
|
|
|
|
|
|
return $self->{cache}{frequency_distribution_ref} |
711
|
2
|
50
|
|
|
|
5
|
unless defined $index; |
712
|
|
|
|
|
|
|
# make index if number given |
713
|
2
|
100
|
|
|
|
8
|
if (!ref $index) { |
714
|
1
|
50
|
|
|
|
3
|
croak __PACKAGE__.": frequency_distribution_ref(): ". |
715
|
|
|
|
|
|
|
"argument must be array, of number > 2, not $index" |
716
|
|
|
|
|
|
|
unless $index > 2; |
717
|
1
|
|
|
|
|
8
|
my $min = $self->_lower($self->min); |
718
|
1
|
|
|
|
|
4
|
my $max = $self->_upper($self->max); |
719
|
1
|
|
|
|
|
3
|
my $step = ($max - $min) / $index; |
720
|
1
|
|
|
|
|
4
|
$index = [ map { $min + $_ * $step } 1..$index ]; |
|
5
|
|
|
|
|
9
|
|
721
|
|
|
|
|
|
|
}; |
722
|
|
|
|
|
|
|
|
723
|
2
|
|
|
|
|
10
|
@$index = (-$INF, sort { $a <=> $b } @$index); |
|
12
|
|
|
|
|
24
|
|
724
|
|
|
|
|
|
|
|
725
|
2
|
|
|
|
|
5
|
my @count; |
726
|
2
|
|
|
|
|
9
|
for (my $i = 0; $i<@$index-1; $i++) { |
727
|
9
|
|
|
|
|
27
|
push @count, $self->_count( $index->[$i], $index->[$i+1] ); |
728
|
|
|
|
|
|
|
}; |
729
|
2
|
|
|
|
|
5
|
shift @$index; # remove -inf |
730
|
|
|
|
|
|
|
|
731
|
2
|
|
|
|
|
9
|
my %hash; |
732
|
2
|
|
|
|
|
16
|
@hash{@$index} = @count; |
733
|
2
|
|
|
|
|
7
|
$self->{cache}{frequency_distribution_ref} = \%hash; |
734
|
2
|
|
|
|
|
9
|
return \%hash; |
735
|
|
|
|
|
|
|
}; |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
=head1 Specific methods |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
The folowing methods only apply to this module, or are experimental. |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=cut |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=head2 bucket_width |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
Get bin width (relative to center of bin). Percentiles are off |
746
|
|
|
|
|
|
|
by no more than half of this. DEPRECATED. |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
=cut |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
sub bucket_width { |
751
|
5
|
|
|
5
|
1
|
10
|
my $self = shift; |
752
|
5
|
|
|
|
|
32
|
return $self->{base} - 1; |
753
|
|
|
|
|
|
|
}; |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=head2 log_base |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
Get upper/lower bound ratio for logarithmic bins. |
758
|
|
|
|
|
|
|
This represents relative precision of sample. |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head2 linear_width |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
Get width of linear buckets. |
763
|
|
|
|
|
|
|
This represents absolute precision of sample. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=head2 linear_threshold |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Get absolute value threshold below which interpolation is switched to linear. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=cut |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
sub log_base { |
772
|
3
|
|
|
3
|
1
|
11
|
my $self = shift; |
773
|
3
|
|
|
|
|
34
|
return $self->{base}; |
774
|
|
|
|
|
|
|
}; |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
sub linear_width { |
777
|
3
|
|
|
3
|
1
|
6
|
my $self = shift; |
778
|
3
|
|
|
|
|
26
|
return $self->{linear_width}; |
779
|
|
|
|
|
|
|
}; |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
sub linear_threshold { |
782
|
8
|
|
|
8
|
1
|
33
|
my $self = shift; |
783
|
8
|
|
|
|
|
41
|
return $self->{linear_thresh}; |
784
|
|
|
|
|
|
|
}; |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
=head2 add_data_hash ( { value => weight, ... } ) |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
Add values with counts/weights. |
789
|
|
|
|
|
|
|
This can be used to import data from other |
790
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale object. |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Negative counts are allowed and treated as "forgetting" data. |
795
|
|
|
|
|
|
|
If a bin count goes below zero, such bin is simply discarded. |
796
|
|
|
|
|
|
|
Minus infinity weight is allowed and has the same effect. |
797
|
|
|
|
|
|
|
Data is guaranteed to remain consistent. |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef, or +infinity), |
800
|
|
|
|
|
|
|
an exception is thrown and nothing changes. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
B Cache may be reset, even if no data was actually inserted. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
805
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
806
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=cut |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
sub add_data_hash { |
811
|
19
|
|
|
19
|
1
|
62
|
my $self = shift; |
812
|
19
|
|
|
|
|
41
|
my $hash = shift; |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
# check incoming data for being numeric, and no +inf values |
815
|
19
|
|
|
|
|
34
|
eval { |
816
|
23
|
|
|
23
|
|
51696
|
use warnings FATAL => qw(numeric); |
|
23
|
|
|
|
|
64
|
|
|
23
|
|
|
|
|
46706
|
|
817
|
19
|
|
|
|
|
114
|
while (my ($k, $v) = each %$hash) { |
818
|
289
|
100
|
33
|
|
|
1546
|
$k == 0+$k and $v == 0+$v and $v < $INF |
|
|
|
66
|
|
|
|
|
819
|
|
|
|
|
|
|
or die "Infinite count for $k\n"; |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
}; |
822
|
19
|
100
|
|
|
|
278
|
croak __PACKAGE__.": add_data_hash failed: $@" |
823
|
|
|
|
|
|
|
if $@; |
824
|
|
|
|
|
|
|
|
825
|
17
|
|
|
|
|
40
|
delete $self->{cache}; |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
# update our counters |
828
|
17
|
|
|
|
|
78
|
foreach (keys %$hash) { |
829
|
286
|
50
|
|
|
|
517
|
next unless $hash->{$_}; |
830
|
286
|
|
|
|
|
522
|
my $key = $self->_round($_); |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
# Insert data. Make sure -Inf doesn't corrupt our counter. |
833
|
286
|
|
100
|
|
|
1016
|
my $newcount = ($self->{data}{$key} || 0) + $hash->{$_}; |
834
|
286
|
100
|
|
|
|
513
|
if ($newcount > 0) { |
835
|
|
|
|
|
|
|
# normal insert |
836
|
274
|
|
|
|
|
663
|
$self->{data}{$key} = $newcount; |
837
|
274
|
|
|
|
|
479
|
$self->{count} += $hash->{$_}; |
838
|
|
|
|
|
|
|
} else { |
839
|
|
|
|
|
|
|
# We're "forgetting" data, AND the bin got empty |
840
|
12
|
|
100
|
|
|
44
|
$self->{count} -= delete $self->{data}{$key} || 0; |
841
|
|
|
|
|
|
|
}; |
842
|
|
|
|
|
|
|
}; |
843
|
17
|
|
|
|
|
55
|
$self; |
844
|
|
|
|
|
|
|
}; |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=head2 get_data_hash( %options ) |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Return distribution hashref {value => number of occurances}. |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
This is inverse of add_data_hash. |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
Options may include: |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=over |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=item * min - ignore values below this. (See find_boundaries) |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=item * max - ignore values above this. (See find_boundaries) |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. (See find_boundaries) |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. (See find_boundaries) |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
=item * noise_thresh - strip bins with count below this. |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
=back |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
=cut |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
sub get_data_hash { |
871
|
63
|
|
|
63
|
1
|
12106
|
my ($self, %opt) = @_; |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
# shallow copy of data if no options given |
874
|
63
|
100
|
|
|
|
197
|
return {%{ $self->{data} }} unless %opt; |
|
61
|
|
|
|
|
739
|
|
875
|
|
|
|
|
|
|
|
876
|
2
|
|
|
|
|
8
|
my ($min, $max) = $self->find_boundaries( %opt ); |
877
|
2
|
|
100
|
|
|
10
|
my $noize = $opt{noize_thresh} || 0; |
878
|
|
|
|
|
|
|
|
879
|
2
|
|
|
|
|
5
|
my $data = $self->{data}; |
880
|
2
|
|
|
|
|
4
|
my %hash; |
881
|
2
|
|
|
|
|
8
|
foreach (keys %$data ) { |
882
|
13
|
100
|
|
|
|
38
|
$_ < $min and next; |
883
|
8
|
50
|
|
|
|
24
|
$_ > $max and next; |
884
|
8
|
100
|
|
|
|
18
|
$data->{$_} < $noize and next; |
885
|
6
|
|
|
|
|
10
|
$hash{$_} = $data->{$_}; |
886
|
|
|
|
|
|
|
}; |
887
|
|
|
|
|
|
|
|
888
|
2
|
|
|
|
|
13
|
return \%hash; |
889
|
|
|
|
|
|
|
}; |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
=head2 TO_JSON() |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
Return enough data to recreate the whole object as an unblessed hashref. |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
This routine conforms with C, hence the name. |
896
|
|
|
|
|
|
|
Can be called as |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this ); |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
B This module DOES NOT require JSON::XS or serialize to JSON. |
901
|
|
|
|
|
|
|
It just deals with data. |
902
|
|
|
|
|
|
|
Use C, C, C or any serializer of choice. |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
my $raw_data = $stat->TO_JSON; |
905
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale->new( %$raw_data ); |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
Would generate an exact copy of C<$stat> object |
908
|
|
|
|
|
|
|
(provided it's S::D::L and not a subclass). |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
=head2 clone( [ %options ] ) |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
Copy constructor - returns copy of an existing object. |
913
|
|
|
|
|
|
|
Cache is not preserved. |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
Constructor options may be given to override existing data. See new(). |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
Trim options may be given to get partial data. See get_data_hash(). |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=cut |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
sub clone { |
922
|
7
|
|
|
7
|
1
|
1099
|
my ($self, %opt) = @_; |
923
|
|
|
|
|
|
|
|
924
|
7
|
|
|
|
|
19
|
my $raw = $self->TO_JSON; |
925
|
7
|
100
|
|
|
|
22
|
if (%opt) { |
926
|
|
|
|
|
|
|
$raw->{data} = $self->get_data_hash( %opt ) |
927
|
4
|
100
|
|
|
|
10
|
unless exists $opt{data}; |
928
|
|
|
|
|
|
|
exists $opt{$_} and $raw->{$_} = $opt{$_} |
929
|
4
|
|
66
|
|
|
34
|
for @new_keys; |
930
|
|
|
|
|
|
|
}; |
931
|
|
|
|
|
|
|
|
932
|
7
|
|
|
|
|
39
|
return (ref $self)->new( %$raw ); |
933
|
|
|
|
|
|
|
}; |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
sub TO_JSON { |
936
|
31
|
|
|
31
|
1
|
219
|
my $self = shift; |
937
|
|
|
|
|
|
|
# UGLY HACK Increase linear_thresh by a factor of base ** 1/10 |
938
|
|
|
|
|
|
|
# so that it's rounded down to present value |
939
|
|
|
|
|
|
|
return { |
940
|
|
|
|
|
|
|
CLASS => ref $self, |
941
|
|
|
|
|
|
|
VERSION => $VERSION, |
942
|
|
|
|
|
|
|
base => $self->{base}, |
943
|
|
|
|
|
|
|
linear_width => $self->{linear_width}, |
944
|
31
|
|
|
|
|
148
|
linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10, |
945
|
|
|
|
|
|
|
data => $self->get_data_hash, |
946
|
|
|
|
|
|
|
}; |
947
|
|
|
|
|
|
|
}; |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=head2 scale_sample( $scale ) |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
Multiply all bins' counts by given value. This can be used to adjust |
952
|
|
|
|
|
|
|
significance of previous data before adding new data (e.g. gradually |
953
|
|
|
|
|
|
|
"forgetting" past data in a long-running application). |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=cut |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
sub scale_sample { |
958
|
1
|
|
|
1
|
1
|
7
|
my $self = shift; |
959
|
1
|
|
|
|
|
3
|
my $factor = shift; |
960
|
1
|
50
|
|
|
|
5
|
$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive"); |
961
|
|
|
|
|
|
|
|
962
|
1
|
|
|
|
|
3
|
delete $self->{cache}; |
963
|
1
|
|
|
|
|
3
|
foreach (@{ $self->_sort }) { |
|
1
|
|
|
|
|
4
|
|
964
|
100
|
|
|
|
|
141
|
$self->{data}{$_} *= $factor; |
965
|
|
|
|
|
|
|
}; |
966
|
1
|
|
|
|
|
2
|
$self->{count} *= $factor; |
967
|
1
|
|
|
|
|
3
|
return $self; |
968
|
|
|
|
|
|
|
}; |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
=head2 mean_of( $code, [$min, $max] ) |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
Return expectation of $code over sample within given range. |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
$code is expected to be a pure function (i.e. depending only on its input |
975
|
|
|
|
|
|
|
value, and having no side effects). |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
The underlying integration mechanism only calculates $code once per bin, |
978
|
|
|
|
|
|
|
so $code should be stable as in not vary wildly over small intervals. |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
=cut |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
sub mean_of { |
983
|
24
|
|
|
24
|
1
|
80
|
my $self = shift; |
984
|
24
|
|
|
|
|
67
|
my ($code, $min, $max) = @_; |
985
|
|
|
|
|
|
|
|
986
|
24
|
|
|
429
|
|
119
|
my $weight = $self->sum_of( sub {1}, $min, $max ); |
|
429
|
|
|
|
|
906
|
|
987
|
24
|
100
|
|
|
|
112
|
return unless $weight; |
988
|
23
|
|
|
|
|
70
|
return $self->sum_of($code, $min, $max) / $weight; |
989
|
|
|
|
|
|
|
}; |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
=head1 Experimental methods |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
These methods may be subject to change in the future, or stay, if they |
994
|
|
|
|
|
|
|
are good. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
=head2 sum_of ( $code, [ $min, $max ] ) |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
Integrate arbitrary function over the sample within the [ $min, $max ] interval. |
999
|
|
|
|
|
|
|
Default values for both limits are infinities of appropriate sign. |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
Values in the edge bins are cut using interpolation if needed. |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points |
1004
|
|
|
|
|
|
|
between $a and $b. |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
EXPERIMENTAL. The method name may change in the future. |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
=cut |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
sub sum_of { |
1011
|
147
|
|
|
147
|
1
|
945
|
my $self = shift; |
1012
|
147
|
|
|
|
|
361
|
my ($code, $realmin, $realmax) = @_; |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
# Just app up stuff |
1015
|
147
|
100
|
100
|
|
|
614
|
if (!defined $realmin and !defined $realmax) { |
1016
|
124
|
|
|
|
|
223
|
my $sum = 0; |
1017
|
124
|
|
|
|
|
186
|
while (my ($val, $count) = each %{ $self->{data} }) { |
|
63236
|
|
|
|
|
162918
|
|
1018
|
63112
|
|
|
|
|
100733
|
$sum += $count * $code->( $val ); |
1019
|
|
|
|
|
|
|
}; |
1020
|
124
|
|
|
|
|
679
|
return $sum; |
1021
|
|
|
|
|
|
|
}; |
1022
|
|
|
|
|
|
|
|
1023
|
23
|
100
|
|
|
|
69
|
$realmin = -$INF unless defined $realmin; |
1024
|
23
|
100
|
|
|
|
61
|
$realmax = $INF unless defined $realmax; |
1025
|
23
|
50
|
|
|
|
70
|
return 0 if( $realmin >= $realmax ); |
1026
|
|
|
|
|
|
|
|
1027
|
|
|
|
|
|
|
# correct limits. $min, $max are indices; $left, $right are limits |
1028
|
23
|
|
|
|
|
71
|
my $min = $self->_round($realmin); |
1029
|
23
|
|
|
|
|
64
|
my $max = $self->_round($realmax); |
1030
|
23
|
|
|
|
|
68
|
my $left = $self->_lower($realmin); |
1031
|
23
|
|
|
|
|
72
|
my $right = $self->_upper($realmax); |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
# find first bin that's above $left |
1034
|
23
|
|
|
|
|
66
|
my $keys = $self->_sort; |
1035
|
23
|
|
|
|
|
79
|
my $i = _bin_search_ge($keys, $left); |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
# warn "sum_of [$min, $max]"; |
1038
|
|
|
|
|
|
|
# add up bins |
1039
|
23
|
|
|
|
|
53
|
my $sum = 0; |
1040
|
23
|
|
|
|
|
72
|
for (; $i < @$keys; $i++) { |
1041
|
284
|
|
|
|
|
1000
|
my $val = $keys->[$i]; |
1042
|
284
|
100
|
|
|
|
684
|
last if $val > $right; |
1043
|
264
|
|
|
|
|
653
|
$sum += $self->{data}{$val} * $code->( $val ); |
1044
|
|
|
|
|
|
|
}; |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
# cut edges: the hard part |
1047
|
|
|
|
|
|
|
# min and max are now used as indices |
1048
|
|
|
|
|
|
|
# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-) |
1049
|
|
|
|
|
|
|
# warn "Add up, sum_of = $sum"; |
1050
|
23
|
100
|
|
|
|
153
|
if ($self->{data}{$max}) { |
1051
|
20
|
|
|
|
|
60
|
my $width = $self->_upper($max) - $self->_lower($max); |
1052
|
20
|
100
|
|
|
|
65
|
my $part = $width |
1053
|
|
|
|
|
|
|
? ($self->_upper($max) - $realmax) / $width |
1054
|
|
|
|
|
|
|
: 0.5; |
1055
|
20
|
|
|
|
|
106
|
$sum -= $self->{data}{$max} * $code->($max) * $part; |
1056
|
|
|
|
|
|
|
}; |
1057
|
|
|
|
|
|
|
# warn "Cut R, sum_of = $sum"; |
1058
|
23
|
100
|
|
|
|
131
|
if ($self->{data}{$min}) { |
1059
|
20
|
|
|
|
|
56
|
my $width = $self->_upper($min) - $self->_lower($min); |
1060
|
20
|
100
|
|
|
|
66
|
my $part = $width |
1061
|
|
|
|
|
|
|
? ($realmin - $self->_lower($min)) / $width |
1062
|
|
|
|
|
|
|
: 0.5; |
1063
|
20
|
|
|
|
|
81
|
$sum -= $self->{data}{$min} * $code->($min) * $part; |
1064
|
|
|
|
|
|
|
}; |
1065
|
|
|
|
|
|
|
# warn "Cut L, sum_of = $sum"; |
1066
|
|
|
|
|
|
|
|
1067
|
23
|
|
|
|
|
174
|
return $sum; |
1068
|
|
|
|
|
|
|
}; # end sum_of |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
=head2 histogram ( %options ) |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ] |
1073
|
|
|
|
|
|
|
where countX_Y is number of data points between X and Y. |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
Options may include: |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=over |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=item * count (+) - number of intervals to divide sample into. |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
=item * index (+) - interval borders as array. Will be sorted before processing. |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=item * min - ignore values below this. Default = $self->min - epsilon. |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
=item * max - ignore values above this. Default = $self->max + epsilon. |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
=item * normalize_to - adjust counts so that max number becomes nnn. |
1092
|
|
|
|
|
|
|
This may be useful if one intends to draw pictures. |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
=back |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
Either count or index must be present. |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
NOTE: this is equivalent to frequency_distribution_ref but better suited |
1099
|
|
|
|
|
|
|
for omitting sample tails and outputting pretty pictures. |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
=cut |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
sub histogram { |
1104
|
5
|
|
|
5
|
1
|
3170
|
my ($self, %opt) = @_; |
1105
|
|
|
|
|
|
|
|
1106
|
5
|
100
|
|
|
|
21
|
return unless $self->count; |
1107
|
4
|
|
|
|
|
21
|
my ($min, $max) = $self->find_boundaries( %opt ); |
1108
|
|
|
|
|
|
|
# build/check index |
1109
|
4
|
100
|
|
|
|
10
|
my @index = @{ $opt{index} || [] }; |
|
4
|
|
|
|
|
25
|
|
1110
|
4
|
100
|
|
|
|
13
|
if (!@index) { |
1111
|
3
|
|
|
|
|
7
|
my $n = $opt{count}; |
1112
|
3
|
50
|
|
|
|
9
|
$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )"); |
1113
|
3
|
|
|
|
|
10
|
my $step = ($max - $min) / $n; |
1114
|
3
|
|
|
|
|
10
|
for (my $x = $min; $x <= $max; $x += $step) { |
1115
|
16
|
|
|
|
|
45
|
push @index, $x; |
1116
|
|
|
|
|
|
|
}; |
1117
|
|
|
|
|
|
|
} else { |
1118
|
|
|
|
|
|
|
# sort & uniq raw index |
1119
|
1
|
|
|
|
|
4
|
my %known; |
1120
|
1
|
|
|
|
|
5
|
@index = grep { !$known{$_}++ } @index; |
|
5
|
|
|
|
|
24
|
|
1121
|
1
|
|
|
|
|
7
|
@index = sort { $a <=> $b } @index; |
|
8
|
|
|
|
|
19
|
|
1122
|
1
|
50
|
|
|
|
7
|
@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)"); |
1123
|
|
|
|
|
|
|
}; |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
# finally: estimated counts between indices! |
1126
|
4
|
|
|
|
|
9
|
my @ret; |
1127
|
4
|
|
|
|
|
15
|
for ( my $i = 0; $i<@index-1; $i++) { |
1128
|
17
|
|
|
|
|
51
|
my $count = $self->_count( $index[$i], $index[$i+1] ); |
1129
|
17
|
|
|
|
|
84
|
push @ret, [ $count, $index[$i], $index[$i+1] ]; |
1130
|
|
|
|
|
|
|
}; |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
# if normalize - find maximum & divide by it |
1133
|
4
|
100
|
|
|
|
13
|
if (my $norm = $opt{normalize_to}) { |
1134
|
1
|
|
|
|
|
3
|
my $max = 0; |
1135
|
|
|
|
|
|
|
$max < $_->[0] and $max = $_->[0] |
1136
|
1
|
|
66
|
|
|
8
|
for @ret; |
1137
|
1
|
|
|
|
|
3
|
$norm /= $max; |
1138
|
1
|
|
|
|
|
4
|
$_->[0] *= $norm for @ret; |
1139
|
|
|
|
|
|
|
}; |
1140
|
|
|
|
|
|
|
|
1141
|
4
|
|
|
|
|
25
|
return \@ret; |
1142
|
|
|
|
|
|
|
}; |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
=head2 find_boundaries( %opt ) |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
Return ($min, $max) of part of sample denoted by options. |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
Options may include: |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
=over |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=item * min - ignore values below this. default = min() - epsilon. |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
=item * max - ignore values above this. default = max() + epsilon. |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
=back |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
If no options are given, the whole sample is guaranteed to reside between |
1163
|
|
|
|
|
|
|
returned values. |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=cut |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
sub find_boundaries { |
1168
|
9
|
|
|
9
|
1
|
1492
|
my $self = shift; |
1169
|
9
|
|
|
|
|
28
|
my %opt = @_; |
1170
|
|
|
|
|
|
|
|
1171
|
9
|
100
|
|
|
|
26
|
return unless $self->count; |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# preprocess boundaries |
1174
|
8
|
100
|
|
|
|
35
|
my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min ); |
1175
|
8
|
100
|
|
|
|
35
|
my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max ); |
1176
|
|
|
|
|
|
|
|
1177
|
8
|
100
|
|
|
|
27
|
if ($opt{ltrim}) { |
1178
|
1
|
|
|
|
|
5
|
my $newmin = $self->percentile( $opt{ltrim} ); |
1179
|
1
|
50
|
33
|
|
|
7
|
defined $newmin and $newmin > $min and $min = $newmin; |
1180
|
|
|
|
|
|
|
}; |
1181
|
8
|
100
|
|
|
|
25
|
if ($opt{utrim}) { |
1182
|
1
|
|
|
|
|
3
|
my $newmax = $self->percentile( 100-$opt{utrim} ); |
1183
|
1
|
50
|
33
|
|
|
5
|
defined $newmax and $newmax < $max and $max = $newmax; |
1184
|
|
|
|
|
|
|
}; |
1185
|
|
|
|
|
|
|
|
1186
|
8
|
|
|
|
|
33
|
return ($min, $max); |
1187
|
|
|
|
|
|
|
}; |
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
=head2 format( "printf-like expression", ... ) |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
Returns a summary as requested by format string. |
1192
|
|
|
|
|
|
|
Just as with printf and sprintf, a placeholder starts with a C<%>, |
1193
|
|
|
|
|
|
|
followed by formatting options and a |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
The following placeholders are supported: |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
=over |
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
=item * % - a literal % |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=item * s, f, g - a normal printf acting on an extra argument. |
1202
|
|
|
|
|
|
|
The number of extra arguments MUST match the number of such placeholders, |
1203
|
|
|
|
|
|
|
or this function dies. |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=item * n - count; |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=item * m - min; |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
=item * M - max, |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
=item * a - mean, |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
=item * d - standard deviation, |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=item * S - skewness, |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
=item * K - kurtosis, |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=item * q(x) - x-th quantile (requires argument), |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
=item * p(x) - x-th percentile (requires argument), |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=item * P(x) - cdf - the inferred cumulative distribution function (x) |
1224
|
|
|
|
|
|
|
(requires argument), |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
=item * e(n) - central_moment - central moment of n-th power |
1227
|
|
|
|
|
|
|
(requires argument), |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
=item * E(n) - std_moment - standard moment of n-th power (requires argument), |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=back |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
For example, |
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
$stat->format( "99%% results lie between %p(0.5) and %p(99.5)" ); |
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
Or |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
for( my $i = 0; $i < @stats; $i++ ) { |
1240
|
|
|
|
|
|
|
print $stats[$i]->format( "%s-th average value is %a +- %d", $i ); |
1241
|
|
|
|
|
|
|
}; |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=cut |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
my %format = ( |
1246
|
|
|
|
|
|
|
# percent literal |
1247
|
|
|
|
|
|
|
'%' => '%', |
1248
|
|
|
|
|
|
|
# placeholders without parameters |
1249
|
|
|
|
|
|
|
n => 'count', |
1250
|
|
|
|
|
|
|
m => 'min', |
1251
|
|
|
|
|
|
|
M => 'max', |
1252
|
|
|
|
|
|
|
a => 'mean', |
1253
|
|
|
|
|
|
|
d => 'std_dev', |
1254
|
|
|
|
|
|
|
S => 'skewness', |
1255
|
|
|
|
|
|
|
K => 'kurtosis', |
1256
|
|
|
|
|
|
|
# placeholders with 1 parameter |
1257
|
|
|
|
|
|
|
q => 'quantile?', |
1258
|
|
|
|
|
|
|
p => 'percentile?', |
1259
|
|
|
|
|
|
|
P => 'cdf?', |
1260
|
|
|
|
|
|
|
e => 'central_moment?', |
1261
|
|
|
|
|
|
|
E => 'std_moment?', |
1262
|
|
|
|
|
|
|
); |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
my %printf = ( |
1265
|
|
|
|
|
|
|
s => 1, |
1266
|
|
|
|
|
|
|
f => 1, |
1267
|
|
|
|
|
|
|
g => 1, |
1268
|
|
|
|
|
|
|
); |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
my $re_format = join "|", keys %format, keys %printf; |
1271
|
|
|
|
|
|
|
$re_format = qr((?:$re_format)); |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
sub format { |
1274
|
5
|
|
|
5
|
1
|
20
|
my ($self, $format, @extra) = @_; |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
# FIXME this accepts %m(5), then dies - UGLY |
1278
|
|
|
|
|
|
|
# TODO rewrite this as a giant sprintf... one day... |
1279
|
5
|
|
|
|
|
115
|
$format =~ s <%([0-9.\-+ #]*)($re_format)(?:\(($re_num)?\)){0,1}> |
|
7
|
|
|
|
|
24
|
|
1280
|
|
|
|
|
|
|
< _format_dispatch($self, $2, $1, $3, \@extra) >ge; |
1281
|
5
|
50
|
|
|
|
14
|
|
1282
|
|
|
|
|
|
|
croak __PACKAGE__.": Extra arguments in format()" |
1283
|
5
|
|
|
|
|
31
|
if @extra; |
1284
|
|
|
|
|
|
|
return $format; |
1285
|
|
|
|
|
|
|
}; |
1286
|
|
|
|
|
|
|
|
1287
|
7
|
|
|
7
|
|
32
|
sub _format_dispatch { |
1288
|
|
|
|
|
|
|
my ($obj, $method, $float, $arg, $extra) = @_; |
1289
|
|
|
|
|
|
|
|
1290
|
7
|
100
|
|
|
|
29
|
# Handle % escapes |
1291
|
1
|
|
|
|
|
4
|
if ($method !~ /^[a-zA-Z]/) { |
1292
|
|
|
|
|
|
|
return $method; |
1293
|
|
|
|
|
|
|
}; |
1294
|
6
|
100
|
|
|
|
49
|
# Handle printf built-in formats |
1295
|
1
|
50
|
|
|
|
6
|
if (!$format{$method}) { |
1296
|
|
|
|
|
|
|
croak __PACKAGE__.": Not enough arguments in format()" |
1297
|
1
|
|
|
|
|
30
|
unless @$extra; |
1298
|
|
|
|
|
|
|
return sprintf "%${float}${method}", shift @$extra; |
1299
|
|
|
|
|
|
|
}; |
1300
|
|
|
|
|
|
|
|
1301
|
5
|
|
|
|
|
10
|
# Now we know it's LogScale's own method |
1302
|
5
|
100
|
|
|
|
20
|
$method = $format{$method}; |
1303
|
2
|
50
|
|
|
|
7
|
if ($method =~ s/\?$//) { |
1304
|
|
|
|
|
|
|
die "Missing argument in method $method" if !defined $arg; |
1305
|
3
|
50
|
|
|
|
10
|
} else { |
1306
|
|
|
|
|
|
|
die "Extra argument in method $method" if defined $arg; |
1307
|
5
|
|
|
|
|
21
|
}; |
1308
|
|
|
|
|
|
|
my $result = $obj->$method($arg); |
1309
|
|
|
|
|
|
|
|
1310
|
5
|
100
|
100
|
|
|
17
|
# work around S::D::Full's convention that "-inf == undef" |
1311
|
|
|
|
|
|
|
$result = -9**9**9 |
1312
|
5
|
|
|
|
|
44
|
if ($method eq 'percentile' and !defined $result); |
1313
|
|
|
|
|
|
|
return sprintf "%${float}f", $result; |
1314
|
|
|
|
|
|
|
}; |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
################################################################ |
1317
|
|
|
|
|
|
|
# No more public methods please |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
# MEMOIZE |
1320
|
|
|
|
|
|
|
# We'll keep methods' returned values under {cache}. |
1321
|
|
|
|
|
|
|
# All setters destroy said cache altogether. |
1322
|
|
|
|
|
|
|
# PLEASE replace this with a ready-made module if there's one. |
1323
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
# Sorry for this black magic, but it's too hard to write //= in EVERY method |
1325
|
|
|
|
|
|
|
# Long story short |
1326
|
|
|
|
|
|
|
# The next sub replaces $self->foo() with |
1327
|
|
|
|
|
|
|
# sub { $self->{cache}{foo} //= $self->originnal_foo } |
1328
|
|
|
|
|
|
|
# All setter methods are EXPECTED to destroy {cache} altogether. |
1329
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
# NOTE if you plan subclassing the method, re-memoize methods you change. |
1331
|
253
|
|
|
253
|
|
567
|
sub _memoize_method { |
1332
|
|
|
|
|
|
|
my ($class, $name, $arg) = @_; |
1333
|
253
|
|
|
|
|
960
|
|
1334
|
253
|
50
|
|
|
|
704
|
my $orig_code = $class->can($name); |
1335
|
|
|
|
|
|
|
die "Error in memoizer section ($name)" |
1336
|
|
|
|
|
|
|
unless ref $orig_code eq 'CODE'; |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
# begin long conditional |
1339
|
|
|
|
|
|
|
my $cached_code = !$arg |
1340
|
208
|
100
|
|
208
|
|
7283
|
? sub { |
1341
|
103
|
|
|
|
|
284
|
if (!exists $_[0]->{cache}{$name}) { |
1342
|
|
|
|
|
|
|
$_[0]->{cache}{$name} = $orig_code->($_[0]); |
1343
|
208
|
|
|
|
|
816
|
}; |
1344
|
|
|
|
|
|
|
return $_[0]->{cache}{$name}; |
1345
|
|
|
|
|
|
|
} |
1346
|
171
|
|
|
171
|
|
2088
|
: sub { |
1347
|
171
|
|
|
|
|
281
|
my $self = shift; |
1348
|
171
|
100
|
|
|
|
412
|
my $arg = shift; |
1349
|
|
|
|
|
|
|
$arg = '' unless defined $arg; |
1350
|
171
|
100
|
|
|
|
526
|
|
1351
|
102
|
|
|
|
|
253
|
if (!exists $self->{cache}{"$name:$arg"}) { |
1352
|
|
|
|
|
|
|
$self->{cache}{"$name:$arg"} = $orig_code->($self, $arg); |
1353
|
170
|
|
|
|
|
773
|
}; |
1354
|
253
|
100
|
|
|
|
1041
|
return $self->{cache}{"$name:$arg"}; |
1355
|
|
|
|
|
|
|
}; |
1356
|
|
|
|
|
|
|
# conditional ends here |
1357
|
23
|
|
|
23
|
|
222
|
|
|
23
|
|
|
|
|
59
|
|
|
23
|
|
|
|
|
847
|
|
1358
|
23
|
|
|
23
|
|
145
|
no strict 'refs'; ## no critic |
|
23
|
|
|
|
|
56
|
|
|
23
|
|
|
|
|
2697
|
|
1359
|
253
|
|
|
|
|
450
|
no warnings 'redefine'; ## no critic |
|
253
|
|
|
|
|
1028
|
|
1360
|
|
|
|
|
|
|
*{$class."::".$name} = $cached_code; |
1361
|
|
|
|
|
|
|
}; # end of _memoize_method |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
# Memoize all the methods w/o arguments |
1364
|
|
|
|
|
|
|
foreach ( qw(sum sumsq mean min max mode) ) { |
1365
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_); |
1366
|
|
|
|
|
|
|
}; |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
# Memoize methods with 1 argument |
1369
|
|
|
|
|
|
|
foreach (qw(quantile central_moment std_moment standard_deviation variance)) { |
1370
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_, 1); |
1371
|
|
|
|
|
|
|
}; |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
# add shorter alias of standard_deviation (this must happen AFTER memoization) |
1374
|
23
|
|
|
23
|
|
170
|
{ |
|
23
|
|
|
|
|
62
|
|
|
23
|
|
|
|
|
8005
|
|
1375
|
|
|
|
|
|
|
no warnings 'once'; ## no critic |
1376
|
|
|
|
|
|
|
*std_dev = \&standard_deviation; |
1377
|
|
|
|
|
|
|
}; |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
# Get number of values below $x |
1380
|
|
|
|
|
|
|
# Like sum_of(sub{1}, undef, $x), but faster. |
1381
|
|
|
|
|
|
|
# Used by cdf() |
1382
|
105
|
|
|
105
|
|
192
|
sub _count { |
1383
|
105
|
100
|
|
|
|
277
|
my $self = shift; |
1384
|
74
|
|
|
|
|
120
|
@_>1 and return $self->_count($_[1]) - $self->_count($_[0]); |
1385
|
|
|
|
|
|
|
my $x = shift; |
1386
|
74
|
|
|
|
|
175
|
|
1387
|
74
|
|
|
|
|
171
|
my $upper = $self->_upper($x); |
1388
|
74
|
100
|
|
|
|
189
|
my $i = _bin_search_gt( $self->_sort, $upper ); |
1389
|
65
|
|
|
|
|
144
|
!$i-- and return 0; |
1390
|
|
|
|
|
|
|
my $count = $self->_probability->[$i]; |
1391
|
|
|
|
|
|
|
|
1392
|
65
|
|
|
|
|
140
|
# interpolate |
1393
|
65
|
100
|
|
|
|
325
|
my $bin = $self->_round( $x ); |
1394
|
34
|
|
|
|
|
65
|
if (my $val = $self->{data}{$bin}) { |
1395
|
34
|
100
|
|
|
|
138
|
my $width = ($upper - $bin) * 2; |
1396
|
34
|
|
|
|
|
74
|
my $part = $width ? ( ($upper - $x) / $width) : 1/2; |
1397
|
|
|
|
|
|
|
$count -= $part * $val; |
1398
|
65
|
|
|
|
|
208
|
}; |
1399
|
|
|
|
|
|
|
return $count; |
1400
|
|
|
|
|
|
|
}; |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
# BINARY SEARCH |
1403
|
|
|
|
|
|
|
# Not a method, just a function |
1404
|
|
|
|
|
|
|
# Takes sorted \@array and a $num |
1405
|
|
|
|
|
|
|
# Return lowest $i such that $array[$i] >= $num |
1406
|
|
|
|
|
|
|
# Return (scalar @array) if no such $i exists |
1407
|
147
|
|
|
147
|
|
1102
|
sub _bin_search_ge { |
1408
|
|
|
|
|
|
|
my ($array, $x) = @_; |
1409
|
147
|
100
|
100
|
|
|
692
|
|
1410
|
126
|
|
|
|
|
248
|
return 0 unless @$array and $array->[0] < $x; |
1411
|
126
|
|
|
|
|
199
|
my $l = 0; |
1412
|
126
|
|
|
|
|
315
|
my $r = @$array; |
1413
|
405
|
|
|
|
|
805
|
while ($l+1 < $r) { |
1414
|
405
|
100
|
|
|
|
1180
|
my $m = int( ($l + $r) /2); |
1415
|
|
|
|
|
|
|
$array->[$m] < $x ? $l = $m : $r = $m; |
1416
|
126
|
|
|
|
|
276
|
}; |
1417
|
|
|
|
|
|
|
return $l+1; |
1418
|
|
|
|
|
|
|
}; |
1419
|
74
|
|
|
74
|
|
162
|
sub _bin_search_gt { |
1420
|
74
|
|
|
|
|
148
|
my ($array, $x) = @_; |
1421
|
74
|
100
|
100
|
|
|
273
|
my $i = _bin_search_ge(@_); |
1422
|
74
|
|
|
|
|
126
|
$i++ if defined $array->[$i] and $array->[$i] == $x; |
1423
|
|
|
|
|
|
|
return $i; |
1424
|
|
|
|
|
|
|
}; |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
# THE CORE |
1427
|
|
|
|
|
|
|
# Here come the number=>bin functions |
1428
|
|
|
|
|
|
|
# round() generates bin center |
1429
|
|
|
|
|
|
|
# upper() and lower() are respective boundaries. |
1430
|
|
|
|
|
|
|
# Here's the algorithm: |
1431
|
|
|
|
|
|
|
# 1) determine whether bin is linear or logarithmic |
1432
|
|
|
|
|
|
|
# 2) for linear bins, return bin# * bucket_width (==2*absolute error) |
1433
|
|
|
|
|
|
|
# add/subtract 0.5 to get edges. |
1434
|
|
|
|
|
|
|
# 3) for logarithmic bins, return base ** bin# with appropriate sign |
1435
|
|
|
|
|
|
|
# multiply by precalculated constant to get edges |
1436
|
|
|
|
|
|
|
# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2 |
1437
|
|
|
|
|
|
|
# This part is fragile and can be written better |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
# center of bin containing x |
1440
|
90893
|
|
|
90893
|
|
127270
|
sub _round { |
1441
|
90893
|
|
|
|
|
123324
|
my $self = shift; |
1442
|
|
|
|
|
|
|
my $x = shift; |
1443
|
23
|
|
|
23
|
|
197
|
|
|
23
|
|
|
|
|
58
|
|
|
23
|
|
|
|
|
3313
|
|
1444
|
|
|
|
|
|
|
use warnings FATAL => qw(numeric uninitialized); |
1445
|
90893
|
100
|
|
|
|
182093
|
|
1446
|
|
|
|
|
|
|
if (abs($x) <= $self->{linear_thresh}) { |
1447
|
1761
|
|
100
|
|
|
5531
|
return $self->{linear_width} |
1448
|
|
|
|
|
|
|
&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 ); |
1449
|
89129
|
|
|
|
|
201463
|
}; |
1450
|
89129
|
|
|
|
|
163434
|
my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase}); |
1451
|
89129
|
100
|
|
|
|
344638
|
my $value = $self->{base} ** $i; |
1452
|
|
|
|
|
|
|
return $x < 0 ? -$value : $value; |
1453
|
|
|
|
|
|
|
}; |
1454
|
|
|
|
|
|
|
|
1455
|
|
|
|
|
|
|
# lower, upper limits of bin containing x |
1456
|
883
|
|
|
883
|
|
55043
|
sub _lower { |
1457
|
883
|
|
|
|
|
1363
|
my $self = shift; |
1458
|
|
|
|
|
|
|
my $x = shift; |
1459
|
23
|
|
|
23
|
|
170
|
|
|
23
|
|
|
|
|
61
|
|
|
23
|
|
|
|
|
7518
|
|
1460
|
|
|
|
|
|
|
use warnings FATAL => qw(numeric uninitialized); |
1461
|
883
|
100
|
|
|
|
2795
|
|
1462
|
|
|
|
|
|
|
if (abs($x) <= $self->{linear_thresh}) { |
1463
|
184
|
|
66
|
|
|
1449
|
return $self->{linear_width} |
1464
|
|
|
|
|
|
|
&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5); |
1465
|
699
|
|
|
|
|
2433
|
}; |
1466
|
699
|
100
|
|
|
|
1602
|
my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase}); |
1467
|
310
|
|
|
|
|
1325
|
if ($x > 0) { |
1468
|
|
|
|
|
|
|
return $self->{floor} * $self->{base}**($i); |
1469
|
389
|
|
|
|
|
1727
|
} else { |
1470
|
|
|
|
|
|
|
return -$self->{floor} * $self->{base}**($i+1); |
1471
|
|
|
|
|
|
|
}; |
1472
|
|
|
|
|
|
|
}; |
1473
|
|
|
|
|
|
|
|
1474
|
462
|
|
|
462
|
|
6588
|
sub _upper { |
1475
|
|
|
|
|
|
|
return -$_[0]->_lower(-$_[1]); |
1476
|
|
|
|
|
|
|
}; |
1477
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
# build bin index |
1479
|
191
|
|
|
191
|
|
324
|
sub _sort { |
1480
|
|
|
|
|
|
|
my $self = shift; |
1481
|
191
|
|
100
|
|
|
703
|
return $self->{cache}{sorted} |
|
1247
|
|
|
|
|
2033
|
|
|
32
|
|
|
|
|
298
|
|
1482
|
|
|
|
|
|
|
||= [ sort { $a <=> $b } keys %{ $self->{data} } ]; |
1483
|
|
|
|
|
|
|
}; |
1484
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
# build cumulative bin counts index |
1486
|
107
|
|
|
107
|
|
166
|
sub _probability { |
1487
|
107
|
|
66
|
|
|
313
|
my $self = shift; |
1488
|
20
|
|
|
|
|
36
|
return $self->{cache}{probability} ||= do { |
1489
|
20
|
|
|
|
|
33
|
my @array; |
1490
|
20
|
|
|
|
|
48
|
my $sum = 0; |
|
20
|
|
|
|
|
70
|
|
1491
|
165
|
|
|
|
|
247
|
foreach (@{ $self->_sort }) { |
1492
|
165
|
|
|
|
|
339
|
$sum += $self->{data}{$_}; |
1493
|
|
|
|
|
|
|
push @array, $sum; |
1494
|
20
|
|
|
|
|
123
|
}; |
1495
|
|
|
|
|
|
|
\@array; |
1496
|
|
|
|
|
|
|
}; |
1497
|
|
|
|
|
|
|
}; |
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
=head1 AUTHOR |
1500
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
Konstantin S. Uvarin, C<< >> |
1502
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
=head1 BUGS |
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
The module is currently under development. There may be bugs. |
1506
|
|
|
|
|
|
|
|
1507
|
|
|
|
|
|
|
C only works for discrete distributions, and simply returns |
1508
|
|
|
|
|
|
|
the first bin with largest bin count. |
1509
|
|
|
|
|
|
|
A better algorithm is wanted. |
1510
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
C should have been made a private method. |
1512
|
|
|
|
|
|
|
Its signature and/or name may change in the future. |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
See the TODO file in the distribution package. |
1515
|
|
|
|
|
|
|
|
1516
|
|
|
|
|
|
|
Please feel free to post bugs and/or feature requests to github: |
1517
|
|
|
|
|
|
|
L |
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
Alternatively, you can use CPAN RT |
1520
|
|
|
|
|
|
|
via e-mail C, |
1521
|
|
|
|
|
|
|
or through the web interface at |
1522
|
|
|
|
|
|
|
L. |
1523
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
Your contribution is appreciated. |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
=head1 SUPPORT |
1527
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
perldoc Statistics::Descriptive::LogScale |
1531
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
You can also look for information at: |
1533
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
=over 4 |
1535
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
=item * GitHub: |
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
L |
1539
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here) |
1541
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
L |
1543
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
L |
1547
|
|
|
|
|
|
|
|
1548
|
|
|
|
|
|
|
=item * CPAN Ratings |
1549
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
L |
1551
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
=item * Search CPAN |
1553
|
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
|
L |
1555
|
|
|
|
|
|
|
|
1556
|
|
|
|
|
|
|
=back |
1557
|
|
|
|
|
|
|
|
1558
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
1559
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
This module was inspired by a talk that Andrew Aksyonoff, author of |
1561
|
|
|
|
|
|
|
L, |
1562
|
|
|
|
|
|
|
has given at HighLoad++ conference in Moscow, 2012. |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
L was and is used as reference when in doubt. |
1565
|
|
|
|
|
|
|
Several code snippets were shamelessly stolen from there. |
1566
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
C and C parameter names were suggested by |
1568
|
|
|
|
|
|
|
CountZero from http://perlmonks.org |
1569
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
1571
|
|
|
|
|
|
|
|
1572
|
|
|
|
|
|
|
Copyright 2013-2015 Konstantin S. Uvarin. |
1573
|
|
|
|
|
|
|
|
1574
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
1575
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
1576
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
1577
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
=cut |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
1; # End of Statistics::Descriptive::LogScale |