line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
22
|
|
|
22
|
|
533900
|
use 5.006; |
|
22
|
|
|
|
|
85
|
|
2
|
22
|
|
|
22
|
|
121
|
use strict; |
|
22
|
|
|
|
|
39
|
|
|
22
|
|
|
|
|
496
|
|
3
|
22
|
|
|
22
|
|
106
|
use warnings; |
|
22
|
|
|
|
|
45
|
|
|
22
|
|
|
|
|
1893
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
package Statistics::Descriptive::LogScale; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale - Memory-efficient approximate univariate |
10
|
|
|
|
|
|
|
descriptive statistics class. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.09 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = 0.09; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head2 Basic usage |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
The basic usage is roughly the same as that of L. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
27
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
while(<>) { |
30
|
|
|
|
|
|
|
chomp; |
31
|
|
|
|
|
|
|
$stat->add_data($_); |
32
|
|
|
|
|
|
|
}; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# This can also be done in O(1) memory, precisely |
35
|
|
|
|
|
|
|
printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation; |
36
|
|
|
|
|
|
|
# This requires storing actual data, or approximating |
37
|
|
|
|
|
|
|
printf "25%% : %f\n", $stat->percentile(25); |
38
|
|
|
|
|
|
|
printf "Median: %f\n", $stat->median; |
39
|
|
|
|
|
|
|
printf "75%% : %f\n", $stat->percentile(75); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 Save/load |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
This is not present in L. |
44
|
|
|
|
|
|
|
The save/load interface is designed compatible with JSON::XS. |
45
|
|
|
|
|
|
|
However, any other serializer can be used. |
46
|
|
|
|
|
|
|
The C method is I to return unblessed hashref |
47
|
|
|
|
|
|
|
with enough information to restore the original object. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
50
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# ..... much later |
53
|
|
|
|
|
|
|
# Save |
54
|
|
|
|
|
|
|
print $fd encoder_of_choice( $stat->TO_JSON ) |
55
|
|
|
|
|
|
|
or die "Failed to save: $!"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# ..... and even later |
58
|
|
|
|
|
|
|
# Load |
59
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $raw_data ); |
60
|
|
|
|
|
|
|
my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash ); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Import into existing LogScale instance |
63
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $more_raw_data ); |
64
|
|
|
|
|
|
|
$copy_of_stat->add_data_hash( $plain_hash->{data} ); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 Histograms |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Both L and L |
69
|
|
|
|
|
|
|
offer C method for querying data point counts. |
70
|
|
|
|
|
|
|
However, there's also C method for making pretty pictures. |
71
|
|
|
|
|
|
|
Here's a simple text-based histogram. |
72
|
|
|
|
|
|
|
A proper GD example was too long to fit into this margin. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
use strict; |
75
|
|
|
|
|
|
|
use warnings; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
78
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# collect/load data ... |
81
|
|
|
|
|
|
|
my $re_float = qr([-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?); |
82
|
|
|
|
|
|
|
while (<>) { |
83
|
|
|
|
|
|
|
$stat->add_data($_) for m/($re_float)/g; |
84
|
|
|
|
|
|
|
}; |
85
|
|
|
|
|
|
|
die "Empty set" |
86
|
|
|
|
|
|
|
unless $stat->count; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# get data in [ count, lower_bound, upper_bound ] format as arrayref |
89
|
|
|
|
|
|
|
my $hist = $stat->histogram( count => 20 ); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# find maximum value to use as a scale factor |
92
|
|
|
|
|
|
|
my $scale = $hist->[0][0]; |
93
|
|
|
|
|
|
|
$scale < $_->[0] and $scale = $_->[0] for @$hist; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
foreach (@$hist) { |
96
|
|
|
|
|
|
|
printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale); |
97
|
|
|
|
|
|
|
}; |
98
|
|
|
|
|
|
|
printf "%10f\n", $hist->[-1][2]; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 DESCRIPTION |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
This module aims at providing some advanced statistical functions without |
103
|
|
|
|
|
|
|
storing all data in memory, at the cost of certain (predictable) precision loss. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Data is represented by a set of bins that only store counts of fitting |
106
|
|
|
|
|
|
|
data points. |
107
|
|
|
|
|
|
|
Most bins are logarithmic, i.e. lower end / upper end ratio is constant. |
108
|
|
|
|
|
|
|
However, around zero linear approximation may be user instead |
109
|
|
|
|
|
|
|
(see "linear_width" and "linear_thresh" parameters in new()). |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
All operations are then performed on the bins, introducing relative error |
112
|
|
|
|
|
|
|
which does not, however, exceed the bins' relative width ("base"). |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 METHODS |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=cut |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
######################################################################## |
119
|
|
|
|
|
|
|
# == HOW IT WORKS == |
120
|
|
|
|
|
|
|
# Buckets are stored in a hash: { $value => $count, ... } |
121
|
|
|
|
|
|
|
# {base} is bin width, {logbase} == log {base} (cache) |
122
|
|
|
|
|
|
|
# {linear_thresh} is where we switch to equal bin approximation |
123
|
|
|
|
|
|
|
# {linear_width} is width of bin around zero (==linear_thresh if not given) |
124
|
|
|
|
|
|
|
# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor} |
125
|
|
|
|
|
|
|
# Nearly all meaningful subs have to scan all the bins, which is bad, |
126
|
|
|
|
|
|
|
# but anyway better than scanning full sample. |
127
|
|
|
|
|
|
|
|
128
|
22
|
|
|
22
|
|
120
|
use Carp; |
|
22
|
|
|
|
|
40
|
|
|
22
|
|
|
|
|
1794
|
|
129
|
22
|
|
|
22
|
|
17822
|
use POSIX qw(floor ceil); |
|
22
|
|
|
|
|
175957
|
|
|
22
|
|
|
|
|
122
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Fields are NOT used internally for now, so this is just a declaration |
132
|
22
|
|
|
|
|
128
|
use fields qw( |
133
|
|
|
|
|
|
|
data count |
134
|
|
|
|
|
|
|
linear_width base logbase floor linear_thresh logfloor |
135
|
|
|
|
|
|
|
cache |
136
|
22
|
|
|
22
|
|
3422996
|
); |
|
22
|
|
|
|
|
35081
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# This is for infinite portability^W^W portable infinity |
139
|
|
|
|
|
|
|
my $INF = 9**9**9; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 new( %options ) |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
%options may include: |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=over |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives |
148
|
|
|
|
|
|
|
1% precision and exact decimal powers. |
149
|
|
|
|
|
|
|
This value represents acceptable relative error in analysis results. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
B Actual value may be slightly less than requested one. |
152
|
|
|
|
|
|
|
This is done so to avoid troubles with future rounding in (de)serialization. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item * linear_width - width of linear bins around zero. |
155
|
|
|
|
|
|
|
This value represents precision of incoming data. |
156
|
|
|
|
|
|
|
Default is zero, i.e. we assume that the measurement is precise. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
159
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item * linear_thresh - where to switch to linear approximation. |
162
|
|
|
|
|
|
|
If only one of C and C is given, |
163
|
|
|
|
|
|
|
the other will be calculated. |
164
|
|
|
|
|
|
|
However, user may want to specify both in some cases. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
167
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item * data - hashref with C<{ value => weight }> for initializing data. |
170
|
|
|
|
|
|
|
Used for cloning. |
171
|
|
|
|
|
|
|
See C. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=item * zero_thresh - absolute value threshold below which everything is |
174
|
|
|
|
|
|
|
considered zero. |
175
|
|
|
|
|
|
|
DEPRECATED, C and C override this if given. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=back |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=cut |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
my @new_keys = qw( base linear_thresh linear_width zero_thresh data ); |
182
|
|
|
|
|
|
|
# TODO Throw if extra options given? |
183
|
|
|
|
|
|
|
sub new { |
184
|
41
|
|
|
41
|
1
|
8290
|
my $class = shift; |
185
|
41
|
|
|
|
|
149
|
my %opt = @_; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# base for logarithmic bins, sane default: +-1%, exact decimal powers |
188
|
|
|
|
|
|
|
# UGLY HACK number->string->number to avoid |
189
|
|
|
|
|
|
|
# future serialization inconsistencies |
190
|
41
|
|
100
|
|
|
194
|
$opt{base} ||= 10**(1/232); |
191
|
41
|
|
|
|
|
403
|
$opt{base} = 0 + "$opt{base}"; |
192
|
41
|
50
|
|
|
|
221
|
$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1"; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# calculate where to switch to linear approximation |
195
|
|
|
|
|
|
|
# the condition is: linear bin( thresh ) ~~ log bin( thresh ) |
196
|
|
|
|
|
|
|
# i.e. thresh * base - thresh ~~ absolute error * 2 |
197
|
|
|
|
|
|
|
# i.e. thresh ~~ absolute_error * 2 / (base - 1) |
198
|
|
|
|
|
|
|
# also support legacy API (zero_thresh) |
199
|
41
|
100
|
|
|
|
143
|
if (defined $opt{linear_thresh} ) { |
200
|
13
|
|
100
|
|
|
78
|
$opt{linear_width} ||= $opt{linear_thresh} * ($opt{base}-1); |
201
|
|
|
|
|
|
|
} else { |
202
|
28
|
|
|
|
|
424
|
$opt{linear_thresh} = $opt{zero_thresh}; |
203
|
|
|
|
|
|
|
}; |
204
|
|
|
|
|
|
|
$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1) |
205
|
41
|
100
|
100
|
|
|
222
|
if $opt{linear_width} and !$opt{linear_thresh}; |
206
|
41
|
|
100
|
|
|
181
|
$opt{linear_thresh} ||= 0; |
207
|
41
|
50
|
|
|
|
137
|
$opt{linear_thresh} >= 0 |
208
|
|
|
|
|
|
|
or croak __PACKAGE__.": new(): linear_thresh must be >= 0"; |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Can't use fields::new anymore |
211
|
|
|
|
|
|
|
# due to JSON::XS incompatibility with restricted hashes |
212
|
41
|
|
|
|
|
130
|
my $self = bless {}, $class; |
213
|
|
|
|
|
|
|
|
214
|
41
|
|
|
|
|
171
|
$self->{base} = $opt{base}; |
215
|
|
|
|
|
|
|
# cache values to ease calculations |
216
|
|
|
|
|
|
|
# floor = (lower end of bin) / (center of bin) |
217
|
41
|
|
|
|
|
125
|
$self->{floor} = 2/(1+$opt{base}); |
218
|
41
|
|
|
|
|
263
|
$self->{logbase} = log $opt{base}; |
219
|
41
|
|
|
|
|
132
|
$self->{logfloor} = log $self->{floor}; |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# bootstrap linear_thresh - make it fit bin edge |
222
|
41
|
|
|
|
|
91
|
$self->{linear_width} = $self->{linear_thresh} = 0; |
223
|
41
|
|
|
|
|
158
|
$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} ); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# divide anything below linear_thresh into odd number of bins |
226
|
|
|
|
|
|
|
# not exceeding requested linear_width |
227
|
41
|
100
|
|
|
|
139
|
if ($self->{linear_thresh}) { |
228
|
15
|
|
66
|
|
|
57
|
my $linear_width = $opt{linear_width} || 2 * $self->{linear_thresh}; |
229
|
15
|
|
|
|
|
62
|
my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width)); |
230
|
15
|
100
|
|
|
|
53
|
$n_linear++ unless $n_linear % 2; |
231
|
15
|
|
|
|
|
41
|
$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear); |
232
|
|
|
|
|
|
|
}; |
233
|
|
|
|
|
|
|
|
234
|
41
|
|
|
|
|
142
|
$self->clear; |
235
|
41
|
100
|
|
|
|
133
|
if ($opt{data}) { |
236
|
10
|
|
|
|
|
31
|
$self->add_data_hash($opt{data}); |
237
|
|
|
|
|
|
|
}; |
238
|
41
|
|
|
|
|
181
|
return $self; |
239
|
|
|
|
|
|
|
}; |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 General statistical methods |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
These methods are used to query the distribution properties. They generally |
244
|
|
|
|
|
|
|
follow the interface of L and co, |
245
|
|
|
|
|
|
|
with minor additions. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
All methods return C on empty data set, except for |
248
|
|
|
|
|
|
|
C, C, C, C and C |
249
|
|
|
|
|
|
|
which all return 0. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
B This module caches whatever it calculates very agressively. |
252
|
|
|
|
|
|
|
Don't hesitate to use statistical functions (except for sum_of/mean_of) |
253
|
|
|
|
|
|
|
more than once. The cache is deleted upon data entry. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head2 clear |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
Destroy all stored data. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub clear { |
262
|
55
|
|
|
55
|
1
|
10259
|
my $self = shift; |
263
|
55
|
|
|
|
|
120
|
$self->{data} = {}; |
264
|
55
|
|
|
|
|
3445
|
$self->{count} = 0; |
265
|
55
|
|
|
|
|
118
|
delete $self->{cache}; |
266
|
55
|
|
|
|
|
114
|
return $self; |
267
|
|
|
|
|
|
|
}; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 add_data( @data ) |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Add numbers to the data pool. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef), |
276
|
|
|
|
|
|
|
an exception is thrown and only partial data gets inserted. |
277
|
|
|
|
|
|
|
The state of object is guaranteed to remain consistent in such case. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
B Cache is reset, even if no data was actually inserted. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
282
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
283
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=cut |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
sub add_data { |
288
|
51212
|
|
|
51212
|
1
|
208194
|
my $self = shift; |
289
|
|
|
|
|
|
|
|
290
|
51212
|
|
|
|
|
58987
|
delete $self->{cache}; |
291
|
51212
|
|
|
|
|
81457
|
foreach (@_) { |
292
|
90136
|
|
|
|
|
174826
|
$self->{data}{ $self->_round($_) }++; |
293
|
90133
|
|
|
|
|
164623
|
$self->{count}++; |
294
|
|
|
|
|
|
|
}; |
295
|
51209
|
|
|
|
|
97184
|
$self; |
296
|
|
|
|
|
|
|
}; |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=head2 count |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
Returns number of data points. |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=cut |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
sub count { |
305
|
69
|
|
|
69
|
1
|
10577
|
my $self = shift; |
306
|
69
|
|
|
|
|
314
|
return $self->{count}; |
307
|
|
|
|
|
|
|
}; |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=head2 min |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=head2 max |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
Values of minimal and maximal bins. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
NOTE: Due to rounding, some of the actual inserted values may fall outside |
316
|
|
|
|
|
|
|
of the min..max range. This may change in the future. |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub min { |
321
|
|
|
|
|
|
|
my $self = shift; |
322
|
|
|
|
|
|
|
return $self->_sort->[0]; |
323
|
|
|
|
|
|
|
}; |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
sub max { |
326
|
|
|
|
|
|
|
my $self = shift; |
327
|
|
|
|
|
|
|
return $self->_sort->[-1]; |
328
|
|
|
|
|
|
|
}; |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
=head2 sample_range |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
Return sample range of the dataset, i.e. max() - min(). |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub sample_range { |
337
|
5
|
|
|
5
|
1
|
597
|
my $self = shift; |
338
|
5
|
100
|
|
|
|
14
|
return $self->count ? $self->max - $self->min : undef; |
339
|
|
|
|
|
|
|
}; |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head2 sum |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Return sum of all data points. |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=cut |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub sum { |
348
|
|
|
|
|
|
|
my $self = shift; |
349
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] }); |
350
|
|
|
|
|
|
|
}; |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=head2 sumsq |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
Return sum of squares of all datapoints. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=cut |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
sub sumsq { |
359
|
|
|
|
|
|
|
my $self = shift; |
360
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] * $_[0] }); |
361
|
|
|
|
|
|
|
}; |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=head2 mean |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Return mean, or average value, i.e. sum()/count(). |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=cut |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
sub mean { |
370
|
|
|
|
|
|
|
my $self = shift; |
371
|
|
|
|
|
|
|
return $self->{count} ? $self->sum / $self->{count} : undef; |
372
|
|
|
|
|
|
|
}; |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
=head2 variance |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 variance( $correction ) |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Return data variance, i.e. E((x - E(x)) ** 2). |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
381
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
384
|
|
|
|
|
|
|
This is not yet accounted for. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=cut |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
sub variance { |
389
|
|
|
|
|
|
|
my $self = shift; |
390
|
|
|
|
|
|
|
my $correction = shift; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# in fact we'll receive correction='' because of how cache works |
393
|
|
|
|
|
|
|
$correction = 1 unless defined $correction and length $correction; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
return 0 if ($self->{count} < 1 + $correction); |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
my $var = $self->sumsq - $self->sum**2 / $self->{count}; |
398
|
|
|
|
|
|
|
return $var <= 0 ? 0 : $var / ( $self->{count} - $correction ); |
399
|
|
|
|
|
|
|
}; |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head2 standard_deviation |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head2 standard_deviation( $correction ) |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head2 std_dev |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Return standard deviation, i.e. square root of variance. |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
410
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
413
|
|
|
|
|
|
|
This is not yet accounted for. |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=cut |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
sub standard_deviation { |
418
|
|
|
|
|
|
|
my $self = shift; |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
return sqrt($self->variance(@_)); |
421
|
|
|
|
|
|
|
}; |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=head2 cdf ($x) |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Cumulative distribution function. Returns estimated probability of |
426
|
|
|
|
|
|
|
random data point from the sample being less than C<$x>. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
As a special case, C accounts for I of zeroth bin count (if any). |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full, but appears in |
431
|
|
|
|
|
|
|
L. |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=head2 cdf ($x, $y) |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Returns probability of a value being between C<$x> and C<$y> ($x <= $y). |
436
|
|
|
|
|
|
|
This is essentially C. |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=cut |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
sub cdf { |
441
|
4
|
|
|
4
|
1
|
7
|
my $self = shift; |
442
|
4
|
50
|
|
|
|
12
|
return unless $self->{count}; |
443
|
4
|
|
|
|
|
9
|
return $self->_count(@_) / $self->{count}; |
444
|
|
|
|
|
|
|
}; |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 percentile( $n ) |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Find $n-th percentile, i.e. a value below which lies $n % of the data. |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
0-th percentile is by definition -inf and is returned as undef |
451
|
|
|
|
|
|
|
(see Statistics::Descriptive). |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
$n is a real number, not necessarily integer. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=cut |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
sub percentile { |
458
|
46
|
|
|
46
|
1
|
8715
|
my $self = shift; |
459
|
46
|
|
|
|
|
77
|
my $x = shift; |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# assert 0<=$x<=100 |
462
|
46
|
50
|
33
|
|
|
294
|
croak __PACKAGE__.": percentile() argument must be between 0 and 100" |
463
|
|
|
|
|
|
|
unless 0<= $x and $x <= 100; |
464
|
|
|
|
|
|
|
|
465
|
46
|
|
|
|
|
126
|
my $need = $x * $self->{count} / 100; |
466
|
46
|
100
|
|
|
|
161
|
return if $need < 1; |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# dichotomize |
469
|
|
|
|
|
|
|
# $i is lowest value >= needed |
470
|
|
|
|
|
|
|
# $need doesnt exceed last bin! |
471
|
41
|
|
|
|
|
124
|
my $i = _bin_search_ge( $self->_probability, $need ); |
472
|
41
|
|
|
|
|
113
|
return $self->_sort->[ $i ]; |
473
|
|
|
|
|
|
|
}; |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head2 quantile( 0..4 ) |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
From Statistics::Descriptive manual: |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
0 => zero quartile (Q0) : minimal value |
480
|
|
|
|
|
|
|
1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile |
481
|
|
|
|
|
|
|
2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile |
482
|
|
|
|
|
|
|
3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile |
483
|
|
|
|
|
|
|
4 => fourth quartile (Q4) : maximal value |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=cut |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub quantile { |
488
|
|
|
|
|
|
|
my $self = shift; |
489
|
|
|
|
|
|
|
my $t = shift; |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
croak (__PACKAGE__.": quantile() argument must be one of 0..4") |
492
|
|
|
|
|
|
|
unless $t =~ /^[0-4]$/; |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
$t or return $self->min; |
495
|
|
|
|
|
|
|
return $self->percentile($t * 100 / 4); |
496
|
|
|
|
|
|
|
}; |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=head2 median |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
Return median of data, a value that divides the sample in half. |
501
|
|
|
|
|
|
|
Same as percentile(50). |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
sub median { |
506
|
16
|
|
|
16
|
1
|
993
|
my $self = shift; |
507
|
16
|
|
|
|
|
52
|
return $self->percentile(50); |
508
|
|
|
|
|
|
|
}; |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=head2 trimmed_mean( $ltrim, [ $utrim ] ) |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
Return mean of sample with $ltrim and $utrim fraction of data points |
513
|
|
|
|
|
|
|
remover from lower and upper ends respectively. |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
ltrim defaults to 0, and rtrim to ltrim. |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=cut |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
sub trimmed_mean { |
520
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
521
|
1
|
|
|
|
|
2
|
my ($lower, $upper) = @_; |
522
|
1
|
|
50
|
|
|
4
|
$lower ||= 0; |
523
|
1
|
50
|
|
|
|
4
|
$upper = $lower unless defined $upper; |
524
|
|
|
|
|
|
|
|
525
|
1
|
|
|
|
|
5
|
my $min = $self->percentile($lower * 100); |
526
|
1
|
|
|
|
|
5
|
my $max = $self->percentile(100 - $upper * 100); |
527
|
|
|
|
|
|
|
|
528
|
1
|
50
|
|
|
|
4
|
return unless $min < $max; |
529
|
|
|
|
|
|
|
|
530
|
1
|
|
|
5
|
|
7
|
return $self->mean_of(sub{$_[0]}, $min, $max); |
|
5
|
|
|
|
|
13
|
|
531
|
|
|
|
|
|
|
}; |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
=head2 harmonic_mean |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
Return harmonic mean of the data, i.e. 1/E(1/x). |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
Return undef if division by zero occurs (see Statistics::Descriptive). |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=cut |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
sub harmonic_mean { |
542
|
2
|
|
|
2
|
1
|
614
|
my $self = shift; |
543
|
|
|
|
|
|
|
|
544
|
2
|
|
|
|
|
4
|
my $ret; |
545
|
2
|
|
|
|
|
4
|
eval { |
546
|
2
|
|
|
5
|
|
7
|
$ret = $self->count / $self->sum_of(sub { 1/$_[0] }); |
|
5
|
|
|
|
|
14
|
|
547
|
|
|
|
|
|
|
}; |
548
|
2
|
50
|
66
|
|
|
23
|
if ($@ and $@ !~ /division.*zero/) { |
549
|
0
|
|
|
|
|
0
|
die $@; # rethrow ALL BUT 1/0 which yields undef |
550
|
|
|
|
|
|
|
}; |
551
|
2
|
|
|
|
|
10
|
return $ret; |
552
|
|
|
|
|
|
|
}; |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head2 geometric_mean |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
Return geometric mean of the data, that is, exp(E(log x)). |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Dies unless all data points are of the same sign. |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=cut |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
sub geometric_mean { |
563
|
2
|
|
|
2
|
1
|
618
|
my $self = shift; |
564
|
|
|
|
|
|
|
|
565
|
2
|
100
|
|
|
|
8
|
return unless $self->count; |
566
|
1
|
50
|
|
|
|
5
|
croak __PACKAGE__.": geometric_mean() called on mixed sign sample" |
567
|
|
|
|
|
|
|
if $self->min * $self->max < 0; |
568
|
|
|
|
|
|
|
|
569
|
1
|
50
|
|
|
|
5
|
return 0 if $self->{data}{0}; |
570
|
|
|
|
|
|
|
# this must be dog slow, but we already log() too much at this point. |
571
|
1
|
|
|
5
|
|
6
|
my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} ); |
|
5
|
|
|
|
|
15
|
|
572
|
1
|
50
|
|
|
|
5
|
return $self->min < 0 ? -$ret : $ret; |
573
|
|
|
|
|
|
|
}; |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head2 skewness |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
Return skewness of the distribution, calculated as |
578
|
|
|
|
|
|
|
n/(n-1)(n-2) * E((x-E(x))**3)/std_dev**3 (this is consistent with Excel). |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=cut |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
sub skewness { |
583
|
20
|
|
|
20
|
1
|
1307
|
my $self = shift; |
584
|
|
|
|
|
|
|
|
585
|
20
|
|
|
|
|
40
|
my $n = $self->{count}; |
586
|
20
|
100
|
|
|
|
65
|
return unless $n > 2; |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
589
|
18
|
|
|
|
|
51
|
my $skew = $n * $self->std_moment(3); |
590
|
18
|
|
|
|
|
42
|
my $correction = $n / ( ($n-1) * ($n-2) ); |
591
|
18
|
|
|
|
|
89
|
return $correction * $skew; |
592
|
|
|
|
|
|
|
}; |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=head2 kurtosis |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
Return kurtosis of the distribution, that is 4-th standardized moment - 3. |
597
|
|
|
|
|
|
|
The exact formula used here is consistent with that of Excel and |
598
|
|
|
|
|
|
|
Statistics::Descriptive. |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=cut |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
sub kurtosis { |
603
|
22
|
|
|
22
|
1
|
639
|
my $self = shift; |
604
|
|
|
|
|
|
|
|
605
|
22
|
|
|
|
|
47
|
my $n = $self->{count}; |
606
|
22
|
100
|
|
|
|
70
|
return unless $n > 3; |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
609
|
20
|
|
|
|
|
50
|
my $kurt = $n * $self->std_moment(4); |
610
|
20
|
|
|
|
|
65
|
my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) ); |
611
|
20
|
|
|
|
|
48
|
my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) ); |
612
|
|
|
|
|
|
|
|
613
|
20
|
|
|
|
|
115
|
return $correction1 * $kurt - $correction2; |
614
|
|
|
|
|
|
|
}; |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=head2 central_moment( $n ) |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
Return $n-th central moment, that is, E((x - E(x))^$n). |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=cut |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
sub central_moment { |
625
|
|
|
|
|
|
|
my $self = shift; |
626
|
|
|
|
|
|
|
my $n = shift; |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
my $mean = $self->mean; |
629
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count}; |
630
|
|
|
|
|
|
|
}; |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=head2 std_moment( $n ) |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
Return $n-th standardized moment, that is, |
635
|
|
|
|
|
|
|
E((x - E(x))**$n) / std_dev(x)**$n. |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=cut |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
sub std_moment { |
642
|
|
|
|
|
|
|
my $self = shift; |
643
|
|
|
|
|
|
|
my $n = shift; |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
my $mean = $self->mean; |
646
|
|
|
|
|
|
|
my $dev = $self->std_dev; |
647
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) |
648
|
|
|
|
|
|
|
/ ( $dev**$n * $self->{count} ); |
649
|
|
|
|
|
|
|
}; |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
=head2 mode |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
Mode of a distribution is the most common value for a discrete distribution, |
654
|
|
|
|
|
|
|
or maximum of probability density for continuous one. |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
For now we assume that the distribution IS discrete, and return the bin with |
657
|
|
|
|
|
|
|
the biggest hit count. |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
NOTE A better algorithm is still wanted. Experimental. |
660
|
|
|
|
|
|
|
Behavior may change in the future. |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=cut |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
# Naive implementation |
665
|
|
|
|
|
|
|
# Find bin w/greatest count and return it |
666
|
|
|
|
|
|
|
sub mode { |
667
|
|
|
|
|
|
|
my $self = shift; |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
return if !$self->count; |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
my $index = $self->_sort; |
672
|
|
|
|
|
|
|
return $index->[0] if @$index == 1; |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
my @count = map { $self->{data}{$_} } @$index; |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
my $max_index; |
677
|
|
|
|
|
|
|
my $max_growth = 0; |
678
|
|
|
|
|
|
|
for (my $i = 0; $i<@count; $i++) { |
679
|
|
|
|
|
|
|
$count[$i] > $max_growth or next; |
680
|
|
|
|
|
|
|
$max_index = $i; |
681
|
|
|
|
|
|
|
$max_growth = $count[$i]; |
682
|
|
|
|
|
|
|
}; |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
return $index->[$max_index]; |
685
|
|
|
|
|
|
|
}; |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( \@index ) |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( $n ) |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
=head2 frequency_distribution_ref |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
Return numbers of data point counts below each number in @index as hashref. |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
If a number is given instead of arrayref, @index is created |
696
|
|
|
|
|
|
|
by dividing [min, max] into $n intervals. |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
If no parameters are given, return previous result, if any. |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=cut |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
sub frequency_distribution_ref { |
703
|
3
|
|
|
3
|
1
|
628
|
my $self = shift; |
704
|
3
|
|
|
|
|
6
|
my $index = shift; |
705
|
|
|
|
|
|
|
|
706
|
3
|
100
|
|
|
|
11
|
return unless $self->count; |
707
|
|
|
|
|
|
|
# ah, compatibility - return last value |
708
|
|
|
|
|
|
|
return $self->{cache}{frequency_distribution_ref} |
709
|
2
|
50
|
|
|
|
6
|
unless defined $index; |
710
|
|
|
|
|
|
|
# make index if number given |
711
|
2
|
100
|
|
|
|
5
|
if (!ref $index) { |
712
|
1
|
50
|
|
|
|
8
|
croak __PACKAGE__.": frequency_distribution_ref(): ". |
713
|
|
|
|
|
|
|
"argument must be array, of number > 2, not $index" |
714
|
|
|
|
|
|
|
unless $index > 2; |
715
|
1
|
|
|
|
|
5
|
my $min = $self->_lower($self->min); |
716
|
1
|
|
|
|
|
4
|
my $max = $self->_upper($self->max); |
717
|
1
|
|
|
|
|
3
|
my $step = ($max - $min) / $index; |
718
|
1
|
|
|
|
|
2
|
$index = [ map { $min + $_ * $step } 1..$index ]; |
|
5
|
|
|
|
|
11
|
|
719
|
|
|
|
|
|
|
}; |
720
|
|
|
|
|
|
|
|
721
|
2
|
|
|
|
|
8
|
@$index = (-$INF, sort { $a <=> $b } @$index); |
|
12
|
|
|
|
|
25
|
|
722
|
|
|
|
|
|
|
|
723
|
2
|
|
|
|
|
3
|
my @count; |
724
|
2
|
|
|
|
|
10
|
for (my $i = 0; $i<@$index-1; $i++) { |
725
|
9
|
|
|
|
|
26
|
push @count, $self->_count( $index->[$i], $index->[$i+1] ); |
726
|
|
|
|
|
|
|
}; |
727
|
2
|
|
|
|
|
4
|
shift @$index; # remove -inf |
728
|
|
|
|
|
|
|
|
729
|
2
|
|
|
|
|
3
|
my %hash; |
730
|
2
|
|
|
|
|
22
|
@hash{@$index} = @count; |
731
|
2
|
|
|
|
|
5
|
$self->{cache}{frequency_distribution_ref} = \%hash; |
732
|
2
|
|
|
|
|
9
|
return \%hash; |
733
|
|
|
|
|
|
|
}; |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=head1 Specific methods |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
The folowing methods only apply to this module, or are experimental. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
=cut |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=head2 bucket_width |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
Get bin width (relative to center of bin). Percentiles are off |
744
|
|
|
|
|
|
|
by no more than half of this. DEPRECATED. |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=cut |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
sub bucket_width { |
749
|
5
|
|
|
5
|
1
|
9
|
my $self = shift; |
750
|
5
|
|
|
|
|
29
|
return $self->{base} - 1; |
751
|
|
|
|
|
|
|
}; |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=head2 log_base |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
Get upper/lower bound ratio for logarithmic bins. |
756
|
|
|
|
|
|
|
This represents relative precision of sample. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=head2 linear_width |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
Get width of linear buckets. |
761
|
|
|
|
|
|
|
This represents absolute precision of sample. |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=head2 linear_threshold |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
Get absolute value threshold below which interpolation is switched to linear. |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
=cut |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
sub log_base { |
770
|
3
|
|
|
3
|
1
|
7
|
my $self = shift; |
771
|
3
|
|
|
|
|
14
|
return $self->{base}; |
772
|
|
|
|
|
|
|
}; |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
sub linear_width { |
775
|
3
|
|
|
3
|
1
|
5
|
my $self = shift; |
776
|
3
|
|
|
|
|
21
|
return $self->{linear_width}; |
777
|
|
|
|
|
|
|
}; |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
sub linear_threshold { |
780
|
8
|
|
|
8
|
1
|
30
|
my $self = shift; |
781
|
8
|
|
|
|
|
44
|
return $self->{linear_thresh}; |
782
|
|
|
|
|
|
|
}; |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=head2 add_data_hash ( { value => weight, ... } ) |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
Add values with counts/weights. |
787
|
|
|
|
|
|
|
This can be used to import data from other |
788
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale object. |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Negative counts are allowed and treated as "forgetting" data. |
793
|
|
|
|
|
|
|
If a bin count goes below zero, such bin is simply discarded. |
794
|
|
|
|
|
|
|
Minus infinity weight is allowed and has the same effect. |
795
|
|
|
|
|
|
|
Data is guaranteed to remain consistent. |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef, or +infinity), |
798
|
|
|
|
|
|
|
an exception is thrown and nothing changes. |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
B Cache may be reset, even if no data was actually inserted. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
803
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
804
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=cut |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
sub add_data_hash { |
809
|
19
|
|
|
19
|
1
|
49
|
my $self = shift; |
810
|
19
|
|
|
|
|
36
|
my $hash = shift; |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
# check incoming data for being numeric, and no +inf values |
813
|
19
|
|
|
|
|
30
|
eval { |
814
|
22
|
|
|
22
|
|
56346
|
use warnings FATAL => qw(numeric); |
|
22
|
|
|
|
|
57
|
|
|
22
|
|
|
|
|
42584
|
|
815
|
19
|
|
|
|
|
99
|
while (my ($k, $v) = each %$hash) { |
816
|
289
|
100
|
33
|
|
|
2487
|
$k == 0+$k and $v == 0+$v and $v < $INF |
|
|
|
66
|
|
|
|
|
817
|
|
|
|
|
|
|
or die "Infinite count for $k\n"; |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
}; |
820
|
19
|
100
|
|
|
|
290
|
croak __PACKAGE__.": add_data_hash failed: $@" |
821
|
|
|
|
|
|
|
if $@; |
822
|
|
|
|
|
|
|
|
823
|
17
|
|
|
|
|
29
|
delete $self->{cache}; |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
# update our counters |
826
|
17
|
|
|
|
|
82
|
foreach (keys %$hash) { |
827
|
286
|
50
|
|
|
|
630
|
next unless $hash->{$_}; |
828
|
286
|
|
|
|
|
555
|
my $key = $self->_round($_); |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
# Insert data. Make sure -Inf doesn't corrupt our counter. |
831
|
286
|
|
100
|
|
|
1381
|
my $newcount = ($self->{data}{$key} || 0) + $hash->{$_}; |
832
|
286
|
100
|
|
|
|
530
|
if ($newcount > 0) { |
833
|
|
|
|
|
|
|
# normal insert |
834
|
274
|
|
|
|
|
781
|
$self->{data}{$key} = $newcount; |
835
|
274
|
|
|
|
|
565
|
$self->{count} += $hash->{$_}; |
836
|
|
|
|
|
|
|
} else { |
837
|
|
|
|
|
|
|
# We're "forgetting" data, AND the bin got empty |
838
|
12
|
|
100
|
|
|
58
|
$self->{count} -= delete $self->{data}{$key} || 0; |
839
|
|
|
|
|
|
|
}; |
840
|
|
|
|
|
|
|
}; |
841
|
17
|
|
|
|
|
49
|
$self; |
842
|
|
|
|
|
|
|
}; |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
=head2 get_data_hash( %options ) |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
Return distribution hashref {value => number of occurances}. |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
This is inverse of add_data_hash. |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
Options may include: |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=over |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=item * min - ignore values below this. (See find_boundaries) |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=item * max - ignore values above this. (See find_boundaries) |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. (See find_boundaries) |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. (See find_boundaries) |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=item * noise_thresh - strip bins with count below this. |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
=back |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
=cut |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
sub get_data_hash { |
869
|
63
|
|
|
63
|
1
|
13235
|
my ($self, %opt) = @_; |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# shallow copy of data if no options given |
872
|
63
|
100
|
|
|
|
196
|
return {%{ $self->{data} }} unless %opt; |
|
61
|
|
|
|
|
820
|
|
873
|
|
|
|
|
|
|
|
874
|
2
|
|
|
|
|
30
|
my ($min, $max) = $self->find_boundaries( %opt ); |
875
|
2
|
|
100
|
|
|
15
|
my $noize = $opt{noize_thresh} || 0; |
876
|
|
|
|
|
|
|
|
877
|
2
|
|
|
|
|
4
|
my $data = $self->{data}; |
878
|
2
|
|
|
|
|
4
|
my %hash; |
879
|
2
|
|
|
|
|
11
|
foreach (keys %$data ) { |
880
|
13
|
100
|
|
|
|
42
|
$_ < $min and next; |
881
|
8
|
50
|
|
|
|
17
|
$_ > $max and next; |
882
|
8
|
100
|
|
|
|
20
|
$data->{$_} < $noize and next; |
883
|
6
|
|
|
|
|
14
|
$hash{$_} = $data->{$_}; |
884
|
|
|
|
|
|
|
}; |
885
|
|
|
|
|
|
|
|
886
|
2
|
|
|
|
|
11
|
return \%hash; |
887
|
|
|
|
|
|
|
}; |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
=head2 TO_JSON() |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
Return enough data to recreate the whole object as an unblessed hashref. |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
This routine conforms with C, hence the name. |
894
|
|
|
|
|
|
|
Can be called as |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this ); |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
B This module DOES NOT require JSON::XS or serialize to JSON. |
899
|
|
|
|
|
|
|
It just deals with data. |
900
|
|
|
|
|
|
|
Use C, C, C or any serializer of choice. |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
my $raw_data = $stat->TO_JSON; |
903
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale->new( %$raw_data ); |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
Would generate an exact copy of C<$stat> object |
906
|
|
|
|
|
|
|
(provided it's S::D::L and not a subclass). |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
=head2 clone( [ %options ] ) |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
Copy constructor - returns copy of an existing object. |
911
|
|
|
|
|
|
|
Cache is not preserved. |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
Constructor options may be given to override existing data. See new(). |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
Trim options may be given to get partial data. See get_data_hash(). |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
=cut |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
sub clone { |
920
|
7
|
|
|
7
|
1
|
3397
|
my ($self, %opt) = @_; |
921
|
|
|
|
|
|
|
|
922
|
7
|
|
|
|
|
18
|
my $raw = $self->TO_JSON; |
923
|
7
|
100
|
|
|
|
23
|
if (%opt) { |
924
|
|
|
|
|
|
|
$raw->{data} = $self->get_data_hash( %opt ) |
925
|
4
|
100
|
|
|
|
18
|
unless exists $opt{data}; |
926
|
|
|
|
|
|
|
exists $opt{$_} and $raw->{$_} = $opt{$_} |
927
|
4
|
|
66
|
|
|
51
|
for @new_keys; |
928
|
|
|
|
|
|
|
}; |
929
|
|
|
|
|
|
|
|
930
|
7
|
|
|
|
|
35
|
return (ref $self)->new( %$raw ); |
931
|
|
|
|
|
|
|
}; |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
sub TO_JSON { |
934
|
31
|
|
|
31
|
1
|
152
|
my $self = shift; |
935
|
|
|
|
|
|
|
# UGLY HACK Increase linear_thresh by a factor of base ** 1/10 |
936
|
|
|
|
|
|
|
# so that it's rounded down to present value |
937
|
|
|
|
|
|
|
return { |
938
|
|
|
|
|
|
|
CLASS => ref $self, |
939
|
|
|
|
|
|
|
VERSION => $VERSION, |
940
|
|
|
|
|
|
|
base => $self->{base}, |
941
|
|
|
|
|
|
|
linear_width => $self->{linear_width}, |
942
|
31
|
|
|
|
|
150
|
linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10, |
943
|
|
|
|
|
|
|
data => $self->get_data_hash, |
944
|
|
|
|
|
|
|
}; |
945
|
|
|
|
|
|
|
}; |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head2 scale_sample( $scale ) |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
Multiply all bins' counts by given value. This can be used to adjust |
950
|
|
|
|
|
|
|
significance of previous data before adding new data (e.g. gradually |
951
|
|
|
|
|
|
|
"forgetting" past data in a long-running application). |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=cut |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
sub scale_sample { |
956
|
1
|
|
|
1
|
1
|
8
|
my $self = shift; |
957
|
1
|
|
|
|
|
2
|
my $factor = shift; |
958
|
1
|
50
|
|
|
|
4
|
$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive"); |
959
|
|
|
|
|
|
|
|
960
|
1
|
|
|
|
|
3
|
delete $self->{cache}; |
961
|
1
|
|
|
|
|
2
|
foreach (@{ $self->_sort }) { |
|
1
|
|
|
|
|
9
|
|
962
|
100
|
|
|
|
|
175
|
$self->{data}{$_} *= $factor; |
963
|
|
|
|
|
|
|
}; |
964
|
1
|
|
|
|
|
7
|
$self->{count} *= $factor; |
965
|
1
|
|
|
|
|
3
|
return $self; |
966
|
|
|
|
|
|
|
}; |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
=head2 mean_of( $code, [$min, $max] ) |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
Return expectation of $code over sample within given range. |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
$code is expected to be a pure function (i.e. depending only on its input |
973
|
|
|
|
|
|
|
value, and having no side effects). |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
The underlying integration mechanism only calculates $code once per bin, |
976
|
|
|
|
|
|
|
so $code should be stable as in not vary wildly over small intervals. |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
=cut |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
sub mean_of { |
981
|
24
|
|
|
24
|
1
|
55
|
my $self = shift; |
982
|
24
|
|
|
|
|
40
|
my ($code, $min, $max) = @_; |
983
|
|
|
|
|
|
|
|
984
|
24
|
|
|
429
|
|
108
|
my $weight = $self->sum_of( sub {1}, $min, $max ); |
|
429
|
|
|
|
|
861
|
|
985
|
24
|
100
|
|
|
|
114
|
return unless $weight; |
986
|
23
|
|
|
|
|
52
|
return $self->sum_of($code, $min, $max) / $weight; |
987
|
|
|
|
|
|
|
}; |
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
=head1 Experimental methods |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
These methods may be subject to change in the future, or stay, if they |
992
|
|
|
|
|
|
|
are good. |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
=head2 sum_of ( $code, [ $min, $max ] ) |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Integrate arbitrary function over the sample within the [ $min, $max ] interval. |
997
|
|
|
|
|
|
|
Default values for both limits are infinities of appropriate sign. |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
Values in the edge bins are cut using interpolation if needed. |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points |
1002
|
|
|
|
|
|
|
between $a and $b. |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
EXPERIMENTAL. The method name may change in the future. |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
=cut |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
sub sum_of { |
1009
|
145
|
|
|
145
|
1
|
764
|
my $self = shift; |
1010
|
145
|
|
|
|
|
292
|
my ($code, $realmin, $realmax) = @_; |
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
# Just app up stuff |
1013
|
145
|
100
|
66
|
|
|
546
|
if (!defined $realmin and !defined $realmax) { |
1014
|
122
|
|
|
|
|
165
|
my $sum = 0; |
1015
|
122
|
|
|
|
|
166
|
while (my ($val, $count) = each %{ $self->{data} }) { |
|
63224
|
|
|
|
|
168850
|
|
1016
|
63102
|
|
|
|
|
100217
|
$sum += $count * $code->( $val ); |
1017
|
|
|
|
|
|
|
}; |
1018
|
122
|
|
|
|
|
691
|
return $sum; |
1019
|
|
|
|
|
|
|
}; |
1020
|
|
|
|
|
|
|
|
1021
|
23
|
100
|
|
|
|
52
|
$realmin = -$INF unless defined $realmin; |
1022
|
23
|
100
|
|
|
|
50
|
$realmax = $INF unless defined $realmax; |
1023
|
23
|
50
|
|
|
|
53
|
return 0 if( $realmin >= $realmax ); |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
# correct limits. $min, $max are indices; $left, $right are limits |
1026
|
23
|
|
|
|
|
46
|
my $min = $self->_round($realmin); |
1027
|
23
|
|
|
|
|
49
|
my $max = $self->_round($realmax); |
1028
|
23
|
|
|
|
|
52
|
my $left = $self->_lower($realmin); |
1029
|
23
|
|
|
|
|
51
|
my $right = $self->_upper($realmax); |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
# find first bin that's above $left |
1032
|
23
|
|
|
|
|
52
|
my $keys = $self->_sort; |
1033
|
23
|
|
|
|
|
52
|
my $i = _bin_search_ge($keys, $left); |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
# warn "sum_of [$min, $max]"; |
1036
|
|
|
|
|
|
|
# add up bins |
1037
|
23
|
|
|
|
|
31
|
my $sum = 0; |
1038
|
23
|
|
|
|
|
51
|
for (; $i < @$keys; $i++) { |
1039
|
284
|
|
|
|
|
855
|
my $val = $keys->[$i]; |
1040
|
284
|
100
|
|
|
|
510
|
last if $val > $right; |
1041
|
264
|
|
|
|
|
569
|
$sum += $self->{data}{$val} * $code->( $val ); |
1042
|
|
|
|
|
|
|
}; |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
# cut edges: the hard part |
1045
|
|
|
|
|
|
|
# min and max are now used as indices |
1046
|
|
|
|
|
|
|
# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-) |
1047
|
|
|
|
|
|
|
# warn "Add up, sum_of = $sum"; |
1048
|
23
|
100
|
|
|
|
134
|
if ($self->{data}{$max}) { |
1049
|
20
|
|
|
|
|
45
|
my $width = $self->_upper($max) - $self->_lower($max); |
1050
|
20
|
100
|
|
|
|
61
|
my $part = $width |
1051
|
|
|
|
|
|
|
? ($self->_upper($max) - $realmax) / $width |
1052
|
|
|
|
|
|
|
: 0.5; |
1053
|
20
|
|
|
|
|
99
|
$sum -= $self->{data}{$max} * $code->($max) * $part; |
1054
|
|
|
|
|
|
|
}; |
1055
|
|
|
|
|
|
|
# warn "Cut R, sum_of = $sum"; |
1056
|
23
|
100
|
|
|
|
113
|
if ($self->{data}{$min}) { |
1057
|
20
|
|
|
|
|
40
|
my $width = $self->_upper($min) - $self->_lower($min); |
1058
|
20
|
100
|
|
|
|
56
|
my $part = $width |
1059
|
|
|
|
|
|
|
? ($realmin - $self->_lower($min)) / $width |
1060
|
|
|
|
|
|
|
: 0.5; |
1061
|
20
|
|
|
|
|
71
|
$sum -= $self->{data}{$min} * $code->($min) * $part; |
1062
|
|
|
|
|
|
|
}; |
1063
|
|
|
|
|
|
|
# warn "Cut L, sum_of = $sum"; |
1064
|
|
|
|
|
|
|
|
1065
|
23
|
|
|
|
|
129
|
return $sum; |
1066
|
|
|
|
|
|
|
}; # end sum_of |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=head2 histogram ( %options ) |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ] |
1071
|
|
|
|
|
|
|
where countX_Y is number of data points between X and Y. |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
Options may include: |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=over |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
=item * count (+) - number of intervals to divide sample into. |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=item * index (+) - interval borders as array. Will be sorted before processing. |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
=item * min - ignore values below this. Default = $self->min - epsilon. |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=item * max - ignore values above this. Default = $self->max + epsilon. |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=item * normalize_to - adjust counts so that max number becomes nnn. |
1090
|
|
|
|
|
|
|
This may be useful if one intends to draw pictures. |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
=back |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
Either count or index must be present. |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
NOTE: this is equivalent to frequency_distribution_ref but better suited |
1097
|
|
|
|
|
|
|
for omitting sample tails and outputting pretty pictures. |
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
=cut |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
sub histogram { |
1102
|
5
|
|
|
5
|
1
|
1977
|
my ($self, %opt) = @_; |
1103
|
|
|
|
|
|
|
|
1104
|
5
|
100
|
|
|
|
13
|
return unless $self->count; |
1105
|
4
|
|
|
|
|
14
|
my ($min, $max) = $self->find_boundaries( %opt ); |
1106
|
|
|
|
|
|
|
# build/check index |
1107
|
4
|
100
|
|
|
|
6
|
my @index = @{ $opt{index} || [] }; |
|
4
|
|
|
|
|
22
|
|
1108
|
4
|
100
|
|
|
|
12
|
if (!@index) { |
1109
|
3
|
|
|
|
|
6
|
my $n = $opt{count}; |
1110
|
3
|
50
|
|
|
|
8
|
$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )"); |
1111
|
3
|
|
|
|
|
33
|
my $step = ($max - $min) / $n; |
1112
|
3
|
|
|
|
|
12
|
for (my $x = $min; $x <= $max; $x += $step) { |
1113
|
16
|
|
|
|
|
38
|
push @index, $x; |
1114
|
|
|
|
|
|
|
}; |
1115
|
|
|
|
|
|
|
} else { |
1116
|
|
|
|
|
|
|
# sort & uniq raw index |
1117
|
1
|
|
|
|
|
3
|
my %known; |
1118
|
1
|
|
|
|
|
2
|
@index = grep { !$known{$_}++ } @index; |
|
5
|
|
|
|
|
17
|
|
1119
|
1
|
|
|
|
|
4
|
@index = sort { $a <=> $b } @index; |
|
8
|
|
|
|
|
11
|
|
1120
|
1
|
50
|
|
|
|
5
|
@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)"); |
1121
|
|
|
|
|
|
|
}; |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
# finally: estimated counts between indices! |
1124
|
4
|
|
|
|
|
6
|
my @ret; |
1125
|
4
|
|
|
|
|
14
|
for ( my $i = 0; $i<@index-1; $i++) { |
1126
|
17
|
|
|
|
|
41
|
my $count = $self->_count( $index[$i], $index[$i+1] ); |
1127
|
17
|
|
|
|
|
78
|
push @ret, [ $count, $index[$i], $index[$i+1] ]; |
1128
|
|
|
|
|
|
|
}; |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
# if normalize - find maximum & divide by it |
1131
|
4
|
100
|
|
|
|
11
|
if (my $norm = $opt{normalize_to}) { |
1132
|
1
|
|
|
|
|
3
|
my $max = 0; |
1133
|
|
|
|
|
|
|
$max < $_->[0] and $max = $_->[0] |
1134
|
1
|
|
66
|
|
|
9
|
for @ret; |
1135
|
1
|
|
|
|
|
2
|
$norm /= $max; |
1136
|
1
|
|
|
|
|
5
|
$_->[0] *= $norm for @ret; |
1137
|
|
|
|
|
|
|
}; |
1138
|
|
|
|
|
|
|
|
1139
|
4
|
|
|
|
|
18
|
return \@ret; |
1140
|
|
|
|
|
|
|
}; |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
=head2 find_boundaries( %opt ) |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
Return ($min, $max) of part of sample denoted by options. |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
Options may include: |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
=over |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
=item * min - ignore values below this. default = min() - epsilon. |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=item * max - ignore values above this. default = max() + epsilon. |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
=back |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
If no options are given, the whole sample is guaranteed to reside between |
1161
|
|
|
|
|
|
|
returned values. |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
=cut |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
sub find_boundaries { |
1166
|
9
|
|
|
9
|
1
|
1187
|
my $self = shift; |
1167
|
9
|
|
|
|
|
26
|
my %opt = @_; |
1168
|
|
|
|
|
|
|
|
1169
|
9
|
100
|
|
|
|
33
|
return unless $self->count; |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
# preprocess boundaries |
1172
|
8
|
100
|
|
|
|
39
|
my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min ); |
1173
|
8
|
100
|
|
|
|
39
|
my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max ); |
1174
|
|
|
|
|
|
|
|
1175
|
8
|
100
|
|
|
|
33
|
if ($opt{ltrim}) { |
1176
|
1
|
|
|
|
|
5
|
my $newmin = $self->percentile( $opt{ltrim} ); |
1177
|
1
|
50
|
33
|
|
|
7
|
defined $newmin and $newmin > $min and $min = $newmin; |
1178
|
|
|
|
|
|
|
}; |
1179
|
8
|
100
|
|
|
|
23
|
if ($opt{utrim}) { |
1180
|
1
|
|
|
|
|
3
|
my $newmax = $self->percentile( 100-$opt{utrim} ); |
1181
|
1
|
50
|
33
|
|
|
14
|
defined $newmax and $newmax < $max and $max = $newmax; |
1182
|
|
|
|
|
|
|
}; |
1183
|
|
|
|
|
|
|
|
1184
|
8
|
|
|
|
|
30
|
return ($min, $max); |
1185
|
|
|
|
|
|
|
}; |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
################################################################ |
1188
|
|
|
|
|
|
|
# No more public methods please |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
# MEMOIZE |
1191
|
|
|
|
|
|
|
# We'll keep methods' returned values under {cache}. |
1192
|
|
|
|
|
|
|
# All setters destroy said cache altogether. |
1193
|
|
|
|
|
|
|
# PLEASE replace this with a ready-made module if there's one. |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
# Sorry for this black magic, but it's too hard to write //= in EVERY method |
1196
|
|
|
|
|
|
|
# Long story short |
1197
|
|
|
|
|
|
|
# The next sub replaces $self->foo() with |
1198
|
|
|
|
|
|
|
# sub { $self->{cache}{foo} //= $self->originnal_foo } |
1199
|
|
|
|
|
|
|
# All setter methods are EXPECTED to destroy {cache} altogether. |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
# NOTE if you plan subclassing the method, re-memoize methods you change. |
1202
|
|
|
|
|
|
|
sub _memoize_method { |
1203
|
242
|
|
|
242
|
|
438
|
my ($class, $name, $arg) = @_; |
1204
|
|
|
|
|
|
|
|
1205
|
242
|
|
|
|
|
1041
|
my $orig_code = $class->can($name); |
1206
|
242
|
50
|
|
|
|
646
|
die "Error in memoizer section ($name)" |
1207
|
|
|
|
|
|
|
unless ref $orig_code eq 'CODE'; |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
# begin long conditional |
1210
|
|
|
|
|
|
|
my $cached_code = !$arg |
1211
|
|
|
|
|
|
|
? sub { |
1212
|
203
|
100
|
|
203
|
|
5906
|
if (!exists $_[0]->{cache}{$name}) { |
1213
|
100
|
|
|
|
|
277
|
$_[0]->{cache}{$name} = $orig_code->($_[0]); |
1214
|
|
|
|
|
|
|
}; |
1215
|
203
|
|
|
|
|
859
|
return $_[0]->{cache}{$name}; |
1216
|
|
|
|
|
|
|
} |
1217
|
|
|
|
|
|
|
: sub { |
1218
|
169
|
|
|
169
|
|
1657
|
my $self = shift; |
1219
|
169
|
|
|
|
|
236
|
my $arg = shift; |
1220
|
169
|
100
|
|
|
|
409
|
$arg = '' unless defined $arg; |
1221
|
|
|
|
|
|
|
|
1222
|
169
|
100
|
|
|
|
593
|
if (!exists $self->{cache}{"$name:$arg"}) { |
1223
|
100
|
|
|
|
|
239
|
$self->{cache}{"$name:$arg"} = $orig_code->($self, $arg); |
1224
|
|
|
|
|
|
|
}; |
1225
|
168
|
|
|
|
|
798
|
return $self->{cache}{"$name:$arg"}; |
1226
|
242
|
100
|
|
|
|
994
|
}; |
1227
|
|
|
|
|
|
|
# conditional ends here |
1228
|
|
|
|
|
|
|
|
1229
|
22
|
|
|
22
|
|
133
|
no strict 'refs'; ## no critic |
|
22
|
|
|
|
|
38
|
|
|
22
|
|
|
|
|
760
|
|
1230
|
22
|
|
|
22
|
|
118
|
no warnings 'redefine'; ## no critic |
|
22
|
|
|
|
|
47
|
|
|
22
|
|
|
|
|
2694
|
|
1231
|
242
|
|
|
|
|
312
|
*{$class."::".$name} = $cached_code; |
|
242
|
|
|
|
|
997
|
|
1232
|
|
|
|
|
|
|
}; # end of _memoize_method |
1233
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
# Memoize all the methods w/o arguments |
1235
|
|
|
|
|
|
|
foreach ( qw(sum sumsq mean min max mode) ) { |
1236
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_); |
1237
|
|
|
|
|
|
|
}; |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
# Memoize methods with 1 argument |
1240
|
|
|
|
|
|
|
foreach (qw(quantile central_moment std_moment standard_deviation variance)) { |
1241
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_, 1); |
1242
|
|
|
|
|
|
|
}; |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
# add shorter alias of standard_deviation (this must happen AFTER memoization) |
1245
|
|
|
|
|
|
|
{ |
1246
|
22
|
|
|
22
|
|
329
|
no warnings 'once'; ## no critic |
|
22
|
|
|
|
|
38
|
|
|
22
|
|
|
|
|
9137
|
|
1247
|
|
|
|
|
|
|
*std_dev = \&standard_deviation; |
1248
|
|
|
|
|
|
|
}; |
1249
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
# Get number of values below $x |
1251
|
|
|
|
|
|
|
# Like sum_of(sub{1}, undef, $x), but faster. |
1252
|
|
|
|
|
|
|
# Used by cdf() |
1253
|
|
|
|
|
|
|
sub _count { |
1254
|
105
|
|
|
105
|
|
145
|
my $self = shift; |
1255
|
105
|
100
|
|
|
|
259
|
@_>1 and return $self->_count($_[1]) - $self->_count($_[0]); |
1256
|
74
|
|
|
|
|
88
|
my $x = shift; |
1257
|
|
|
|
|
|
|
|
1258
|
74
|
|
|
|
|
134
|
my $upper = $self->_upper($x); |
1259
|
74
|
|
|
|
|
156
|
my $i = _bin_search_gt( $self->_sort, $upper ); |
1260
|
74
|
100
|
|
|
|
180
|
!$i-- and return 0; |
1261
|
65
|
|
|
|
|
140
|
my $count = $self->_probability->[$i]; |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
# interpolate |
1264
|
65
|
|
|
|
|
130
|
my $bin = $self->_round( $x ); |
1265
|
65
|
100
|
|
|
|
325
|
if (my $val = $self->{data}{$bin}) { |
1266
|
34
|
|
|
|
|
57
|
my $width = ($upper - $bin) * 2; |
1267
|
34
|
100
|
|
|
|
67
|
my $part = $width ? ( ($upper - $x) / $width) : 1/2; |
1268
|
34
|
|
|
|
|
56
|
$count -= $part * $val; |
1269
|
|
|
|
|
|
|
}; |
1270
|
65
|
|
|
|
|
210
|
return $count; |
1271
|
|
|
|
|
|
|
}; |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
# BINARY SEARCH |
1274
|
|
|
|
|
|
|
# Not a method, just a function |
1275
|
|
|
|
|
|
|
# Takes sorted \@array and a $num |
1276
|
|
|
|
|
|
|
# Return lowest $i such that $array[$i] >= $num |
1277
|
|
|
|
|
|
|
# Return (scalar @array) if no such $i exists |
1278
|
|
|
|
|
|
|
sub _bin_search_ge { |
1279
|
146
|
|
|
146
|
|
370
|
my ($array, $x) = @_; |
1280
|
|
|
|
|
|
|
|
1281
|
146
|
100
|
100
|
|
|
796
|
return 0 unless @$array and $array->[0] < $x; |
1282
|
125
|
|
|
|
|
160
|
my $l = 0; |
1283
|
125
|
|
|
|
|
183
|
my $r = @$array; |
1284
|
125
|
|
|
|
|
308
|
while ($l+1 < $r) { |
1285
|
403
|
|
|
|
|
632
|
my $m = int( ($l + $r) /2); |
1286
|
403
|
100
|
|
|
|
1252
|
$array->[$m] < $x ? $l = $m : $r = $m; |
1287
|
|
|
|
|
|
|
}; |
1288
|
125
|
|
|
|
|
347
|
return $l+1; |
1289
|
|
|
|
|
|
|
}; |
1290
|
|
|
|
|
|
|
sub _bin_search_gt { |
1291
|
74
|
|
|
74
|
|
103
|
my ($array, $x) = @_; |
1292
|
74
|
|
|
|
|
133
|
my $i = _bin_search_ge(@_); |
1293
|
74
|
100
|
100
|
|
|
326
|
$i++ if defined $array->[$i] and $array->[$i] == $x; |
1294
|
74
|
|
|
|
|
115
|
return $i; |
1295
|
|
|
|
|
|
|
}; |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
# THE CORE |
1298
|
|
|
|
|
|
|
# Here come the number=>bin functions |
1299
|
|
|
|
|
|
|
# round() generates bin center |
1300
|
|
|
|
|
|
|
# upper() and lower() are respective boundaries. |
1301
|
|
|
|
|
|
|
# Here's the algorithm: |
1302
|
|
|
|
|
|
|
# 1) determine whether bin is linear or logarithmic |
1303
|
|
|
|
|
|
|
# 2) for linear bins, return bin# * bucket_width (==2*absolute error) |
1304
|
|
|
|
|
|
|
# add/subtract 0.5 to get edges. |
1305
|
|
|
|
|
|
|
# 3) for logarithmic bins, return base ** bin# with appropriate sign |
1306
|
|
|
|
|
|
|
# multiply by precalculated constant to get edges |
1307
|
|
|
|
|
|
|
# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2 |
1308
|
|
|
|
|
|
|
# This part is fragile and can be written better |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
# center of bin containing x |
1311
|
|
|
|
|
|
|
sub _round { |
1312
|
90888
|
|
|
90888
|
|
109040
|
my $self = shift; |
1313
|
90888
|
|
|
|
|
109305
|
my $x = shift; |
1314
|
|
|
|
|
|
|
|
1315
|
22
|
|
|
22
|
|
180
|
use warnings FATAL => qw(numeric uninitialized); |
|
22
|
|
|
|
|
37
|
|
|
22
|
|
|
|
|
3570
|
|
1316
|
|
|
|
|
|
|
|
1317
|
90888
|
100
|
|
|
|
189372
|
if (abs($x) <= $self->{linear_thresh}) { |
1318
|
|
|
|
|
|
|
return $self->{linear_width} |
1319
|
1761
|
|
100
|
|
|
6796
|
&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 ); |
1320
|
|
|
|
|
|
|
}; |
1321
|
89124
|
|
|
|
|
225567
|
my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase}); |
1322
|
89124
|
|
|
|
|
152199
|
my $value = $self->{base} ** $i; |
1323
|
89124
|
100
|
|
|
|
415819
|
return $x < 0 ? -$value : $value; |
1324
|
|
|
|
|
|
|
}; |
1325
|
|
|
|
|
|
|
|
1326
|
|
|
|
|
|
|
# lower, upper limits of bin containing x |
1327
|
|
|
|
|
|
|
sub _lower { |
1328
|
882
|
|
|
882
|
|
58216
|
my $self = shift; |
1329
|
882
|
|
|
|
|
1151
|
my $x = shift; |
1330
|
|
|
|
|
|
|
|
1331
|
22
|
|
|
22
|
|
116
|
use warnings FATAL => qw(numeric uninitialized); |
|
22
|
|
|
|
|
33
|
|
|
22
|
|
|
|
|
8391
|
|
1332
|
|
|
|
|
|
|
|
1333
|
882
|
100
|
|
|
|
2365
|
if (abs($x) <= $self->{linear_thresh}) { |
1334
|
|
|
|
|
|
|
return $self->{linear_width} |
1335
|
183
|
|
66
|
|
|
1419
|
&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5); |
1336
|
|
|
|
|
|
|
}; |
1337
|
699
|
|
|
|
|
2309
|
my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase}); |
1338
|
699
|
100
|
|
|
|
1571
|
if ($x > 0) { |
1339
|
310
|
|
|
|
|
1300
|
return $self->{floor} * $self->{base}**($i); |
1340
|
|
|
|
|
|
|
} else { |
1341
|
389
|
|
|
|
|
1627
|
return -$self->{floor} * $self->{base}**($i+1); |
1342
|
|
|
|
|
|
|
}; |
1343
|
|
|
|
|
|
|
}; |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
sub _upper { |
1346
|
462
|
|
|
462
|
|
4814
|
return -$_[0]->_lower(-$_[1]); |
1347
|
|
|
|
|
|
|
}; |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
# build bin index |
1350
|
|
|
|
|
|
|
sub _sort { |
1351
|
189
|
|
|
189
|
|
260
|
my $self = shift; |
1352
|
|
|
|
|
|
|
return $self->{cache}{sorted} |
1353
|
189
|
|
100
|
|
|
863
|
||= [ sort { $a <=> $b } keys %{ $self->{data} } ]; |
|
1232
|
|
|
|
|
1909
|
|
|
31
|
|
|
|
|
262
|
|
1354
|
|
|
|
|
|
|
}; |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
# build cumulative bin counts index |
1357
|
|
|
|
|
|
|
sub _probability { |
1358
|
106
|
|
|
106
|
|
134
|
my $self = shift; |
1359
|
106
|
|
66
|
|
|
425
|
return $self->{cache}{probability} ||= do { |
1360
|
19
|
|
|
|
|
35
|
my @array; |
1361
|
19
|
|
|
|
|
36
|
my $sum = 0; |
1362
|
19
|
|
|
|
|
33
|
foreach (@{ $self->_sort }) { |
|
19
|
|
|
|
|
56
|
|
1363
|
160
|
|
|
|
|
255
|
$sum += $self->{data}{$_}; |
1364
|
160
|
|
|
|
|
299
|
push @array, $sum; |
1365
|
|
|
|
|
|
|
}; |
1366
|
19
|
|
|
|
|
129
|
\@array; |
1367
|
|
|
|
|
|
|
}; |
1368
|
|
|
|
|
|
|
}; |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
=head1 AUTHOR |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
Konstantin S. Uvarin, C<< >> |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
=head1 BUGS |
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
The module is currently under development. There may be bugs. |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
C only works for discrete distributions, and simply returns |
1379
|
|
|
|
|
|
|
the first bin with largest bin count. |
1380
|
|
|
|
|
|
|
A better algorithm is wanted. |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
C should have been made a private method. |
1383
|
|
|
|
|
|
|
Its signature and/or name may change in the future. |
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
See the TODO file in the distribution package. |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
Please feel free to post bugs and/or feature requests to github: |
1388
|
|
|
|
|
|
|
L |
1389
|
|
|
|
|
|
|
|
1390
|
|
|
|
|
|
|
Alternatively, you can use CPAN RT |
1391
|
|
|
|
|
|
|
via e-mail C, |
1392
|
|
|
|
|
|
|
or through the web interface at |
1393
|
|
|
|
|
|
|
L. |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
Your contribution is appreciated. |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
=head1 SUPPORT |
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
1400
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
perldoc Statistics::Descriptive::LogScale |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
You can also look for information at: |
1404
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
=over 4 |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
=item * GitHub: |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
L |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here) |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
L |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
L |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
=item * CPAN Ratings |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
L |
1422
|
|
|
|
|
|
|
|
1423
|
|
|
|
|
|
|
=item * Search CPAN |
1424
|
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
|
L |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
=back |
1428
|
|
|
|
|
|
|
|
1429
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
This module was inspired by a talk that Andrew Aksyonoff, author of |
1432
|
|
|
|
|
|
|
L, |
1433
|
|
|
|
|
|
|
has given at HighLoad++ conference in Moscow, 2012. |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
L was and is used as reference when in doubt. |
1436
|
|
|
|
|
|
|
Several code snippets were shamelessly stolen from there. |
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
C and C parameter names were suggested by |
1439
|
|
|
|
|
|
|
CountZero from http://perlmonks.org |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
Copyright 2013-2015 Konstantin S. Uvarin. |
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
1446
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
1447
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
=cut |
1452
|
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
|
1; # End of Statistics::Descriptive::LogScale |