File Coverage

blib/lib/Statistics/Descriptive/LogScale.pm

Criterion	Covered	Total	%
statement	363	364	99.7
branch	139	164	84.7
condition	52	68	76.4
subroutine	56	56	100.0
pod	28	28	100.0
total	638	680	93.8

line	stmt	bran	cond	sub	pod	time	code
1	23			23		1406756	use 5.006;
	23					334
2	23			23		147	use strict;
	23					52
	23					623
3	23			23		125	use warnings;
	23					61
	23					1692
4
5							package Statistics::Descriptive::LogScale;
6
7							=head1 NAME
8
9							Statistics::Descriptive::LogScale - Memory-efficient approximate univariate
10							descriptive statistics class.
11
12							=head1 VERSION
13
14							Version 0.10
15
16							=cut
17
18							our $VERSION = 0.10;
19
20							=head1 SYNOPSIS
21
22							=head2 Basic usage
23
24							The basic usage is roughly the same as that of L.
25
26							use Statistics::Descriptive::LogScale;
27							my $stat = Statistics::Descriptive::LogScale->new ();
28
29							while(<>) {
30							chomp;
31							$stat->add_data($_);
32							};
33
34							# This can also be done in O(1) memory, precisely
35							printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation;
36							# This requires storing actual data, or approximating
37							printf "25%% : %f\n", $stat->percentile(25);
38							printf "Median: %f\n", $stat->median;
39							printf "75%% : %f\n", $stat->percentile(75);
40
41							=head2 Save/load
42
43							This is not present in L.
44							The save/load interface is designed compatible with JSON::XS.
45							However, any other serializer can be used.
46							The C method is I to return unblessed hashref
47							with enough information to restore the original object.
48
49							use Statistics::Descriptive::LogScale;
50							my $stat = Statistics::Descriptive::LogScale->new ();
51
52							# ..... much later
53							# Save
54							print $fd encoder_of_choice( $stat->TO_JSON )
55							or die "Failed to save: $!";
56
57							# ..... and even later
58							# Load
59							my $plain_hash = decoder_of_choice( $raw_data );
60							my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash );
61
62							# Import into existing LogScale instance
63							my $plain_hash = decoder_of_choice( $more_raw_data );
64							$copy_of_stat->add_data_hash( $plain_hash->{data} );
65
66							=head2 Histograms
67
68							Both L and L
69							offer C method for querying data point counts.
70							However, there's also C method for making pretty pictures.
71							Here's a simple text-based histogram.
72							A proper GD example was too long to fit into this margin.
73
74							use strict;
75							use warnings;
76
77							use Statistics::Descriptive::LogScale;
78							my $stat = Statistics::Descriptive::LogScale->new ();
79
80							# collect/load data ...
81							my $re_float = qr([-+]?(?:\d+\.?\d*\|\.\d+)(?:[Ee][-+]?\d+)?);
82							while (<>) {
83							$stat->add_data($_) for m/($re_float)/g;
84							};
85							die "Empty set"
86							unless $stat->count;
87
88							# get data in [ count, lower_bound, upper_bound ] format as arrayref
89							my $hist = $stat->histogram( count => 20 );
90
91							# find maximum value to use as a scale factor
92							my $scale = $hist->[0][0];
93							$scale < $_->[0] and $scale = $_->[0] for @$hist;
94
95							foreach (@$hist) {
96							printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale);
97							};
98							printf "%10f\n", $hist->[-1][2];
99
100							=head1 DESCRIPTION
101
102							This module aims at providing some advanced statistical functions without
103							storing all data in memory, at the cost of certain (predictable) precision loss.
104
105							Data is represented by a set of bins that only store counts of fitting
106							data points.
107							Most bins are logarithmic, i.e. lower end / upper end ratio is constant.
108							However, around zero linear approximation may be user instead
109							(see "linear_width" and "linear_thresh" parameters in new()).
110
111							All operations are then performed on the bins, introducing relative error
112							which does not, however, exceed the bins' relative width ("base").
113
114							=head1 METHODS
115
116							=cut
117
118							########################################################################
119							# == HOW IT WORKS ==
120							# Buckets are stored in a hash: { $value => $count, ... }
121							# {base} is bin width, {logbase} == log {base} (cache)
122							# {linear_thresh} is where we switch to equal bin approximation
123							# {linear_width} is width of bin around zero (==linear_thresh if not given)
124							# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor}
125							# Nearly all meaningful subs have to scan all the bins, which is bad,
126							# but anyway better than scanning full sample.
127
128	23			23		160	use Carp;
	23					56
	23					1504
129	23			23		7634	use POSIX qw(floor ceil);
	23					136190
	23					165
130
131							# Fields are NOT used internally for now, so this is just a declaration
132	23					104	use fields qw(
133							data count
134							linear_width base logbase floor linear_thresh logfloor
135							cache
136	23			23		41360	);
	23					34367
137
138							# Some internal constants
139							# This is for infinite portability^W^W portable infinity
140							my $INF = 999;
141							my $re_num = qr/(?:[-+]?(?:\d+\.?\d*\|\.\d+)(?:[Ee][-+]?\d+)?)/;
142
143							=head2 new( %options )
144
145							%options may include:
146
147							=over
148
149							=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives
150							1% precision and exact decimal powers.
151							This value represents acceptable relative error in analysis results.
152
153							B Actual value may be slightly less than requested one.
154							This is done so to avoid troubles with future rounding in (de)serialization.
155
156							=item * linear_width - width of linear bins around zero.
157							This value represents precision of incoming data.
158							Default is zero, i.e. we assume that the measurement is precise.
159
160							B Actual value may be less (by no more than a factor of C)
161							so that borders of linear and logarithmic bins fit nicely.
162
163							=item * linear_thresh - where to switch to linear approximation.
164							If only one of C and C is given,
165							the other will be calculated.
166							However, user may want to specify both in some cases.
167
168							B Actual value may be less (by no more than a factor of C)
169							so that borders of linear and logarithmic bins fit nicely.
170
171							=item * data - hashref with C<{ value => weight }> for initializing data.
172							Used for cloning.
173							See C.
174
175							=item * zero_thresh - absolute value threshold below which everything is
176							considered zero.
177							DEPRECATED, C and C override this if given.
178
179							=back
180
181							=cut
182
183							my @new_keys = qw( base linear_thresh linear_width zero_thresh data );
184							# TODO Throw if extra options given?
185							sub new {
186	42			42	1	24214	my $class = shift;
187	42					187	my %opt = @_;
188
189							# base for logarithmic bins, sane default: +-1%, exact decimal powers
190							# UGLY HACK number->string->number to avoid
191							# future serialization inconsistencies
192	42		100			275	$opt{base} \|\|= 10**(1/232);
193	42					376	$opt{base} = 0 + "$opt{base}";
194	42	50				170	$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1";
195
196							# calculate where to switch to linear approximation
197							# the condition is: linear bin( thresh ) ~~ log bin( thresh )
198							# i.e. thresh * base - thresh ~~ absolute error * 2
199							# i.e. thresh ~~ absolute_error * 2 / (base - 1)
200							# also support legacy API (zero_thresh)
201	42	100				150	if (defined $opt{linear_thresh} ) {
202	13		100			61	$opt{linear_width} \|\|= $opt{linear_thresh} * ($opt{base}-1);
203							} else {
204	29					83	$opt{linear_thresh} = $opt{zero_thresh};
205							};
206							$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1)
207	42	100	100			217	if $opt{linear_width} and !$opt{linear_thresh};
208	42		100			206	$opt{linear_thresh} \|\|= 0;
209	42	50				172	$opt{linear_thresh} >= 0
210							or croak __PACKAGE__.": new(): linear_thresh must be >= 0";
211
212							# Can't use fields::new anymore
213							# due to JSON::XS incompatibility with restricted hashes
214	42					135	my $self = bless {}, $class;
215
216	42					198	$self->{base} = $opt{base};
217							# cache values to ease calculations
218							# floor = (lower end of bin) / (center of bin)
219	42					187	$self->{floor} = 2/(1+$opt{base});
220	42					186	$self->{logbase} = log $opt{base};
221	42					127	$self->{logfloor} = log $self->{floor};
222
223							# bootstrap linear_thresh - make it fit bin edge
224	42					109	$self->{linear_width} = $self->{linear_thresh} = 0;
225	42					192	$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} );
226
227							# divide anything below linear_thresh into odd number of bins
228							# not exceeding requested linear_width
229	42	100				146	if ($self->{linear_thresh}) {
230	15		66			64	my $linear_width = $opt{linear_width} \|\| 2 * $self->{linear_thresh};
231	15					76	my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width));
232	15	100				63	$n_linear++ unless $n_linear % 2;
233	15					48	$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear);
234							};
235
236	42					177	$self->clear;
237	42	100				139	if ($opt{data}) {
238	10					37	$self->add_data_hash($opt{data});
239							};
240	42					206	return $self;
241							};
242
243							=head1 General statistical methods
244
245							These methods are used to query the distribution properties. They generally
246							follow the interface of L and co,
247							with minor additions.
248
249							All methods return C on empty data set, except for
250							C, C, C, C and C
251							which all return 0.
252
253							B This module caches whatever it calculates very agressively.
254							Don't hesitate to use statistical functions (except for sum_of/mean_of)
255							more than once. The cache is deleted upon data entry.
256
257							=head2 clear
258
259							Destroy all stored data.
260
261							=cut
262
263							sub clear {
264	56			56	1	8319	my $self = shift;
265	56					2266	$self->{data} = {};
266	56					181	$self->{count} = 0;
267	56					150	delete $self->{cache};
268	56					115	return $self;
269							};
270
271							=head2 add_data( @data )
272
273							Add numbers to the data pool.
274
275							Returns self, so that methods can be chained.
276
277							If incorrect data is given (i.e. non-numeric, undef),
278							an exception is thrown and only partial data gets inserted.
279							The state of object is guaranteed to remain consistent in such case.
280
281							B Cache is reset, even if no data was actually inserted.
282
283							B It is possible to add infinite values to data pool.
284							The module will try and calculate whatever can still be calculated.
285							However, no portable way of serializing such values is done yet.
286
287							=cut
288
289							sub add_data {
290	51213			51213	1	162300	my $self = shift;
291
292	51213					71115	delete $self->{cache};
293	51213					80032	foreach (@_) {
294	90141					152375	$self->{data}{ $self->_round($_) }++;
295	90138					167178	$self->{count}++;
296							};
297	51210					92938	$self;
298							};
299
300							=head2 count
301
302							Returns number of data points.
303
304							=cut
305
306							sub count {
307	69			69	1	10188	my $self = shift;
308	69					284	return $self->{count};
309							};
310
311							=head2 min
312
313							=head2 max
314
315							Values of minimal and maximal bins.
316
317							NOTE: Due to rounding, some of the actual inserted values may fall outside
318							of the min..max range. This may change in the future.
319
320							=cut
321
322							sub min {
323							my $self = shift;
324							return $self->_sort->[0];
325							};
326
327							sub max {
328							my $self = shift;
329							return $self->_sort->[-1];
330							};
331
332							=head2 sample_range
333
334							Return sample range of the dataset, i.e. max() - min().
335
336							=cut
337
338							sub sample_range {
339	5			5	1	739	my $self = shift;
340	5	100				16	return $self->count ? $self->max - $self->min : undef;
341							};
342
343							=head2 sum
344
345							Return sum of all data points.
346
347							=cut
348
349							sub sum {
350							my $self = shift;
351							return $self->sum_of(sub { $_[0] });
352							};
353
354							=head2 sumsq
355
356							Return sum of squares of all datapoints.
357
358							=cut
359
360							sub sumsq {
361							my $self = shift;
362							return $self->sum_of(sub { $_[0] * $_[0] });
363							};
364
365							=head2 mean
366
367							Return mean, or average value, i.e. sum()/count().
368
369							=cut
370
371							sub mean {
372							my $self = shift;
373							return $self->{count} ? $self->sum / $self->{count} : undef;
374							};
375
376							=head2 variance
377
378							=head2 variance( $correction )
379
380							Return data variance, i.e. E((x - E(x)) ** 2).
381
382							Bessel's correction (division by n-1 instead of n) is used by default.
383							This may be changed by specifying $correction explicitly.
384
385							B The binning strategy used here should also introduce variance bias.
386							This is not yet accounted for.
387
388							=cut
389
390							sub variance {
391							my $self = shift;
392							my $correction = shift;
393
394							# in fact we'll receive correction='' because of how cache works
395							$correction = 1 unless defined $correction and length $correction;
396
397							return 0 if ($self->{count} < 1 + $correction);
398
399							my $var = $self->sumsq - $self->sum**2 / $self->{count};
400							return $var <= 0 ? 0 : $var / ( $self->{count} - $correction );
401							};
402
403							=head2 standard_deviation
404
405							=head2 standard_deviation( $correction )
406
407							=head2 std_dev
408
409							Return standard deviation, i.e. square root of variance.
410
411							Bessel's correction (division by n-1 instead of n) is used by default.
412							This may be changed by specifying $correction explicitly.
413
414							B The binning strategy used here should also introduce variance bias.
415							This is not yet accounted for.
416
417							=cut
418
419							sub standard_deviation {
420							my $self = shift;
421
422							return sqrt($self->variance(@_));
423							};
424
425							=head2 cdf ($x)
426
427							Cumulative distribution function. Returns estimated probability of
428							random data point from the sample being less than C<$x>.
429
430							As a special case, C accounts for I of zeroth bin count (if any).
431
432							Not present in Statistics::Descriptive::Full, but appears in
433							L.
434
435							=head2 cdf ($x, $y)
436
437							Returns probability of a value being between C<$x> and C<$y> ($x <= $y).
438							This is essentially C.
439
440							=cut
441
442							sub cdf {
443	4			4	1	9	my $self = shift;
444	4	50				11	return unless $self->{count};
445	4					10	return $self->_count(@_) / $self->{count};
446							};
447
448							=head2 percentile( $n )
449
450							Find $n-th percentile, i.e. a value below which lies $n % of the data.
451
452							0-th percentile is by definition -inf and is returned as undef
453							(see Statistics::Descriptive).
454
455							$n is a real number, not necessarily integer.
456
457							=cut
458
459							sub percentile {
460	48			48	1	3068	my $self = shift;
461	48					80	my $x = shift;
462
463							# assert 0<=$x<=100
464	48	50	33			218	croak __PACKAGE__.": percentile() argument must be between 0 and 100"
465							unless 0<= $x and $x <= 100;
466
467	48					124	my $need = $x * $self->{count} / 100;
468	48	100				137	return if $need < 1;
469
470							# dichotomize
471							# $i is lowest value >= needed
472							# $need doesnt exceed last bin!
473	42					99	my $i = _bin_search_ge( $self->_probability, $need );
474	42					94	return $self->_sort->[ $i ];
475							};
476
477							=head2 quantile( 0..4 )
478
479							From Statistics::Descriptive manual:
480
481							0 => zero quartile (Q0) : minimal value
482							1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
483							2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
484							3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
485							4 => fourth quartile (Q4) : maximal value
486
487							=cut
488
489							sub quantile {
490							my $self = shift;
491							my $t = shift;
492
493							croak (__PACKAGE__.": quantile() argument must be one of 0..4")
494							unless $t =~ /^[0-4]$/;
495
496							$t or return $self->min;
497							return $self->percentile($t * 100 / 4);
498							};
499
500							=head2 median
501
502							Return median of data, a value that divides the sample in half.
503							Same as percentile(50).
504
505							=cut
506
507							sub median {
508	16			16	1	1171	my $self = shift;
509	16					46	return $self->percentile(50);
510							};
511
512							=head2 trimmed_mean( $ltrim, [ $utrim ] )
513
514							Return mean of sample with $ltrim and $utrim fraction of data points
515							remover from lower and upper ends respectively.
516
517							ltrim defaults to 0, and rtrim to ltrim.
518
519							=cut
520
521							sub trimmed_mean {
522	1			1	1	3	my $self = shift;
523	1					2	my ($lower, $upper) = @_;
524	1		50			3	$lower \|\|= 0;
525	1	50				4	$upper = $lower unless defined $upper;
526
527	1					4	my $min = $self->percentile($lower * 100);
528	1					4	my $max = $self->percentile(100 - $upper * 100);
529
530	1	50				4	return unless $min < $max;
531
532	1			5		6	return $self->mean_of(sub{$_[0]}, $min, $max);
	5					12
533							};
534
535							=head2 harmonic_mean
536
537							Return harmonic mean of the data, i.e. 1/E(1/x).
538
539							Return undef if division by zero occurs (see Statistics::Descriptive).
540
541							=cut
542
543							sub harmonic_mean {
544	2			2	1	708	my $self = shift;
545
546	2					4	my $ret;
547	2					4	eval {
548	2			5		6	$ret = $self->count / $self->sum_of(sub { 1/$_[0] });
	5					22
549							};
550	2	50	66			23	if ($@ and $@ !~ /division.*zero/) {
551	0					0	die $@; # rethrow ALL BUT 1/0 which yields undef
552							};
553	2					12	return $ret;
554							};
555
556							=head2 geometric_mean
557
558							Return geometric mean of the data, that is, exp(E(log x)).
559
560							Dies unless all data points are of the same sign.
561
562							=cut
563
564							sub geometric_mean {
565	2			2	1	719	my $self = shift;
566
567	2	100				28	return unless $self->count;
568	1	50				3	croak __PACKAGE__.": geometric_mean() called on mixed sign sample"
569							if $self->min * $self->max < 0;
570
571	1	50				3	return 0 if $self->{data}{0};
572							# this must be dog slow, but we already log() too much at this point.
573	1			5		6	my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} );
	5					13
574	1	50				5	return $self->min < 0 ? -$ret : $ret;
575							};
576
577							=head2 skewness
578
579							Return skewness of the distribution, calculated as
580							n/(n-1)(n-2) * E((x-E(x))3)/std_dev3 (this is consistent with Excel).
581
582							=cut
583
584							sub skewness {
585	20			20	1	1567	my $self = shift;
586
587	20					50	my $n = $self->{count};
588	20	100				73	return unless $n > 2;
589
590							# code stolen from Statistics::Descriptive
591	18					59	my $skew = $n * $self->std_moment(3);
592	18					49	my $correction = $n / ( ($n-1) * ($n-2) );
593	18					95	return $correction * $skew;
594							};
595
596							=head2 kurtosis
597
598							Return kurtosis of the distribution, that is 4-th standardized moment - 3.
599							The exact formula used here is consistent with that of Excel and
600							Statistics::Descriptive.
601
602							=cut
603
604							sub kurtosis {
605	22			22	1	805	my $self = shift;
606
607	22					56	my $n = $self->{count};
608	22	100				85	return unless $n > 3;
609
610							# code stolen from Statistics::Descriptive
611	20					53	my $kurt = $n * $self->std_moment(4);
612	20					73	my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
613	20					58	my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
614
615	20					99	return $correction1 * $kurt - $correction2;
616							};
617
618							=head2 central_moment( $n )
619
620							Return $n-th central moment, that is, E((x - E(x))^$n).
621
622							Not present in Statistics::Descriptive::Full.
623
624							=cut
625
626							sub central_moment {
627							my $self = shift;
628							my $n = shift;
629
630							my $mean = $self->mean;
631							return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count};
632							};
633
634							=head2 std_moment( $n )
635
636							Return $n-th standardized moment, that is,
637							E((x - E(x))$n) / std_dev(x)$n.
638
639							Not present in Statistics::Descriptive::Full.
640
641							=cut
642
643							sub std_moment {
644							my $self = shift;
645							my $n = shift;
646
647							my $mean = $self->mean;
648							my $dev = $self->std_dev;
649							return $self->sum_of(sub{ ($_[0] - $mean) ** $n })
650							/ ( $dev*$n $self->{count} );
651							};
652
653							=head2 mode
654
655							Mode of a distribution is the most common value for a discrete distribution,
656							or maximum of probability density for continuous one.
657
658							For now we assume that the distribution IS discrete, and return the bin with
659							the biggest hit count.
660
661							NOTE A better algorithm is still wanted. Experimental.
662							Behavior may change in the future.
663
664							=cut
665
666							# Naive implementation
667							# Find bin w/greatest count and return it
668							sub mode {
669							my $self = shift;
670
671							return if !$self->count;
672
673							my $index = $self->_sort;
674							return $index->[0] if @$index == 1;
675
676							my @count = map { $self->{data}{$_} } @$index;
677
678							my $max_index;
679							my $max_growth = 0;
680							for (my $i = 0; $i<@count; $i++) {
681							$count[$i] > $max_growth or next;
682							$max_index = $i;
683							$max_growth = $count[$i];
684							};
685
686							return $index->[$max_index];
687							};
688
689							=head2 frequency_distribution_ref( \@index )
690
691							=head2 frequency_distribution_ref( $n )
692
693							=head2 frequency_distribution_ref
694
695							Return numbers of data point counts below each number in @index as hashref.
696
697							If a number is given instead of arrayref, @index is created
698							by dividing [min, max] into $n intervals.
699
700							If no parameters are given, return previous result, if any.
701
702							=cut
703
704							sub frequency_distribution_ref {
705	3			3	1	1106	my $self = shift;
706	3					5	my $index = shift;
707
708	3	100				9	return unless $self->count;
709							# ah, compatibility - return last value
710							return $self->{cache}{frequency_distribution_ref}
711	2	50				5	unless defined $index;
712							# make index if number given
713	2	100				8	if (!ref $index) {
714	1	50				3	croak __PACKAGE__.": frequency_distribution_ref(): ".
715							"argument must be array, of number > 2, not $index"
716							unless $index > 2;
717	1					8	my $min = $self->_lower($self->min);
718	1					4	my $max = $self->_upper($self->max);
719	1					3	my $step = ($max - $min) / $index;
720	1					4	$index = [ map { $min + $_ * $step } 1..$index ];
	5					9
721							};
722
723	2					10	@$index = (-$INF, sort { $a <=> $b } @$index);
	12					24
724
725	2					5	my @count;
726	2					9	for (my $i = 0; $i<@$index-1; $i++) {
727	9					27	push @count, $self->_count( $index->[$i], $index->[$i+1] );
728							};
729	2					5	shift @$index; # remove -inf
730
731	2					9	my %hash;
732	2					16	@hash{@$index} = @count;
733	2					7	$self->{cache}{frequency_distribution_ref} = \%hash;
734	2					9	return \%hash;
735							};
736
737							=head1 Specific methods
738
739							The folowing methods only apply to this module, or are experimental.
740
741							=cut
742
743							=head2 bucket_width
744
745							Get bin width (relative to center of bin). Percentiles are off
746							by no more than half of this. DEPRECATED.
747
748							=cut
749
750							sub bucket_width {
751	5			5	1	10	my $self = shift;
752	5					32	return $self->{base} - 1;
753							};
754
755							=head2 log_base
756
757							Get upper/lower bound ratio for logarithmic bins.
758							This represents relative precision of sample.
759
760							=head2 linear_width
761
762							Get width of linear buckets.
763							This represents absolute precision of sample.
764
765							=head2 linear_threshold
766
767							Get absolute value threshold below which interpolation is switched to linear.
768
769							=cut
770
771							sub log_base {
772	3			3	1	11	my $self = shift;
773	3					34	return $self->{base};
774							};
775
776							sub linear_width {
777	3			3	1	6	my $self = shift;
778	3					26	return $self->{linear_width};
779							};
780
781							sub linear_threshold {
782	8			8	1	33	my $self = shift;
783	8					41	return $self->{linear_thresh};
784							};
785
786							=head2 add_data_hash ( { value => weight, ... } )
787
788							Add values with counts/weights.
789							This can be used to import data from other
790							Statistics::Descriptive::LogScale object.
791
792							Returns self, so that methods can be chained.
793
794							Negative counts are allowed and treated as "forgetting" data.
795							If a bin count goes below zero, such bin is simply discarded.
796							Minus infinity weight is allowed and has the same effect.
797							Data is guaranteed to remain consistent.
798
799							If incorrect data is given (i.e. non-numeric, undef, or +infinity),
800							an exception is thrown and nothing changes.
801
802							B Cache may be reset, even if no data was actually inserted.
803
804							B It is possible to add infinite values to data pool.
805							The module will try and calculate whatever can still be calculated.
806							However, no portable way of serializing such values is done yet.
807
808							=cut
809
810							sub add_data_hash {
811	19			19	1	62	my $self = shift;
812	19					41	my $hash = shift;
813
814							# check incoming data for being numeric, and no +inf values
815	19					34	eval {
816	23			23		51696	use warnings FATAL => qw(numeric);
	23					64
	23					46706
817	19					114	while (my ($k, $v) = each %$hash) {
818	289	100	33			1546	$k == 0+$k and $v == 0+$v and $v < $INF
			66
819							or die "Infinite count for $k\n";
820							}
821							};
822	19	100				278	croak __PACKAGE__.": add_data_hash failed: $@"
823							if $@;
824
825	17					40	delete $self->{cache};
826
827							# update our counters
828	17					78	foreach (keys %$hash) {
829	286	50				517	next unless $hash->{$_};
830	286					522	my $key = $self->_round($_);
831
832							# Insert data. Make sure -Inf doesn't corrupt our counter.
833	286		100			1016	my $newcount = ($self->{data}{$key} \|\| 0) + $hash->{$_};
834	286	100				513	if ($newcount > 0) {
835							# normal insert
836	274					663	$self->{data}{$key} = $newcount;
837	274					479	$self->{count} += $hash->{$_};
838							} else {
839							# We're "forgetting" data, AND the bin got empty
840	12		100			44	$self->{count} -= delete $self->{data}{$key} \|\| 0;
841							};
842							};
843	17					55	$self;
844							};
845
846							=head2 get_data_hash( %options )
847
848							Return distribution hashref {value => number of occurances}.
849
850							This is inverse of add_data_hash.
851
852							Options may include:
853
854							=over
855
856							=item * min - ignore values below this. (See find_boundaries)
857
858							=item * max - ignore values above this. (See find_boundaries)
859
860							=item * ltrim - ignore this % of values on lower end. (See find_boundaries)
861
862							=item * rtrim - ignore this % of values on upper end. (See find_boundaries)
863
864							=item * noise_thresh - strip bins with count below this.
865
866							=back
867
868							=cut
869
870							sub get_data_hash {
871	63			63	1	12106	my ($self, %opt) = @_;
872
873							# shallow copy of data if no options given
874	63	100				197	return {%{ $self->{data} }} unless %opt;
	61					739
875
876	2					8	my ($min, $max) = $self->find_boundaries( %opt );
877	2		100			10	my $noize = $opt{noize_thresh} \|\| 0;
878
879	2					5	my $data = $self->{data};
880	2					4	my %hash;
881	2					8	foreach (keys %$data ) {
882	13	100				38	$_ < $min and next;
883	8	50				24	$_ > $max and next;
884	8	100				18	$data->{$_} < $noize and next;
885	6					10	$hash{$_} = $data->{$_};
886							};
887
888	2					13	return \%hash;
889							};
890
891							=head2 TO_JSON()
892
893							Return enough data to recreate the whole object as an unblessed hashref.
894
895							This routine conforms with C, hence the name.
896							Can be called as
897
898							my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this );
899
900							B This module DOES NOT require JSON::XS or serialize to JSON.
901							It just deals with data.
902							Use C, C, C or any serializer of choice.
903
904							my $raw_data = $stat->TO_JSON;
905							Statistics::Descriptive::LogScale->new( %$raw_data );
906
907							Would generate an exact copy of C<$stat> object
908							(provided it's S::D::L and not a subclass).
909
910							=head2 clone( [ %options ] )
911
912							Copy constructor - returns copy of an existing object.
913							Cache is not preserved.
914
915							Constructor options may be given to override existing data. See new().
916
917							Trim options may be given to get partial data. See get_data_hash().
918
919							=cut
920
921							sub clone {
922	7			7	1	1099	my ($self, %opt) = @_;
923
924	7					19	my $raw = $self->TO_JSON;
925	7	100				22	if (%opt) {
926							$raw->{data} = $self->get_data_hash( %opt )
927	4	100				10	unless exists $opt{data};
928							exists $opt{$_} and $raw->{$_} = $opt{$_}
929	4		66			34	for @new_keys;
930							};
931
932	7					39	return (ref $self)->new( %$raw );
933							};
934
935							sub TO_JSON {
936	31			31	1	219	my $self = shift;
937							# UGLY HACK Increase linear_thresh by a factor of base ** 1/10
938							# so that it's rounded down to present value
939							return {
940							CLASS => ref $self,
941							VERSION => $VERSION,
942							base => $self->{base},
943							linear_width => $self->{linear_width},
944	31					148	linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10,
945							data => $self->get_data_hash,
946							};
947							};
948
949							=head2 scale_sample( $scale )
950
951							Multiply all bins' counts by given value. This can be used to adjust
952							significance of previous data before adding new data (e.g. gradually
953							"forgetting" past data in a long-running application).
954
955							=cut
956
957							sub scale_sample {
958	1			1	1	7	my $self = shift;
959	1					3	my $factor = shift;
960	1	50				5	$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive");
961
962	1					3	delete $self->{cache};
963	1					3	foreach (@{ $self->_sort }) {
	1					4
964	100					141	$self->{data}{$_} *= $factor;
965							};
966	1					2	$self->{count} *= $factor;
967	1					3	return $self;
968							};
969
970							=head2 mean_of( $code, [$min, $max] )
971
972							Return expectation of $code over sample within given range.
973
974							$code is expected to be a pure function (i.e. depending only on its input
975							value, and having no side effects).
976
977							The underlying integration mechanism only calculates $code once per bin,
978							so $code should be stable as in not vary wildly over small intervals.
979
980							=cut
981
982							sub mean_of {
983	24			24	1	80	my $self = shift;
984	24					67	my ($code, $min, $max) = @_;
985
986	24			429		119	my $weight = $self->sum_of( sub {1}, $min, $max );
	429					906
987	24	100				112	return unless $weight;
988	23					70	return $self->sum_of($code, $min, $max) / $weight;
989							};
990
991							=head1 Experimental methods
992
993							These methods may be subject to change in the future, or stay, if they
994							are good.
995
996							=head2 sum_of ( $code, [ $min, $max ] )
997
998							Integrate arbitrary function over the sample within the [ $min, $max ] interval.
999							Default values for both limits are infinities of appropriate sign.
1000
1001							Values in the edge bins are cut using interpolation if needed.
1002
1003							NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points
1004							between $a and $b.
1005
1006							EXPERIMENTAL. The method name may change in the future.
1007
1008							=cut
1009
1010							sub sum_of {
1011	147			147	1	945	my $self = shift;
1012	147					361	my ($code, $realmin, $realmax) = @_;
1013
1014							# Just app up stuff
1015	147	100	100			614	if (!defined $realmin and !defined $realmax) {
1016	124					223	my $sum = 0;
1017	124					186	while (my ($val, $count) = each %{ $self->{data} }) {
	63236					162918
1018	63112					100733	$sum += $count * $code->( $val );
1019							};
1020	124					679	return $sum;
1021							};
1022
1023	23	100				69	$realmin = -$INF unless defined $realmin;
1024	23	100				61	$realmax = $INF unless defined $realmax;
1025	23	50				70	return 0 if( $realmin >= $realmax );
1026
1027							# correct limits. $min, $max are indices; $left, $right are limits
1028	23					71	my $min = $self->_round($realmin);
1029	23					64	my $max = $self->_round($realmax);
1030	23					68	my $left = $self->_lower($realmin);
1031	23					72	my $right = $self->_upper($realmax);
1032
1033							# find first bin that's above $left
1034	23					66	my $keys = $self->_sort;
1035	23					79	my $i = _bin_search_ge($keys, $left);
1036
1037							# warn "sum_of [$min, $max]";
1038							# add up bins
1039	23					53	my $sum = 0;
1040	23					72	for (; $i < @$keys; $i++) {
1041	284					1000	my $val = $keys->[$i];
1042	284	100				684	last if $val > $right;
1043	264					653	$sum += $self->{data}{$val} * $code->( $val );
1044							};
1045
1046							# cut edges: the hard part
1047							# min and max are now used as indices
1048							# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-)
1049							# warn "Add up, sum_of = $sum";
1050	23	100				153	if ($self->{data}{$max}) {
1051	20					60	my $width = $self->_upper($max) - $self->_lower($max);
1052	20	100				65	my $part = $width
1053							? ($self->_upper($max) - $realmax) / $width
1054							: 0.5;
1055	20					106	$sum -= $self->{data}{$max} * $code->($max) * $part;
1056							};
1057							# warn "Cut R, sum_of = $sum";
1058	23	100				131	if ($self->{data}{$min}) {
1059	20					56	my $width = $self->_upper($min) - $self->_lower($min);
1060	20	100				66	my $part = $width
1061							? ($realmin - $self->_lower($min)) / $width
1062							: 0.5;
1063	20					81	$sum -= $self->{data}{$min} * $code->($min) * $part;
1064							};
1065							# warn "Cut L, sum_of = $sum";
1066
1067	23					174	return $sum;
1068							}; # end sum_of
1069
1070							=head2 histogram ( %options )
1071
1072							Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ]
1073							where countX_Y is number of data points between X and Y.
1074
1075							Options may include:
1076
1077							=over
1078
1079							=item * count (+) - number of intervals to divide sample into.
1080
1081							=item * index (+) - interval borders as array. Will be sorted before processing.
1082
1083							=item * min - ignore values below this. Default = $self->min - epsilon.
1084
1085							=item * max - ignore values above this. Default = $self->max + epsilon.
1086
1087							=item * ltrim - ignore this % of values on lower end.
1088
1089							=item * rtrim - ignore this % of values on upper end.
1090
1091							=item * normalize_to - adjust counts so that max number becomes nnn.
1092							This may be useful if one intends to draw pictures.
1093
1094							=back
1095
1096							Either count or index must be present.
1097
1098							NOTE: this is equivalent to frequency_distribution_ref but better suited
1099							for omitting sample tails and outputting pretty pictures.
1100
1101							=cut
1102
1103							sub histogram {
1104	5			5	1	3170	my ($self, %opt) = @_;
1105
1106	5	100				21	return unless $self->count;
1107	4					21	my ($min, $max) = $self->find_boundaries( %opt );
1108							# build/check index
1109	4	100				10	my @index = @{ $opt{index} \|\| [] };
	4					25
1110	4	100				13	if (!@index) {
1111	3					7	my $n = $opt{count};
1112	3	50				9	$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )");
1113	3					10	my $step = ($max - $min) / $n;
1114	3					10	for (my $x = $min; $x <= $max; $x += $step) {
1115	16					45	push @index, $x;
1116							};
1117							} else {
1118							# sort & uniq raw index
1119	1					4	my %known;
1120	1					5	@index = grep { !$known{$_}++ } @index;
	5					24
1121	1					7	@index = sort { $a <=> $b } @index;
	8					19
1122	1	50				7	@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)");
1123							};
1124
1125							# finally: estimated counts between indices!
1126	4					9	my @ret;
1127	4					15	for ( my $i = 0; $i<@index-1; $i++) {
1128	17					51	my $count = $self->_count( $index[$i], $index[$i+1] );
1129	17					84	push @ret, [ $count, $index[$i], $index[$i+1] ];
1130							};
1131
1132							# if normalize - find maximum & divide by it
1133	4	100				13	if (my $norm = $opt{normalize_to}) {
1134	1					3	my $max = 0;
1135							$max < $_->[0] and $max = $_->[0]
1136	1		66			8	for @ret;
1137	1					3	$norm /= $max;
1138	1					4	$_->[0] *= $norm for @ret;
1139							};
1140
1141	4					25	return \@ret;
1142							};
1143
1144							=head2 find_boundaries( %opt )
1145
1146							Return ($min, $max) of part of sample denoted by options.
1147
1148							Options may include:
1149
1150							=over
1151
1152							=item * min - ignore values below this. default = min() - epsilon.
1153
1154							=item * max - ignore values above this. default = max() + epsilon.
1155
1156							=item * ltrim - ignore this % of values on lower end.
1157
1158							=item * rtrim - ignore this % of values on upper end.
1159
1160							=back
1161
1162							If no options are given, the whole sample is guaranteed to reside between
1163							returned values.
1164
1165							=cut
1166
1167							sub find_boundaries {
1168	9			9	1	1492	my $self = shift;
1169	9					28	my %opt = @_;
1170
1171	9	100				26	return unless $self->count;
1172
1173							# preprocess boundaries
1174	8	100				35	my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min );
1175	8	100				35	my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max );
1176
1177	8	100				27	if ($opt{ltrim}) {
1178	1					5	my $newmin = $self->percentile( $opt{ltrim} );
1179	1	50	33			7	defined $newmin and $newmin > $min and $min = $newmin;
1180							};
1181	8	100				25	if ($opt{utrim}) {
1182	1					3	my $newmax = $self->percentile( 100-$opt{utrim} );
1183	1	50	33			5	defined $newmax and $newmax < $max and $max = $newmax;
1184							};
1185
1186	8					33	return ($min, $max);
1187							};
1188
1189							=head2 format( "printf-like expression", ... )
1190
1191							Returns a summary as requested by format string.
1192							Just as with printf and sprintf, a placeholder starts with a C<%>,
1193							followed by formatting options and a
1194
1195							The following placeholders are supported:
1196
1197							=over
1198
1199							=item * % - a literal %
1200
1201							=item * s, f, g - a normal printf acting on an extra argument.
1202							The number of extra arguments MUST match the number of such placeholders,
1203							or this function dies.
1204
1205							=item * n - count;
1206
1207							=item * m - min;
1208
1209							=item * M - max,
1210
1211							=item * a - mean,
1212
1213							=item * d - standard deviation,
1214
1215							=item * S - skewness,
1216
1217							=item * K - kurtosis,
1218
1219							=item * q(x) - x-th quantile (requires argument),
1220
1221							=item * p(x) - x-th percentile (requires argument),
1222
1223							=item * P(x) - cdf - the inferred cumulative distribution function (x)
1224							(requires argument),
1225
1226							=item * e(n) - central_moment - central moment of n-th power
1227							(requires argument),
1228
1229							=item * E(n) - std_moment - standard moment of n-th power (requires argument),
1230
1231							=back
1232
1233							For example,
1234
1235							$stat->format( "99%% results lie between %p(0.5) and %p(99.5)" );
1236
1237							Or
1238
1239							for( my $i = 0; $i < @stats; $i++ ) {
1240							print $stats[$i]->format( "%s-th average value is %a +- %d", $i );
1241							};
1242
1243							=cut
1244
1245							my %format = (
1246							# percent literal
1247							'%' => '%',
1248							# placeholders without parameters
1249							n => 'count',
1250							m => 'min',
1251							M => 'max',
1252							a => 'mean',
1253							d => 'std_dev',
1254							S => 'skewness',
1255							K => 'kurtosis',
1256							# placeholders with 1 parameter
1257							q => 'quantile?',
1258							p => 'percentile?',
1259							P => 'cdf?',
1260							e => 'central_moment?',
1261							E => 'std_moment?',
1262							);
1263
1264							my %printf = (
1265							s => 1,
1266							f => 1,
1267							g => 1,
1268							);
1269
1270							my $re_format = join "\|", keys %format, keys %printf;
1271							$re_format = qr((?:$re_format));
1272
1273							sub format {
1274	5			5	1	20	my ($self, $format, @extra) = @_;
1275
1276
1277							# FIXME this accepts %m(5), then dies - UGLY
1278							# TODO rewrite this as a giant sprintf... one day...
1279	5					115	$format =~ s <%([0-9.\-+ #]*)($re_format)(?:$($re_num)?$){0,1}>
	7					24
1280							< _format_dispatch($self, $2, $1, $3, \@extra) >ge;
1281	5	50				14
1282							croak __PACKAGE__.": Extra arguments in format()"
1283	5					31	if @extra;
1284							return $format;
1285							};
1286
1287	7			7		32	sub _format_dispatch {
1288							my ($obj, $method, $float, $arg, $extra) = @_;
1289
1290	7	100				29	# Handle % escapes
1291	1					4	if ($method !~ /^[a-zA-Z]/) {
1292							return $method;
1293							};
1294	6	100				49	# Handle printf built-in formats
1295	1	50				6	if (!$format{$method}) {
1296							croak __PACKAGE__.": Not enough arguments in format()"
1297	1					30	unless @$extra;
1298							return sprintf "%${float}${method}", shift @$extra;
1299							};
1300
1301	5					10	# Now we know it's LogScale's own method
1302	5	100				20	$method = $format{$method};
1303	2	50				7	if ($method =~ s/\?$//) {
1304							die "Missing argument in method $method" if !defined $arg;
1305	3	50				10	} else {
1306							die "Extra argument in method $method" if defined $arg;
1307	5					21	};
1308							my $result = $obj->$method($arg);
1309
1310	5	100	100			17	# work around S::D::Full's convention that "-inf == undef"
1311							$result = -999
1312	5					44	if ($method eq 'percentile' and !defined $result);
1313							return sprintf "%${float}f", $result;
1314							};
1315
1316							################################################################
1317							# No more public methods please
1318
1319							# MEMOIZE
1320							# We'll keep methods' returned values under {cache}.
1321							# All setters destroy said cache altogether.
1322							# PLEASE replace this with a ready-made module if there's one.
1323
1324							# Sorry for this black magic, but it's too hard to write //= in EVERY method
1325							# Long story short
1326							# The next sub replaces $self->foo() with
1327							# sub { $self->{cache}{foo} //= $self->originnal_foo }
1328							# All setter methods are EXPECTED to destroy {cache} altogether.
1329
1330							# NOTE if you plan subclassing the method, re-memoize methods you change.
1331	253			253		567	sub _memoize_method {
1332							my ($class, $name, $arg) = @_;
1333	253					960
1334	253	50				704	my $orig_code = $class->can($name);
1335							die "Error in memoizer section ($name)"
1336							unless ref $orig_code eq 'CODE';
1337
1338							# begin long conditional
1339							my $cached_code = !$arg
1340	208	100		208		7283	? sub {
1341	103					284	if (!exists $_[0]->{cache}{$name}) {
1342							$_[0]->{cache}{$name} = $orig_code->($_[0]);
1343	208					816	};
1344							return $_[0]->{cache}{$name};
1345							}
1346	171			171		2088	: sub {
1347	171					281	my $self = shift;
1348	171	100				412	my $arg = shift;
1349							$arg = '' unless defined $arg;
1350	171	100				526
1351	102					253	if (!exists $self->{cache}{"$name:$arg"}) {
1352							$self->{cache}{"$name:$arg"} = $orig_code->($self, $arg);
1353	170					773	};
1354	253	100				1041	return $self->{cache}{"$name:$arg"};
1355							};
1356							# conditional ends here
1357	23			23		222
	23					59
	23					847
1358	23			23		145	no strict 'refs'; ## no critic
	23					56
	23					2697
1359	253					450	no warnings 'redefine'; ## no critic
	253					1028
1360							*{$class."::".$name} = $cached_code;
1361							}; # end of _memoize_method
1362
1363							# Memoize all the methods w/o arguments
1364							foreach ( qw(sum sumsq mean min max mode) ) {
1365							__PACKAGE__->_memoize_method($_);
1366							};
1367
1368							# Memoize methods with 1 argument
1369							foreach (qw(quantile central_moment std_moment standard_deviation variance)) {
1370							__PACKAGE__->_memoize_method($_, 1);
1371							};
1372
1373							# add shorter alias of standard_deviation (this must happen AFTER memoization)
1374	23			23		170	{
	23					62
	23					8005
1375							no warnings 'once'; ## no critic
1376							*std_dev = \&standard_deviation;
1377							};
1378
1379							# Get number of values below $x
1380							# Like sum_of(sub{1}, undef, $x), but faster.
1381							# Used by cdf()
1382	105			105		192	sub _count {
1383	105	100				277	my $self = shift;
1384	74					120	@_>1 and return $self->_count($_[1]) - $self->_count($_[0]);
1385							my $x = shift;
1386	74					175
1387	74					171	my $upper = $self->_upper($x);
1388	74	100				189	my $i = _bin_search_gt( $self->_sort, $upper );
1389	65					144	!$i-- and return 0;
1390							my $count = $self->_probability->[$i];
1391
1392	65					140	# interpolate
1393	65	100				325	my $bin = $self->_round( $x );
1394	34					65	if (my $val = $self->{data}{$bin}) {
1395	34	100				138	my $width = ($upper - $bin) * 2;
1396	34					74	my $part = $width ? ( ($upper - $x) / $width) : 1/2;
1397							$count -= $part * $val;
1398	65					208	};
1399							return $count;
1400							};
1401
1402							# BINARY SEARCH
1403							# Not a method, just a function
1404							# Takes sorted \@array and a $num
1405							# Return lowest $i such that $array[$i] >= $num
1406							# Return (scalar @array) if no such $i exists
1407	147			147		1102	sub _bin_search_ge {
1408							my ($array, $x) = @_;
1409	147	100	100			692
1410	126					248	return 0 unless @$array and $array->[0] < $x;
1411	126					199	my $l = 0;
1412	126					315	my $r = @$array;
1413	405					805	while ($l+1 < $r) {
1414	405	100				1180	my $m = int( ($l + $r) /2);
1415							$array->[$m] < $x ? $l = $m : $r = $m;
1416	126					276	};
1417							return $l+1;
1418							};
1419	74			74		162	sub _bin_search_gt {
1420	74					148	my ($array, $x) = @_;
1421	74	100	100			273	my $i = _bin_search_ge(@_);
1422	74					126	$i++ if defined $array->[$i] and $array->[$i] == $x;
1423							return $i;
1424							};
1425
1426							# THE CORE
1427							# Here come the number=>bin functions
1428							# round() generates bin center
1429							# upper() and lower() are respective boundaries.
1430							# Here's the algorithm:
1431							# 1) determine whether bin is linear or logarithmic
1432							# 2) for linear bins, return bin# * bucket_width (==2*absolute error)
1433							# add/subtract 0.5 to get edges.
1434							# 3) for logarithmic bins, return base ** bin# with appropriate sign
1435							# multiply by precalculated constant to get edges
1436							# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2
1437							# This part is fragile and can be written better
1438
1439							# center of bin containing x
1440	90893			90893		127270	sub _round {
1441	90893					123324	my $self = shift;
1442							my $x = shift;
1443	23			23		197
	23					58
	23					3313
1444							use warnings FATAL => qw(numeric uninitialized);
1445	90893	100				182093
1446							if (abs($x) <= $self->{linear_thresh}) {
1447	1761		100			5531	return $self->{linear_width}
1448							&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 );
1449	89129					201463	};
1450	89129					163434	my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase});
1451	89129	100				344638	my $value = $self->{base} ** $i;
1452							return $x < 0 ? -$value : $value;
1453							};
1454
1455							# lower, upper limits of bin containing x
1456	883			883		55043	sub _lower {
1457	883					1363	my $self = shift;
1458							my $x = shift;
1459	23			23		170
	23					61
	23					7518
1460							use warnings FATAL => qw(numeric uninitialized);
1461	883	100				2795
1462							if (abs($x) <= $self->{linear_thresh}) {
1463	184		66			1449	return $self->{linear_width}
1464							&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5);
1465	699					2433	};
1466	699	100				1602	my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase});
1467	310					1325	if ($x > 0) {
1468							return $self->{floor} * $self->{base}**($i);
1469	389					1727	} else {
1470							return -$self->{floor} * $self->{base}**($i+1);
1471							};
1472							};
1473
1474	462			462		6588	sub _upper {
1475							return -$_[0]->_lower(-$_[1]);
1476							};
1477
1478							# build bin index
1479	191			191		324	sub _sort {
1480							my $self = shift;
1481	191		100			703	return $self->{cache}{sorted}
	1247					2033
	32					298
1482							\|\|= [ sort { $a <=> $b } keys %{ $self->{data} } ];
1483							};
1484
1485							# build cumulative bin counts index
1486	107			107		166	sub _probability {
1487	107		66			313	my $self = shift;
1488	20					36	return $self->{cache}{probability} \|\|= do {
1489	20					33	my @array;
1490	20					48	my $sum = 0;
	20					70
1491	165					247	foreach (@{ $self->_sort }) {
1492	165					339	$sum += $self->{data}{$_};
1493							push @array, $sum;
1494	20					123	};
1495							\@array;
1496							};
1497							};
1498
1499							=head1 AUTHOR
1500
1501							Konstantin S. Uvarin, C<< >>
1502
1503							=head1 BUGS
1504
1505							The module is currently under development. There may be bugs.
1506
1507							C only works for discrete distributions, and simply returns
1508							the first bin with largest bin count.
1509							A better algorithm is wanted.
1510
1511							C should have been made a private method.
1512							Its signature and/or name may change in the future.
1513
1514							See the TODO file in the distribution package.
1515
1516							Please feel free to post bugs and/or feature requests to github:
1517							L
1518
1519							Alternatively, you can use CPAN RT
1520							via e-mail C,
1521							or through the web interface at
1522							L.
1523
1524							Your contribution is appreciated.
1525
1526							=head1 SUPPORT
1527
1528							You can find documentation for this module with the perldoc command.
1529
1530							perldoc Statistics::Descriptive::LogScale
1531
1532							You can also look for information at:
1533
1534							=over 4
1535
1536							=item * GitHub:
1537
1538							L
1539
1540							=item * RT: CPAN's request tracker (report bugs here)
1541
1542							L
1543
1544							=item * AnnoCPAN: Annotated CPAN documentation
1545
1546							L
1547
1548							=item * CPAN Ratings
1549
1550							L
1551
1552							=item * Search CPAN
1553
1554							L
1555
1556							=back
1557
1558							=head1 ACKNOWLEDGEMENTS
1559
1560							This module was inspired by a talk that Andrew Aksyonoff, author of
1561							L,
1562							has given at HighLoad++ conference in Moscow, 2012.
1563
1564							L was and is used as reference when in doubt.
1565							Several code snippets were shamelessly stolen from there.
1566
1567							C and C parameter names were suggested by
1568							CountZero from http://perlmonks.org
1569
1570							=head1 LICENSE AND COPYRIGHT
1571
1572							Copyright 2013-2015 Konstantin S. Uvarin.
1573
1574							This program is free software; you can redistribute it and/or modify it
1575							under the terms of either: the GNU General Public License as published
1576							by the Free Software Foundation; or the Artistic License.
1577
1578							See http://dev.perl.org/licenses/ for more information.
1579
1580							=cut
1581
1582							1; # End of Statistics::Descriptive::LogScale