File Coverage

blib/lib/Statistics/Descriptive/LogScale.pm

Criterion	Covered	Total	%
statement	376	377	99.7
branch	142	168	84.5
condition	52	68	76.4
subroutine	58	58	100.0
pod	29	29	100.0
total	657	700	93.8

line	stmt	bran	cond	sub	pod	time	code
1	25			25		1456323	use 5.006;
	25					337
2	25			25		146	use strict;
	25					51
	25					592
3	25			25		122	use warnings;
	25					57
	25					1766
4
5							package Statistics::Descriptive::LogScale;
6
7							=head1 NAME
8
9							Statistics::Descriptive::LogScale - Memory-efficient approximate univariate
10							descriptive statistics class.
11
12							=head1 VERSION
13
14							Version 0.10
15
16							=cut
17
18							our $VERSION = 0.11;
19
20							=head1 SYNOPSIS
21
22							=head2 Basic usage
23
24							The basic usage is roughly the same as that of L.
25
26							use Statistics::Descriptive::LogScale;
27							my $stat = Statistics::Descriptive::LogScale->new ();
28
29							while(<>) {
30							chomp;
31							$stat->add_data($_);
32							};
33
34							# This can also be done in O(1) memory, precisely
35							printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation;
36							# This requires storing actual data, or approximating
37							printf "25%% : %f\n", $stat->percentile(25);
38							printf "Median: %f\n", $stat->median;
39							printf "75%% : %f\n", $stat->percentile(75);
40
41							=head2 Save/load
42
43							This is not present in L.
44							The save/load interface is designed compatible with JSON::XS.
45							However, any other serializer can be used.
46							The C method is I to return unblessed hashref
47							with enough information to restore the original object.
48
49							use Statistics::Descriptive::LogScale;
50							my $stat = Statistics::Descriptive::LogScale->new ();
51
52							# ..... much later
53							# Save
54							print $fd encoder_of_choice( $stat->TO_JSON )
55							or die "Failed to save: $!";
56
57							# ..... and even later
58							# Load
59							my $plain_hash = decoder_of_choice( $raw_data );
60							my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash );
61
62							# Import into existing LogScale instance
63							my $plain_hash = decoder_of_choice( $more_raw_data );
64							$copy_of_stat->add_data_hash( $plain_hash->{data} );
65
66							=head2 Histograms
67
68							Both L and L
69							offer C method for querying data point counts.
70							However, there's also C method for making pretty pictures.
71							Here's a simple text-based histogram.
72							A proper GD example was too long to fit into this margin.
73
74							use strict;
75							use warnings;
76
77							use Statistics::Descriptive::LogScale;
78							my $stat = Statistics::Descriptive::LogScale->new ();
79
80							# collect/load data ...
81							my $re_float = qr([-+]?(?:\d+\.?\d*\|\.\d+)(?:[Ee][-+]?\d+)?);
82							while (<>) {
83							$stat->add_data($_) for m/($re_float)/g;
84							};
85							die "Empty set"
86							unless $stat->count;
87
88							# get data in [ count, lower_bound, upper_bound ] format as arrayref
89							my $hist = $stat->histogram( count => 20 );
90
91							# find maximum value to use as a scale factor
92							my $scale = $hist->[0][0];
93							$scale < $_->[0] and $scale = $_->[0] for @$hist;
94
95							foreach (@$hist) {
96							printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale);
97							};
98							printf "%10f\n", $hist->[-1][2];
99
100							=head1 DESCRIPTION
101
102							This module aims at providing some advanced statistical functions without
103							storing all data in memory, at the cost of certain (predictable) precision loss.
104
105							Data is represented by a set of bins that only store counts of fitting
106							data points.
107							Most bins are logarithmic, i.e. lower end / upper end ratio is constant.
108							However, around zero linear approximation may be user instead
109							(see "linear_width" and "linear_thresh" parameters in new()).
110
111							All operations are then performed on the bins, introducing relative error
112							which does not, however, exceed the bins' relative width ("base").
113
114							=head1 METHODS
115
116							=cut
117
118							########################################################################
119							# == HOW IT WORKS ==
120							# Buckets are stored in a hash: { $value => $count, ... }
121							# {base} is bin width, {logbase} == log {base} (cache)
122							# {linear_thresh} is where we switch to equal bin approximation
123							# {linear_width} is width of bin around zero (==linear_thresh if not given)
124							# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor}
125							# Nearly all meaningful subs have to scan all the bins, which is bad,
126							# but anyway better than scanning full sample.
127
128	25			25		175	use Carp;
	25					57
	25					1538
129	25			25		8320	use POSIX qw(floor ceil);
	25					141281
	25					170
130
131							# Fields are NOT used internally for now, so this is just a declaration
132	25					112	use fields qw(
133							data count
134							linear_width base logbase floor linear_thresh only_linear logfloor
135							cache
136	25			25		42260	);
	25					36235
137
138							# Some internal constants
139							# This is for infinite portability^W^W portable infinity
140							my $INF = 999;
141							my $re_num = qr/(?:[-+]?(?:\d+\.?\d*\|\.\d+)(?:[Ee][-+]?\d+)?)/;
142
143							=head2 new( %options )
144
145							%options may include:
146
147							=over
148
149							=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives
150							1% precision and exact decimal powers.
151							This value represents acceptable relative error in analysis results.
152
153							B Actual value may be slightly less than requested one.
154							This is done so to avoid troubles with future rounding in (de)serialization.
155
156							=item * linear_width - width of linear bins around zero.
157							This value represents precision of incoming data.
158							Default is zero, i.e. we assume that the measurement is precise.
159
160							B Actual value may be less (by no more than a factor of C)
161							so that borders of linear and logarithmic bins fit nicely.
162
163							=item * linear_thresh - where to switch to linear approximation.
164							If only one of C and C is given,
165							the other will be calculated.
166							However, user may want to specify both in some cases.
167
168							B Actual value may be less (by no more than a factor of C)
169							so that borders of linear and logarithmic bins fit nicely.
170
171							=item * only_linear = 1 (B) -
172							throw away log approximation and become a discrete statistics
173							class with fixed precision.
174							C must be given in this case.
175
176							B This obviously kills memory efficiency, unless one knows beforehand
177							that all values come from a finite pool.
178
179							=item * data - hashref with C<{ value => weight }> for initializing data.
180							Used for cloning.
181							See C.
182
183							=item * zero_thresh - absolute value threshold below which everything is
184							considered zero.
185							DEPRECATED, C and C override this if given.
186
187							=back
188
189							=cut
190
191							my @new_keys = qw( base linear_width linear_thresh data zero_thresh only_linear );
192							# TODO Throw if extra options given?
193							# TODO Check args better
194							sub new {
195	46			46	1	23519	my $class = shift;
196	46					196	my %opt = @_;
197
198							# First, check for only_linear option
199	46	100				179	if ($opt{only_linear}) {
200							$opt{linear_width}
201	4	50				15	or croak "only_linear option given, but no linear width";
202	4					12	$opt{only_linear} = $opt{linear_width};
203	4					24	delete $opt{$_} for qw(linear_width base linear_thresh zero_thresh);
204							};
205
206							# base for logarithmic bins, sane default: +-1%, exact decimal powers
207							# UGLY HACK number->string->number to avoid
208							# future serialization inconsistencies
209	46		100			236	$opt{base} \|\|= 10**(1/232);
210	46					392	$opt{base} = 0 + "$opt{base}";
211	46	50				184	$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1";
212
213							# calculate where to switch to linear approximation
214							# the condition is: linear bin( thresh ) ~~ log bin( thresh )
215							# i.e. thresh * base - thresh ~~ absolute error * 2
216							# i.e. thresh ~~ absolute_error * 2 / (base - 1)
217							# also support legacy API (zero_thresh)
218	46	100				164	if (defined $opt{linear_thresh} ) {
219	13		100			45	$opt{linear_width} \|\|= $opt{linear_thresh} * ($opt{base}-1);
220							} else {
221	33					97	$opt{linear_thresh} = $opt{zero_thresh};
222							};
223							$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1)
224	46	100	100			212	if $opt{linear_width} and !$opt{linear_thresh};
225	46		100			244	$opt{linear_thresh} \|\|= 0;
226	46	50				141	$opt{linear_thresh} >= 0
227							or croak __PACKAGE__.": new(): linear_thresh must be >= 0";
228
229							# Can't use fields::new anymore
230							# due to JSON::XS incompatibility with restricted hashes
231	46					130	my $self = bless {}, $class;
232
233	46					191	$self->{base} = $opt{base};
234							# cache values to ease calculations
235							# floor = (lower end of bin) / (center of bin)
236	46					152	$self->{floor} = 2/(1+$opt{base});
237	46					177	$self->{logbase} = log $opt{base};
238	46					125	$self->{logfloor} = log $self->{floor};
239
240							# bootstrap linear_thresh - make it fit bin edge
241	46					116	$self->{linear_width} = $self->{linear_thresh} = 0;
242	46					187	$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} );
243
244							# divide anything below linear_thresh into odd number of bins
245							# not exceeding requested linear_width
246	46	100				184	if ($self->{linear_thresh}) {
247	15		66			53	my $linear_width = $opt{linear_width} \|\| 2 * $self->{linear_thresh};
248	15					57	my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width));
249	15	100				48	$n_linear++ unless $n_linear % 2;
250	15					36	$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear);
251							};
252
253	46	100				153	if ($opt{only_linear}) {
254	4					10	$self->{linear_width} = $opt{only_linear};
255	4					7	$self->{linear_thresh} = $INF;
256	4					7	$self->{only_linear} = 1;
257							};
258
259	46					178	$self->clear;
260	46	100				130	if ($opt{data}) {
261	11					35	$self->add_data_hash($opt{data});
262							};
263	46					212	return $self;
264							};
265
266							=head1 General statistical methods
267
268							These methods are used to query the distribution properties. They generally
269							follow the interface of L and co,
270							with minor additions.
271
272							All methods return C on empty data set, except for
273							C, C, C, C and C
274							which all return 0.
275
276							B This module caches whatever it calculates very agressively.
277							Don't hesitate to use statistical functions (except for sum_of/mean_of)
278							more than once. The cache is deleted upon data entry.
279
280							=head2 clear
281
282							Destroy all stored data.
283
284							=cut
285
286							sub clear {
287	60			60	1	11725	my $self = shift;
288	60					2405	$self->{data} = {};
289	60					181	$self->{count} = 0;
290	60					134	delete $self->{cache};
291	60					126	return $self;
292							};
293
294							=head2 add_data( @data )
295
296							Add numbers to the data pool.
297
298							Returns self, so that methods can be chained.
299
300							If incorrect data is given (i.e. non-numeric, undef),
301							an exception is thrown and only partial data gets inserted.
302							The state of object is guaranteed to remain consistent in such case.
303
304							B Cache is reset, even if no data was actually inserted.
305
306							B It is possible to add infinite values to data pool.
307							The module will try and calculate whatever can still be calculated.
308							However, no portable way of serializing such values is done yet.
309
310							=cut
311
312							sub add_data {
313	51217			51217	1	150318	my $self = shift;
314
315	51217					64994	delete $self->{cache};
316	51217					73172	foreach (@_) {
317	90167					143648	$self->{data}{ $self->_round($_) }++;
318	90164					156371	$self->{count}++;
319							};
320	51214					85781	$self;
321							};
322
323							=head2 count
324
325							Returns number of data points.
326
327							=cut
328
329							sub count {
330	69			69	1	9123	my $self = shift;
331	69					291	return $self->{count};
332							};
333
334							=head2 min
335
336							=head2 max
337
338							Values of minimal and maximal bins.
339
340							NOTE: Due to rounding, some of the actual inserted values may fall outside
341							of the min..max range. This may change in the future.
342
343							=cut
344
345							sub min {
346							my $self = shift;
347							return $self->_sort->[0];
348							};
349
350							sub max {
351							my $self = shift;
352							return $self->_sort->[-1];
353							};
354
355							=head2 sample_range
356
357							Return sample range of the dataset, i.e. max() - min().
358
359							=cut
360
361							sub sample_range {
362	5			5	1	934	my $self = shift;
363	5	100				29	return $self->count ? $self->max - $self->min : undef;
364							};
365
366							=head2 sum
367
368							Return sum of all data points.
369
370							=cut
371
372							sub sum {
373							my $self = shift;
374							return $self->sum_of(sub { $_[0] });
375							};
376
377							=head2 sumsq
378
379							Return sum of squares of all datapoints.
380
381							=cut
382
383							sub sumsq {
384							my $self = shift;
385							return $self->sum_of(sub { $_[0] * $_[0] });
386							};
387
388							=head2 mean
389
390							Return mean, or average value, i.e. sum()/count().
391
392							=cut
393
394							sub mean {
395							my $self = shift;
396							return $self->{count} ? $self->sum / $self->{count} : undef;
397							};
398
399							=head2 variance
400
401							=head2 variance( $correction )
402
403							Return data variance, i.e. E((x - E(x)) ** 2).
404
405							Bessel's correction (division by n-1 instead of n) is used by default.
406							This may be changed by specifying $correction explicitly.
407
408							B The binning strategy used here should also introduce variance bias.
409							This is not yet accounted for.
410
411							=cut
412
413							sub variance {
414							my $self = shift;
415							my $correction = shift;
416
417							# in fact we'll receive correction='' because of how cache works
418							$correction = 1 unless defined $correction and length $correction;
419
420							return 0 if ($self->{count} < 1 + $correction);
421
422							my $var = $self->sumsq - $self->sum**2 / $self->{count};
423							return $var <= 0 ? 0 : $var / ( $self->{count} - $correction );
424							};
425
426							=head2 standard_deviation
427
428							=head2 standard_deviation( $correction )
429
430							=head2 std_dev
431
432							=head2 stdev
433
434							Return standard deviation, i.e. square root of variance.
435
436							Bessel's correction (division by n-1 instead of n) is used by default.
437							This may be changed by specifying $correction explicitly.
438
439							B The binning strategy used here should also introduce variance bias.
440							This is not yet accounted for.
441
442							=cut
443
444							sub standard_deviation {
445							my $self = shift;
446
447							return sqrt($self->variance(@_));
448							};
449
450							=head2 cdf ($x)
451
452							Cumulative distribution function. Returns estimated probability of
453							random data point from the sample being less than C<$x>.
454
455							As a special case, C accounts for I of zeroth bin count (if any).
456
457							Not present in Statistics::Descriptive::Full, but appears in
458							L.
459
460							=head2 cdf ($x, $y)
461
462							Returns probability of a value being between C<$x> and C<$y> ($x <= $y).
463							This is essentially C.
464
465							=cut
466
467							sub cdf {
468	4			4	1	8	my $self = shift;
469	4	50				12	return unless $self->{count};
470	4					9	return $self->_count(@_) / $self->{count};
471							};
472
473							=head2 percentile( $n )
474
475							Find $n-th percentile, i.e. a value below which lies $n % of the data.
476
477							0-th percentile is by definition -inf and is returned as undef
478							(see Statistics::Descriptive).
479
480							$n is a real number, not necessarily integer.
481
482							=cut
483
484							sub percentile {
485	48			48	1	4949	my $self = shift;
486	48					99	my $x = shift;
487
488							# assert 0<=$x<=100
489	48	50	33			270	croak __PACKAGE__.": percentile() argument must be between 0 and 100"
490							unless 0<= $x and $x <= 100;
491
492	48					158	my $need = $x * $self->{count} / 100;
493	48	100				191	return if $need < 1;
494
495							# dichotomize
496							# $i is lowest value >= needed
497							# $need doesnt exceed last bin!
498	42					154	my $i = _bin_search_ge( $self->_probability, $need );
499	42					123	return $self->_sort->[ $i ];
500							};
501
502							=head2 quantile( 0..4 )
503
504							From Statistics::Descriptive manual:
505
506							0 => zero quartile (Q0) : minimal value
507							1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
508							2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
509							3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
510							4 => fourth quartile (Q4) : maximal value
511
512							=cut
513
514							sub quantile {
515							my $self = shift;
516							my $t = shift;
517
518							croak (__PACKAGE__.": quantile() argument must be one of 0..4")
519							unless $t =~ /^[0-4]$/;
520
521							$t or return $self->min;
522							return $self->percentile($t * 100 / 4);
523							};
524
525							=head2 median
526
527							Return median of data, a value that divides the sample in half.
528							Same as percentile(50).
529
530							=cut
531
532							sub median {
533	16			16	1	1389	my $self = shift;
534	16					40	return $self->percentile(50);
535							};
536
537							=head2 trimmed_mean( $ltrim, [ $utrim ] )
538
539							Return mean of sample with $ltrim and $utrim fraction of data points
540							remover from lower and upper ends respectively.
541
542							ltrim defaults to 0, and rtrim to ltrim.
543
544							=cut
545
546							sub trimmed_mean {
547	1			1	1	2	my $self = shift;
548	1					4	my ($lower, $upper) = @_;
549	1		50			3	$lower \|\|= 0;
550	1	50				4	$upper = $lower unless defined $upper;
551
552	1					6	my $min = $self->percentile($lower * 100);
553	1					5	my $max = $self->percentile(100 - $upper * 100);
554
555	1	50				4	return unless $min < $max;
556
557	1			5		9	return $self->mean_of(sub{$_[0]}, $min, $max);
	5					8
558							};
559
560							=head2 harmonic_mean
561
562							Return harmonic mean of the data, i.e. 1/E(1/x).
563
564							Return undef if division by zero occurs (see Statistics::Descriptive).
565
566							=cut
567
568							sub harmonic_mean {
569	2			2	1	988	my $self = shift;
570
571	2					4	my $ret;
572	2					4	eval {
573	2			5		6	$ret = $self->count / $self->sum_of(sub { 1/$_[0] });
	5					10
574							};
575	2	50	66			33	if ($@ and $@ !~ /division.*zero/) {
576	0					0	die $@; # rethrow ALL BUT 1/0 which yields undef
577							};
578	2					13	return $ret;
579							};
580
581							=head2 geometric_mean
582
583							Return geometric mean of the data, that is, exp(E(log x)).
584
585							Dies unless all data points are of the same sign.
586
587							=cut
588
589							sub geometric_mean {
590	2			2	1	1050	my $self = shift;
591
592	2	100				8	return unless $self->count;
593	1	50				4	croak __PACKAGE__.": geometric_mean() called on mixed sign sample"
594							if $self->min * $self->max < 0;
595
596	1	50				4	return 0 if $self->{data}{0};
597							# this must be dog slow, but we already log() too much at this point.
598	1			5		6	my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} );
	5					13
599	1	50				5	return $self->min < 0 ? -$ret : $ret;
600							};
601
602							=head2 skewness
603
604							Return skewness of the distribution, calculated as
605							n/(n-1)(n-2) * E((x-E(x))3)/std_dev3 (this is consistent with Excel).
606
607							=cut
608
609							sub skewness {
610	20			20	1	2017	my $self = shift;
611
612	20					38	my $n = $self->{count};
613	20	100				63	return unless $n > 2;
614
615							# code stolen from Statistics::Descriptive
616	18					57	my $skew = $n * $self->std_moment(3);
617	18					44	my $correction = $n / ( ($n-1) * ($n-2) );
618	18					96	return $correction * $skew;
619							};
620
621							=head2 kurtosis
622
623							Return kurtosis of the distribution, that is 4-th standardized moment - 3.
624							The exact formula used here is consistent with that of Excel and
625							Statistics::Descriptive.
626
627							=cut
628
629							sub kurtosis {
630	22			22	1	1036	my $self = shift;
631
632	22					47	my $n = $self->{count};
633	22	100				67	return unless $n > 3;
634
635							# code stolen from Statistics::Descriptive
636	20					55	my $kurt = $n * $self->std_moment(4);
637	20					74	my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
638	20					56	my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
639
640	20					99	return $correction1 * $kurt - $correction2;
641							};
642
643							=head2 central_moment( $n )
644
645							Return $n-th central moment, that is, E((x - E(x))^$n).
646
647							Not present in Statistics::Descriptive::Full.
648
649							=cut
650
651							sub central_moment {
652							my $self = shift;
653							my $n = shift;
654
655							my $mean = $self->mean;
656							return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count};
657							};
658
659							=head2 std_moment( $n )
660
661							Return $n-th standardized moment, that is,
662							E((x - E(x))$n) / std_dev(x)$n.
663
664							Not present in Statistics::Descriptive::Full.
665
666							=cut
667
668							sub std_moment {
669							my $self = shift;
670							my $n = shift;
671
672							my $mean = $self->mean;
673							my $dev = $self->std_dev;
674							return $self->sum_of(sub{ ($_[0] - $mean) ** $n })
675							/ ( $dev*$n $self->{count} );
676							};
677
678							=head2 abs_moment( $power, [$offset] )
679
680							Return $n-th moment of absolute value, that is, C.
681
682							Default value for offset if E(x).
683							Power may be fractional.
684
685							B Experimental. Not present in Statistics::Descriptive::Full.
686
687							=cut
688
689							sub abs_moment {
690							my ($self, $power, $offset) = @_;
691
692							$offset = $self->mean unless defined $offset;
693							return $self->sum_of(sub{ return abs($_[0] - $offset) ** $power })
694							/ $self->{count};
695							};
696
697							=head2 std_abs_moment( $power, [$offset] )
698
699							Returns standardized absolute moment - like above, but scaled
700							down by a factor of to standard deviation to n-th power.
701
702							That is, C
703
704							Default value for offset if E(x).
705							Power may be fractional.
706
707							B Experimental. Not present in Statistics::Descriptive::Full.
708
709							=cut
710
711							sub std_abs_moment {
712	14			14	1	51	my ($self, $power, $offset) = @_;
713
714	14					43	return $self->abs_moment($power, $offset)
715							/
716							($self->abs_moment(2, $offset) ** ($power/2));
717							};
718
719							=head2 mode
720
721							Mode of a distribution is the most common value for a discrete distribution,
722							or maximum of probability density for continuous one.
723
724							For now we assume that the distribution IS discrete, and return the bin with
725							the biggest hit count.
726
727							NOTE A better algorithm is still wanted. Experimental.
728							Behavior may change in the future.
729
730							=cut
731
732							# Naive implementation
733							# Find bin w/greatest count and return it
734							sub mode {
735							my $self = shift;
736
737							return if !$self->count;
738
739							my $index = $self->_sort;
740							return $index->[0] if @$index == 1;
741
742							my @count = map { $self->{data}{$_} } @$index;
743
744							my $max_index;
745							my $max_growth = 0;
746							for (my $i = 0; $i<@count; $i++) {
747							$count[$i] > $max_growth or next;
748							$max_index = $i;
749							$max_growth = $count[$i];
750							};
751
752							return $index->[$max_index];
753							};
754
755							=head2 frequency_distribution_ref( \@index )
756
757							=head2 frequency_distribution_ref( $n )
758
759							=head2 frequency_distribution_ref
760
761							Return numbers of data point counts below each number in @index as hashref.
762
763							If a number is given instead of arrayref, @index is created
764							by dividing [min, max] into $n intervals.
765
766							If no parameters are given, return previous result, if any.
767
768							=cut
769
770							sub frequency_distribution_ref {
771	3			3	1	989	my $self = shift;
772	3					6	my $index = shift;
773
774	3	100				9	return unless $self->count;
775							# ah, compatibility - return last value
776							return $self->{cache}{frequency_distribution_ref}
777	2	50				4	unless defined $index;
778							# make index if number given
779	2	100				5	if (!ref $index) {
780	1	50				3	croak __PACKAGE__.": frequency_distribution_ref(): ".
781							"argument must be array, of number > 2, not $index"
782							unless $index > 2;
783	1					4	my $min = $self->_lower($self->min);
784	1					7	my $max = $self->_upper($self->max);
785	1					3	my $step = ($max - $min) / $index;
786	1					4	$index = [ map { $min + $_ * $step } 1..$index ];
	5					9
787							};
788
789	2					10	@$index = (-$INF, sort { $a <=> $b } @$index);
	12					20
790
791	2					8	my @count;
792	2					7	for (my $i = 0; $i<@$index-1; $i++) {
793	9					22	push @count, $self->_count( $index->[$i], $index->[$i+1] );
794							};
795	2					4	shift @$index; # remove -inf
796
797	2					3	my %hash;
798	2					19	@hash{@$index} = @count;
799	2					6	$self->{cache}{frequency_distribution_ref} = \%hash;
800	2					11	return \%hash;
801							};
802
803							=head1 Specific methods
804
805							The folowing methods only apply to this module, or are experimental.
806
807							=cut
808
809							=head2 bucket_width
810
811							Get bin width (relative to center of bin). Percentiles are off
812							by no more than half of this. DEPRECATED.
813
814							=cut
815
816							sub bucket_width {
817	5			5	1	10	my $self = shift;
818	5					30	return $self->{base} - 1;
819							};
820
821							=head2 log_base
822
823							Get upper/lower bound ratio for logarithmic bins.
824							This represents relative precision of sample.
825
826							=head2 linear_width
827
828							Get width of linear buckets.
829							This represents absolute precision of sample.
830
831							=head2 linear_threshold
832
833							Get absolute value threshold below which interpolation is switched to linear.
834
835							=cut
836
837							sub log_base {
838	3			3	1	6	my $self = shift;
839	3					11	return $self->{base};
840							};
841
842							sub linear_width {
843	3			3	1	7	my $self = shift;
844	3					17	return $self->{linear_width};
845							};
846
847							sub linear_threshold {
848	8			8	1	27	my $self = shift;
849	8					38	return $self->{linear_thresh};
850							};
851
852							=head2 add_data_hash ( { value => weight, ... } )
853
854							Add values with counts/weights.
855							This can be used to import data from other
856							Statistics::Descriptive::LogScale object.
857
858							Returns self, so that methods can be chained.
859
860							Negative counts are allowed and treated as "forgetting" data.
861							If a bin count goes below zero, such bin is simply discarded.
862							Minus infinity weight is allowed and has the same effect.
863							Data is guaranteed to remain consistent.
864
865							If incorrect data is given (i.e. non-numeric, undef, or +infinity),
866							an exception is thrown and nothing changes.
867
868							B Cache may be reset, even if no data was actually inserted.
869
870							B It is possible to add infinite values to data pool.
871							The module will try and calculate whatever can still be calculated.
872							However, no portable way of serializing such values is done yet.
873
874							=cut
875
876							sub add_data_hash {
877	20			20	1	56	my $self = shift;
878	20					30	my $hash = shift;
879
880							# check incoming data for being numeric, and no +inf values
881	20					35	eval {
882	25			25		57971	use warnings FATAL => qw(numeric);
	25					84
	25					45637
883	20					100	while (my ($k, $v) = each %$hash) {
884	294	100	33			1437	$k == 0+$k and $v == 0+$v and $v < $INF
			66
885							or die "Infinite count for $k\n";
886							}
887							};
888	20	100				282	croak __PACKAGE__.": add_data_hash failed: $@"
889							if $@;
890
891	18					42	delete $self->{cache};
892
893							# update our counters
894	18					70	foreach (keys %$hash) {
895	291	50				453	next unless $hash->{$_};
896	291					454	my $key = $self->_round($_);
897
898							# Insert data. Make sure -Inf doesn't corrupt our counter.
899	291		100			950	my $newcount = ($self->{data}{$key} \|\| 0) + $hash->{$_};
900	291	100				444	if ($newcount > 0) {
901							# normal insert
902	279					650	$self->{data}{$key} = $newcount;
903	279					469	$self->{count} += $hash->{$_};
904							} else {
905							# We're "forgetting" data, AND the bin got empty
906	12		100			50	$self->{count} -= delete $self->{data}{$key} \|\| 0;
907							};
908							};
909	18					49	$self;
910							};
911
912							=head2 get_data_hash( %options )
913
914							Return distribution hashref {value => number of occurances}.
915
916							This is inverse of add_data_hash.
917
918							Options may include:
919
920							=over
921
922							=item * min - ignore values below this. (See find_boundaries)
923
924							=item * max - ignore values above this. (See find_boundaries)
925
926							=item * ltrim - ignore this % of values on lower end. (See find_boundaries)
927
928							=item * rtrim - ignore this % of values on upper end. (See find_boundaries)
929
930							=item * noise_thresh - strip bins with count below this.
931
932							=back
933
934							=cut
935
936							sub get_data_hash {
937	65			65	1	11750	my ($self, %opt) = @_;
938
939							# shallow copy of data if no options given
940	65	100				168	return {%{ $self->{data} }} unless %opt;
	63					682
941
942	2					8	my ($min, $max) = $self->find_boundaries( %opt );
943	2		100			6	my $noize = $opt{noize_thresh} \|\| 0;
944
945	2					4	my $data = $self->{data};
946	2					4	my %hash;
947	2					5	foreach (keys %$data ) {
948	13	100				30	$_ < $min and next;
949	8	50				12	$_ > $max and next;
950	8	100				16	$data->{$_} < $noize and next;
951	6					10	$hash{$_} = $data->{$_};
952							};
953
954	2					9	return \%hash;
955							};
956
957							=head2 TO_JSON()
958
959							Return enough data to recreate the whole object as an unblessed hashref.
960
961							This routine conforms with C, hence the name.
962							Can be called as
963
964							my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this );
965
966							B This module DOES NOT require JSON::XS or serialize to JSON.
967							It just deals with data.
968							Use C, C, C or any serializer of choice.
969
970							my $raw_data = $stat->TO_JSON;
971							Statistics::Descriptive::LogScale->new( %$raw_data );
972
973							Would generate an exact copy of C<$stat> object
974							(provided it's S::D::L and not a subclass).
975
976							=head2 clone( [ %options ] )
977
978							Copy constructor - returns copy of an existing object.
979							Cache is not preserved.
980
981							Constructor options may be given to override existing data. See new().
982
983							Trim options may be given to get partial data. See get_data_hash().
984
985							=cut
986
987							sub clone {
988	8			8	1	981	my ($self, %opt) = @_;
989
990	8					26	my $raw = $self->TO_JSON;
991	8	100				28	if (%opt) {
992							$raw->{data} = $self->get_data_hash( %opt )
993	4	100				12	unless exists $opt{data};
994							exists $opt{$_} and $raw->{$_} = $opt{$_}
995	4		66			30	for @new_keys;
996							};
997
998	8					41	return (ref $self)->new( %$raw );
999							};
1000
1001							sub TO_JSON {
1002	32			32	1	173	my $self = shift;
1003							# UGLY HACK Increase linear_thresh by a factor of base ** 1/10
1004							# so that it's rounded down to present value
1005							return {
1006							CLASS => ref $self,
1007							VERSION => $VERSION,
1008							base => $self->{base},
1009							linear_width => $self->{linear_width},
1010							linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10,
1011							only_linear => $self->{only_linear},
1012	32					132	data => $self->get_data_hash,
1013							};
1014							};
1015
1016							=head2 scale_sample( $scale )
1017
1018							Multiply all bins' counts by given value. This can be used to adjust
1019							significance of previous data before adding new data (e.g. gradually
1020							"forgetting" past data in a long-running application).
1021
1022							=cut
1023
1024							sub scale_sample {
1025	1			1	1	9	my $self = shift;
1026	1					3	my $factor = shift;
1027	1	50				7	$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive");
1028
1029	1					3	delete $self->{cache};
1030	1					3	foreach (@{ $self->_sort }) {
	1					5
1031	100					135	$self->{data}{$_} *= $factor;
1032							};
1033	1					2	$self->{count} *= $factor;
1034	1					3	return $self;
1035							};
1036
1037							=head2 mean_of( $code, [$min, $max] )
1038
1039							Return expectation of $code over sample within given range.
1040
1041							$code is expected to be a pure function (i.e. depending only on its input
1042							value, and having no side effects).
1043
1044							The underlying integration mechanism only calculates $code once per bin,
1045							so $code should be stable as in not vary wildly over small intervals.
1046
1047							=cut
1048
1049							sub mean_of {
1050	24			24	1	97	my $self = shift;
1051	24					82	my ($code, $min, $max) = @_;
1052
1053	24			429		131	my $weight = $self->sum_of( sub {1}, $min, $max );
	429					1091
1054	24	100				125	return unless $weight;
1055	23					106	return $self->sum_of($code, $min, $max) / $weight;
1056							};
1057
1058							=head1 Experimental methods
1059
1060							These methods may be subject to change in the future, or stay, if they
1061							are good.
1062
1063							=head2 sum_of ( $code, [ $min, $max ] )
1064
1065							Integrate arbitrary function over the sample within the [ $min, $max ] interval.
1066							Default values for both limits are infinities of appropriate sign.
1067
1068							Values in the edge bins are cut using interpolation if needed.
1069
1070							NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points
1071							between $a and $b.
1072
1073							EXPERIMENTAL. The method name may change in the future.
1074
1075							=cut
1076
1077							sub sum_of {
1078	189			189	1	1081	my $self = shift;
1079	189					487	my ($code, $realmin, $realmax) = @_;
1080
1081							# Just app up stuff
1082	189	100	100			853	if (!defined $realmin and !defined $realmax) {
1083	166					338	my $sum = 0;
1084	166					298	while (my ($val, $count) = each %{ $self->{data} }) {
	63889					149935
1085	63723					90845	$sum += $count * $code->( $val );
1086							};
1087	166					1035	return $sum;
1088							};
1089
1090	23	100				58	$realmin = -$INF unless defined $realmin;
1091	23	100				58	$realmax = $INF unless defined $realmax;
1092	23	50				70	return 0 if( $realmin >= $realmax );
1093
1094							# correct limits. $min, $max are indices; $left, $right are limits
1095	23					63	my $min = $self->_round($realmin);
1096	23					62	my $max = $self->_round($realmax);
1097	23					65	my $left = $self->_lower($realmin);
1098	23					67	my $right = $self->_upper($realmax);
1099
1100							# find first bin that's above $left
1101	23					60	my $keys = $self->_sort;
1102	23					72	my $i = _bin_search_ge($keys, $left);
1103
1104							# warn "sum_of [$min, $max]";
1105							# add up bins
1106	23					44	my $sum = 0;
1107	23					53	for (; $i < @$keys; $i++) {
1108	284					921	my $val = $keys->[$i];
1109	284	100				598	last if $val > $right;
1110	264					564	$sum += $self->{data}{$val} * $code->( $val );
1111							};
1112
1113							# cut edges: the hard part
1114							# min and max are now used as indices
1115							# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-)
1116							# warn "Add up, sum_of = $sum";
1117	23	100				132	if ($self->{data}{$max}) {
1118	20					53	my $width = $self->_upper($max) - $self->_lower($max);
1119	20	100				67	my $part = $width
1120							? ($self->_upper($max) - $realmax) / $width
1121							: 0.5;
1122	20					112	$sum -= $self->{data}{$max} * $code->($max) * $part;
1123							};
1124							# warn "Cut R, sum_of = $sum";
1125	23	100				120	if ($self->{data}{$min}) {
1126	20					50	my $width = $self->_upper($min) - $self->_lower($min);
1127	20	100				60	my $part = $width
1128							? ($realmin - $self->_lower($min)) / $width
1129							: 0.5;
1130	20					68	$sum -= $self->{data}{$min} * $code->($min) * $part;
1131							};
1132							# warn "Cut L, sum_of = $sum";
1133
1134	23					150	return $sum;
1135							}; # end sum_of
1136
1137							=head2 histogram ( %options )
1138
1139							Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ]
1140							where countX_Y is number of data points between X and Y.
1141
1142							Options may include:
1143
1144							=over
1145
1146							=item * count (+) - number of intervals to divide sample into.
1147
1148							=item * index (+) - interval borders as array. Will be sorted before processing.
1149
1150							=item * min - ignore values below this. Default = $self->min - epsilon.
1151
1152							=item * max - ignore values above this. Default = $self->max + epsilon.
1153
1154							=item * ltrim - ignore this % of values on lower end.
1155
1156							=item * rtrim - ignore this % of values on upper end.
1157
1158							=item * normalize_to - adjust counts so that max number becomes nnn.
1159							This may be useful if one intends to draw pictures.
1160
1161							=back
1162
1163							Either count or index must be present.
1164
1165							NOTE: this is equivalent to frequency_distribution_ref but better suited
1166							for omitting sample tails and outputting pretty pictures.
1167
1168							=cut
1169
1170							sub histogram {
1171	5			5	1	2338	my ($self, %opt) = @_;
1172
1173	5	100				15	return unless $self->count;
1174	4					12	my ($min, $max) = $self->find_boundaries( %opt );
1175							# build/check index
1176	4	100				7	my @index = @{ $opt{index} \|\| [] };
	4					19
1177	4	100				11	if (!@index) {
1178	3					4	my $n = $opt{count};
1179	3	50				7	$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )");
1180	3					7	my $step = ($max - $min) / $n;
1181	3					9	for (my $x = $min; $x <= $max; $x += $step) {
1182	16					31	push @index, $x;
1183							};
1184							} else {
1185							# sort & uniq raw index
1186	1					3	my %known;
1187	1					2	@index = grep { !$known{$_}++ } @index;
	5					14
1188	1					3	@index = sort { $a <=> $b } @index;
	8					12
1189	1	50				4	@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)");
1190							};
1191
1192							# finally: estimated counts between indices!
1193	4					7	my @ret;
1194	4					9	for ( my $i = 0; $i<@index-1; $i++) {
1195	17					37	my $count = $self->_count( $index[$i], $index[$i+1] );
1196	17					61	push @ret, [ $count, $index[$i], $index[$i+1] ];
1197							};
1198
1199							# if normalize - find maximum & divide by it
1200	4	100				11	if (my $norm = $opt{normalize_to}) {
1201	1					2	my $max = 0;
1202							$max < $_->[0] and $max = $_->[0]
1203	1		66			7	for @ret;
1204	1					2	$norm /= $max;
1205	1					4	$_->[0] *= $norm for @ret;
1206							};
1207
1208	4					17	return \@ret;
1209							};
1210
1211							=head2 find_boundaries( %opt )
1212
1213							Return ($min, $max) of part of sample denoted by options.
1214
1215							Options may include:
1216
1217							=over
1218
1219							=item * min - ignore values below this. default = min() - epsilon.
1220
1221							=item * max - ignore values above this. default = max() + epsilon.
1222
1223							=item * ltrim - ignore this % of values on lower end.
1224
1225							=item * rtrim - ignore this % of values on upper end.
1226
1227							=back
1228
1229							If no options are given, the whole sample is guaranteed to reside between
1230							returned values.
1231
1232							=cut
1233
1234							sub find_boundaries {
1235	9			9	1	1600	my $self = shift;
1236	9					22	my %opt = @_;
1237
1238	9	100				52	return unless $self->count;
1239
1240							# preprocess boundaries
1241	8	100				31	my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min );
1242	8	100				31	my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max );
1243
1244	8	100				22	if ($opt{ltrim}) {
1245	1					5	my $newmin = $self->percentile( $opt{ltrim} );
1246	1	50	33			7	defined $newmin and $newmin > $min and $min = $newmin;
1247							};
1248	8	100				19	if ($opt{utrim}) {
1249	1					4	my $newmax = $self->percentile( 100-$opt{utrim} );
1250	1	50	33			6	defined $newmax and $newmax < $max and $max = $newmax;
1251							};
1252
1253	8					25	return ($min, $max);
1254							};
1255
1256							=head2 format( "printf-like expression", ... )
1257
1258							Returns a summary as requested by format string.
1259							Just as with printf and sprintf, a placeholder starts with a C<%>,
1260							followed by formatting options and a
1261
1262							The following placeholders are supported:
1263
1264							=over
1265
1266							=item * % - a literal %
1267
1268							=item * s, f, g - a normal printf acting on an extra argument.
1269							The number of extra arguments MUST match the number of such placeholders,
1270							or this function dies.
1271
1272							=item * n - count;
1273
1274							=item * m - min;
1275
1276							=item * M - max,
1277
1278							=item * a - mean,
1279
1280							=item * d - standard deviation,
1281
1282							=item * S - skewness,
1283
1284							=item * K - kurtosis,
1285
1286							=item * q(x) - x-th quantile (requires argument),
1287
1288							=item * p(x) - x-th percentile (requires argument),
1289
1290							=item * P(x) - cdf - the inferred cumulative distribution function (x)
1291							(requires argument),
1292
1293							=item * e(n) - central_moment - central moment of n-th power
1294							(requires argument),
1295
1296							=item * E(n) - std_moment - standard moment of n-th power (requires argument),
1297
1298							=item * A(n) - abs_moment - absolute moment of n-th power (requires argument).
1299
1300							=back
1301
1302							For example,
1303
1304							$stat->format( "99%% results lie between %p(0.5) and %p(99.5)" );
1305
1306							Or
1307
1308							for( my $i = 0; $i < @stats; $i++ ) {
1309							print $stats[$i]->format( "%s-th average value is %a +- %d", $i );
1310							};
1311
1312							=cut
1313
1314							my %format = (
1315							# percent literal
1316							'%' => '%',
1317							# placeholders without parameters
1318							n => 'count',
1319							m => 'min',
1320							M => 'max',
1321							a => 'mean',
1322							d => 'std_dev',
1323							S => 'skewness',
1324							K => 'kurtosis',
1325							# placeholders with 1 parameter
1326							q => 'quantile?',
1327							p => 'percentile?',
1328							P => 'cdf?',
1329							e => 'central_moment?',
1330							E => 'std_moment?',
1331							A => 'abs_moment?',
1332							);
1333
1334							my %printf = (
1335							s => 1,
1336							f => 1,
1337							g => 1,
1338							);
1339
1340							my $re_format = join "\|", keys %format, keys %printf;
1341							$re_format = qr((?:$re_format));
1342
1343							sub format {
1344	5			5	1	18	my ($self, $format, @extra) = @_;
1345
1346							# FIXME this accepts %m(5), then dies - UGLY
1347							# TODO rewrite this as a giant sprintf... one day...
1348	5					114	$format =~ s <%([0-9.\-+ #]*)($re_format)(?:$($re_num)?$){0,1}>
	7					21
1349							< _format_dispatch($self, $2, $1, $3, \@extra) >ge;
1350	5	50				17
1351							croak __PACKAGE__.": Extra arguments in format()"
1352	5					35	if @extra;
1353							return $format;
1354							};
1355
1356	7			7		30	sub _format_dispatch {
1357							my ($obj, $method, $float, $arg, $extra) = @_;
1358
1359	7	100				24	# Handle % escapes
1360	1					4	if ($method !~ /^[a-zA-Z]/) {
1361							return $method;
1362							};
1363	6	100				19	# Handle printf built-in formats
1364	1	50				3	if (!$format{$method}) {
1365							croak __PACKAGE__.": Not enough arguments in format()"
1366	1					9	unless @$extra;
1367							return sprintf "%${float}${method}", shift @$extra;
1368							};
1369
1370	5					10	# Now we know it's LogScale's own method
1371	5	100				18	$method = $format{$method};
1372	2	50				9	if ($method =~ s/\?$//) {
1373							die "Missing argument in method $method" if !defined $arg;
1374	3	50				7	} else {
1375							die "Extra argument in method $method" if defined $arg;
1376	5					21	};
1377							my $result = $obj->$method($arg);
1378
1379	5	100	100			21	# work around S::D::Full's convention that "-inf == undef"
1380							$result = -999
1381	5					42	if ($method eq 'percentile' and !defined $result);
1382							return sprintf "%${float}f", $result;
1383							};
1384
1385							################################################################
1386							# No more public methods please
1387
1388							# MEMOIZE
1389							# We'll keep methods' returned values under {cache}.
1390							# All setters destroy said cache altogether.
1391							# PLEASE replace this with a ready-made module if there's one.
1392
1393							# Sorry for this black magic, but it's too hard to write //= in EVERY method
1394							# Long story short
1395							# The next sub replaces $self->foo() with
1396							# sub { $self->{cache}{foo} //= $self->originnal_foo }
1397							# All setter methods are EXPECTED to destroy {cache} altogether.
1398
1399							# NOTE if you plan subclassing the method, re-memoize methods you change.
1400	300			300		636	sub _memoize_method {
1401							my ($class, $name, $arg) = @_;
1402	300					1083
1403	300	50				781	my $orig_code = $class->can($name);
1404							die "Error in memoizer section ($name)"
1405							unless ref $orig_code eq 'CODE';
1406
1407							# begin long conditional
1408							my $cached_code = !$arg
1409	253	100		253		8570	? sub {
1410	111					323	if (!exists $_[0]->{cache}{$name}) {
1411							$_[0]->{cache}{$name} = $orig_code->($_[0]);
1412	253					1142	};
1413							return $_[0]->{cache}{$name};
1414							}
1415	242			242		2660	: sub {
1416	242					420	my $self = shift;
1417	25			25		240	my $arg = do {
	25					63
	25					3424
1418	242					697	no warnings 'uninitialized'; ## no critic
1419							join ':', @_;
1420							};
1421	242	100				852
1422	143					411	if (!exists $self->{cache}{"$name:$arg"}) {
1423							$self->{cache}{"$name:$arg"} = $orig_code->($self, @_);
1424	241					1390	};
1425	300	100				1250	return $self->{cache}{"$name:$arg"};
1426							};
1427							# conditional ends here
1428	25			25		181
	25					58
	25					902
1429	25			25		179	no strict 'refs'; ## no critic
	25					69
	25					2587
1430	300					519	no warnings 'redefine'; ## no critic
	300					1137
1431							*{$class."::".$name} = $cached_code;
1432							}; # end of _memoize_method
1433
1434							# Memoize all the methods w/o arguments
1435							foreach ( qw(sum sumsq mean min max mode) ) {
1436							__PACKAGE__->_memoize_method($_);
1437							};
1438
1439							# Memoize methods with 1 argument
1440							foreach (qw(quantile central_moment std_moment abs_moment standard_deviation
1441							variance)) {
1442							__PACKAGE__->_memoize_method($_, 1);
1443							};
1444
1445							# add shorter alias of standard_deviation (this must happen AFTER memoization)
1446	25			25		177	{
	25					69
	25					8698
1447							no warnings 'once'; ## no critic
1448							*std_dev = \&standard_deviation;
1449							*stdev = \&standard_deviation;
1450							};
1451
1452							# Get number of values below $x
1453							# Like sum_of(sub{1}, undef, $x), but faster.
1454							# Used by cdf()
1455	105			105		144	sub _count {
1456	105	100				214	my $self = shift;
1457	74					95	@_>1 and return $self->_count($_[1]) - $self->_count($_[0]);
1458							my $x = shift;
1459	74					130
1460	74					137	my $upper = $self->_upper($x);
1461	74	100				148	my $i = _bin_search_gt( $self->_sort, $upper );
1462	65					110	!$i-- and return 0;
1463							my $count = $self->_probability->[$i];
1464
1465	65					115	# interpolate
1466	65	100				273	my $bin = $self->_round( $x );
1467	34					57	if (my $val = $self->{data}{$bin}) {
1468	34	100				71	my $width = ($upper - $bin) * 2;
1469	34					50	my $part = $width ? ( ($upper - $x) / $width) : 1/2;
1470							$count -= $part * $val;
1471	65					178	};
1472							return $count;
1473							};
1474
1475							# BINARY SEARCH
1476							# Not a method, just a function
1477							# Takes sorted \@array and a $num
1478							# Return lowest $i such that $array[$i] >= $num
1479							# Return (scalar @array) if no such $i exists
1480	147			147		865	sub _bin_search_ge {
1481							my ($array, $x) = @_;
1482	147	100	100			640
1483	126					226	return 0 unless @$array and $array->[0] < $x;
1484	126					200	my $l = 0;
1485	126					286	my $r = @$array;
1486	405					774	while ($l+1 < $r) {
1487	405	100				1045	my $m = int( ($l + $r) /2);
1488							$array->[$m] < $x ? $l = $m : $r = $m;
1489	126					280	};
1490							return $l+1;
1491							};
1492	74			74		130	sub _bin_search_gt {
1493	74					112	my ($array, $x) = @_;
1494	74	100	100			216	my $i = _bin_search_ge(@_);
1495	74					108	$i++ if defined $array->[$i] and $array->[$i] == $x;
1496							return $i;
1497							};
1498
1499							# THE CORE
1500							# Here come the number=>bin functions
1501							# round() generates bin center
1502							# upper() and lower() are respective boundaries.
1503							# Here's the algorithm:
1504							# 1) determine whether bin is linear or logarithmic
1505							# 2) for linear bins, return bin# * bucket_width (==2*absolute error)
1506							# add/subtract 0.5 to get edges.
1507							# 3) for logarithmic bins, return base ** bin# with appropriate sign
1508							# multiply by precalculated constant to get edges
1509							# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2
1510							# This part is fragile and can be written better
1511
1512							# center of bin containing x
1513	90924			90924		121697	sub _round {
1514	90924					114147	my $self = shift;
1515							my $x = shift;
1516	25			25		189
	25					73
	25					3763
1517							use warnings FATAL => qw(numeric uninitialized);
1518	90924	100				168333
1519							if (abs($x) <= $self->{linear_thresh}) {
1520	1792		100			5680	return $self->{linear_width}
1521							&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 );
1522	89129					178971	};
1523	89129					149352	my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase});
1524	89129	100				316496	my $value = $self->{base} ** $i;
1525							return $x < 0 ? -$value : $value;
1526							};
1527
1528							# lower, upper limits of bin containing x
1529	887			887		52542	sub _lower {
1530	887					1353	my $self = shift;
1531							my $x = shift;
1532	25			25		199
	25					65
	25					8028
1533							use warnings FATAL => qw(numeric uninitialized);
1534	887	100				2322
1535							if (abs($x) <= $self->{linear_thresh}) {
1536	188		66			1784	return $self->{linear_width}
1537							&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5);
1538	699					2178	};
1539	699	100				1490	my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase});
1540	310					1294	if ($x > 0) {
1541							return $self->{floor} * $self->{base}**($i);
1542	389					1624	} else {
1543							return -$self->{floor} * $self->{base}**($i+1);
1544							};
1545							};
1546
1547	462			462		5227	sub _upper {
1548							return -$_[0]->_lower(-$_[1]);
1549							};
1550
1551							# build bin index
1552	191			191		308	sub _sort {
1553							my $self = shift;
1554	191		100			758	return $self->{cache}{sorted}
	1234					2257
	32					292
1555							\|\|= [ sort { $a <=> $b } keys %{ $self->{data} } ];
1556							};
1557
1558							# build cumulative bin counts index
1559	107			107		170	sub _probability {
1560	107		66			330	my $self = shift;
1561	20					54	return $self->{cache}{probability} \|\|= do {
1562	20					41	my @array;
1563	20					43	my $sum = 0;
	20					89
1564	165					324	foreach (@{ $self->_sort }) {
1565	165					391	$sum += $self->{data}{$_};
1566							push @array, $sum;
1567	20					115	};
1568							\@array;
1569							};
1570							};
1571
1572							=head1 AUTHOR
1573
1574							Konstantin S. Uvarin, C<< >>
1575
1576							=head1 BUGS
1577
1578							The module is currently under development. There may be bugs.
1579
1580							C only works for discrete distributions, and simply returns
1581							the first bin with largest bin count.
1582							A better algorithm is wanted.
1583
1584							C should have been made a private method.
1585							Its signature and/or name may change in the future.
1586
1587							See the TODO file in the distribution package.
1588
1589							Please feel free to post bugs and/or feature requests to github:
1590							L
1591
1592							Alternatively, you can use CPAN RT
1593							via e-mail C,
1594							or through the web interface at
1595							L.
1596
1597							Your contribution is appreciated.
1598
1599							=head1 SUPPORT
1600
1601							You can find documentation for this module with the perldoc command.
1602
1603							perldoc Statistics::Descriptive::LogScale
1604
1605							You can also look for information at:
1606
1607							=over 4
1608
1609							=item * GitHub:
1610
1611							L
1612
1613							=item * RT: CPAN's request tracker (report bugs here)
1614
1615							L
1616
1617							=item * AnnoCPAN: Annotated CPAN documentation
1618
1619							L
1620
1621							=item * CPAN Ratings
1622
1623							L
1624
1625							=item * Search CPAN
1626
1627							L
1628
1629							=back
1630
1631							=head1 ACKNOWLEDGEMENTS
1632
1633							This module was inspired by a talk that Andrew Aksyonoff, author of
1634							L,
1635							has given at HighLoad++ conference in Moscow, 2012.
1636
1637							L was and is used as reference when in doubt.
1638							Several code snippets were shamelessly stolen from there.
1639
1640							C and C parameter names were suggested by
1641							CountZero from http://perlmonks.org
1642
1643							=head1 LICENSE AND COPYRIGHT
1644
1645							Copyright 2013-2015 Konstantin S. Uvarin.
1646
1647							This program is free software; you can redistribute it and/or modify it
1648							under the terms of either: the GNU General Public License as published
1649							by the Free Software Foundation; or the Artistic License.
1650
1651							See http://dev.perl.org/licenses/ for more information.
1652
1653							=cut
1654
1655							1; # End of Statistics::Descriptive::LogScale