File Coverage

blib/lib/Statistics/Descriptive/LogScale.pm
Criterion Covered Total %
statement 376 377 99.7
branch 142 168 84.5
condition 52 68 76.4
subroutine 58 58 100.0
pod 29 29 100.0
total 657 700 93.8


line stmt bran cond sub pod time code
1 25     25   1456323 use 5.006;
  25         337  
2 25     25   146 use strict;
  25         51  
  25         592  
3 25     25   122 use warnings;
  25         57  
  25         1766  
4              
5             package Statistics::Descriptive::LogScale;
6              
7             =head1 NAME
8              
9             Statistics::Descriptive::LogScale - Memory-efficient approximate univariate
10             descriptive statistics class.
11              
12             =head1 VERSION
13              
14             Version 0.10
15              
16             =cut
17              
18             our $VERSION = 0.11;
19              
20             =head1 SYNOPSIS
21              
22             =head2 Basic usage
23              
24             The basic usage is roughly the same as that of L.
25              
26             use Statistics::Descriptive::LogScale;
27             my $stat = Statistics::Descriptive::LogScale->new ();
28              
29             while(<>) {
30             chomp;
31             $stat->add_data($_);
32             };
33              
34             # This can also be done in O(1) memory, precisely
35             printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation;
36             # This requires storing actual data, or approximating
37             printf "25%% : %f\n", $stat->percentile(25);
38             printf "Median: %f\n", $stat->median;
39             printf "75%% : %f\n", $stat->percentile(75);
40              
41             =head2 Save/load
42              
43             This is not present in L.
44             The save/load interface is designed compatible with JSON::XS.
45             However, any other serializer can be used.
46             The C method is I to return unblessed hashref
47             with enough information to restore the original object.
48              
49             use Statistics::Descriptive::LogScale;
50             my $stat = Statistics::Descriptive::LogScale->new ();
51              
52             # ..... much later
53             # Save
54             print $fd encoder_of_choice( $stat->TO_JSON )
55             or die "Failed to save: $!";
56              
57             # ..... and even later
58             # Load
59             my $plain_hash = decoder_of_choice( $raw_data );
60             my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash );
61              
62             # Import into existing LogScale instance
63             my $plain_hash = decoder_of_choice( $more_raw_data );
64             $copy_of_stat->add_data_hash( $plain_hash->{data} );
65              
66             =head2 Histograms
67              
68             Both L and L
69             offer C method for querying data point counts.
70             However, there's also C method for making pretty pictures.
71             Here's a simple text-based histogram.
72             A proper GD example was too long to fit into this margin.
73              
74             use strict;
75             use warnings;
76              
77             use Statistics::Descriptive::LogScale;
78             my $stat = Statistics::Descriptive::LogScale->new ();
79              
80             # collect/load data ...
81             my $re_float = qr([-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?);
82             while (<>) {
83             $stat->add_data($_) for m/($re_float)/g;
84             };
85             die "Empty set"
86             unless $stat->count;
87              
88             # get data in [ count, lower_bound, upper_bound ] format as arrayref
89             my $hist = $stat->histogram( count => 20 );
90              
91             # find maximum value to use as a scale factor
92             my $scale = $hist->[0][0];
93             $scale < $_->[0] and $scale = $_->[0] for @$hist;
94              
95             foreach (@$hist) {
96             printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale);
97             };
98             printf "%10f\n", $hist->[-1][2];
99              
100             =head1 DESCRIPTION
101              
102             This module aims at providing some advanced statistical functions without
103             storing all data in memory, at the cost of certain (predictable) precision loss.
104              
105             Data is represented by a set of bins that only store counts of fitting
106             data points.
107             Most bins are logarithmic, i.e. lower end / upper end ratio is constant.
108             However, around zero linear approximation may be user instead
109             (see "linear_width" and "linear_thresh" parameters in new()).
110              
111             All operations are then performed on the bins, introducing relative error
112             which does not, however, exceed the bins' relative width ("base").
113              
114             =head1 METHODS
115              
116             =cut
117              
118             ########################################################################
119             # == HOW IT WORKS ==
120             # Buckets are stored in a hash: { $value => $count, ... }
121             # {base} is bin width, {logbase} == log {base} (cache)
122             # {linear_thresh} is where we switch to equal bin approximation
123             # {linear_width} is width of bin around zero (==linear_thresh if not given)
124             # {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor}
125             # Nearly all meaningful subs have to scan all the bins, which is bad,
126             # but anyway better than scanning full sample.
127              
128 25     25   175 use Carp;
  25         57  
  25         1538  
129 25     25   8320 use POSIX qw(floor ceil);
  25         141281  
  25         170  
130              
131             # Fields are NOT used internally for now, so this is just a declaration
132 25         112 use fields qw(
133             data count
134             linear_width base logbase floor linear_thresh only_linear logfloor
135             cache
136 25     25   42260 );
  25         36235  
137              
138             # Some internal constants
139             # This is for infinite portability^W^W portable infinity
140             my $INF = 9**9**9;
141             my $re_num = qr/(?:[-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?)/;
142              
143             =head2 new( %options )
144              
145             %options may include:
146              
147             =over
148              
149             =item * base - ratio of adjacent bins. Default is 10^(1/232), which gives
150             1% precision and exact decimal powers.
151             This value represents acceptable relative error in analysis results.
152              
153             B Actual value may be slightly less than requested one.
154             This is done so to avoid troubles with future rounding in (de)serialization.
155              
156             =item * linear_width - width of linear bins around zero.
157             This value represents precision of incoming data.
158             Default is zero, i.e. we assume that the measurement is precise.
159              
160             B Actual value may be less (by no more than a factor of C)
161             so that borders of linear and logarithmic bins fit nicely.
162              
163             =item * linear_thresh - where to switch to linear approximation.
164             If only one of C and C is given,
165             the other will be calculated.
166             However, user may want to specify both in some cases.
167              
168             B Actual value may be less (by no more than a factor of C)
169             so that borders of linear and logarithmic bins fit nicely.
170              
171             =item * only_linear = 1 (B) -
172             throw away log approximation and become a discrete statistics
173             class with fixed precision.
174             C must be given in this case.
175              
176             B This obviously kills memory efficiency, unless one knows beforehand
177             that all values come from a finite pool.
178              
179             =item * data - hashref with C<{ value => weight }> for initializing data.
180             Used for cloning.
181             See C.
182              
183             =item * zero_thresh - absolute value threshold below which everything is
184             considered zero.
185             DEPRECATED, C and C override this if given.
186              
187             =back
188              
189             =cut
190              
191             my @new_keys = qw( base linear_width linear_thresh data zero_thresh only_linear );
192             # TODO Throw if extra options given?
193             # TODO Check args better
194             sub new {
195 46     46 1 23519 my $class = shift;
196 46         196 my %opt = @_;
197              
198             # First, check for only_linear option
199 46 100       179 if ($opt{only_linear}) {
200             $opt{linear_width}
201 4 50       15 or croak "only_linear option given, but no linear width";
202 4         12 $opt{only_linear} = $opt{linear_width};
203 4         24 delete $opt{$_} for qw(linear_width base linear_thresh zero_thresh);
204             };
205              
206             # base for logarithmic bins, sane default: +-1%, exact decimal powers
207             # UGLY HACK number->string->number to avoid
208             # future serialization inconsistencies
209 46   100     236 $opt{base} ||= 10**(1/232);
210 46         392 $opt{base} = 0 + "$opt{base}";
211 46 50       184 $opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1";
212              
213             # calculate where to switch to linear approximation
214             # the condition is: linear bin( thresh ) ~~ log bin( thresh )
215             # i.e. thresh * base - thresh ~~ absolute error * 2
216             # i.e. thresh ~~ absolute_error * 2 / (base - 1)
217             # also support legacy API (zero_thresh)
218 46 100       164 if (defined $opt{linear_thresh} ) {
219 13   100     45 $opt{linear_width} ||= $opt{linear_thresh} * ($opt{base}-1);
220             } else {
221 33         97 $opt{linear_thresh} = $opt{zero_thresh};
222             };
223             $opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1)
224 46 100 100     212 if $opt{linear_width} and !$opt{linear_thresh};
225 46   100     244 $opt{linear_thresh} ||= 0;
226 46 50       141 $opt{linear_thresh} >= 0
227             or croak __PACKAGE__.": new(): linear_thresh must be >= 0";
228              
229             # Can't use fields::new anymore
230             # due to JSON::XS incompatibility with restricted hashes
231 46         130 my $self = bless {}, $class;
232              
233 46         191 $self->{base} = $opt{base};
234             # cache values to ease calculations
235             # floor = (lower end of bin) / (center of bin)
236 46         152 $self->{floor} = 2/(1+$opt{base});
237 46         177 $self->{logbase} = log $opt{base};
238 46         125 $self->{logfloor} = log $self->{floor};
239              
240             # bootstrap linear_thresh - make it fit bin edge
241 46         116 $self->{linear_width} = $self->{linear_thresh} = 0;
242 46         187 $self->{linear_thresh} = $self->_lower( $opt{linear_thresh} );
243              
244             # divide anything below linear_thresh into odd number of bins
245             # not exceeding requested linear_width
246 46 100       184 if ($self->{linear_thresh}) {
247 15   66     53 my $linear_width = $opt{linear_width} || 2 * $self->{linear_thresh};
248 15         57 my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width));
249 15 100       48 $n_linear++ unless $n_linear % 2;
250 15         36 $self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear);
251             };
252              
253 46 100       153 if ($opt{only_linear}) {
254 4         10 $self->{linear_width} = $opt{only_linear};
255 4         7 $self->{linear_thresh} = $INF;
256 4         7 $self->{only_linear} = 1;
257             };
258              
259 46         178 $self->clear;
260 46 100       130 if ($opt{data}) {
261 11         35 $self->add_data_hash($opt{data});
262             };
263 46         212 return $self;
264             };
265              
266             =head1 General statistical methods
267              
268             These methods are used to query the distribution properties. They generally
269             follow the interface of L and co,
270             with minor additions.
271              
272             All methods return C on empty data set, except for
273             C, C, C, C and C
274             which all return 0.
275              
276             B This module caches whatever it calculates very agressively.
277             Don't hesitate to use statistical functions (except for sum_of/mean_of)
278             more than once. The cache is deleted upon data entry.
279              
280             =head2 clear
281              
282             Destroy all stored data.
283              
284             =cut
285              
286             sub clear {
287 60     60 1 11725 my $self = shift;
288 60         2405 $self->{data} = {};
289 60         181 $self->{count} = 0;
290 60         134 delete $self->{cache};
291 60         126 return $self;
292             };
293              
294             =head2 add_data( @data )
295              
296             Add numbers to the data pool.
297              
298             Returns self, so that methods can be chained.
299              
300             If incorrect data is given (i.e. non-numeric, undef),
301             an exception is thrown and only partial data gets inserted.
302             The state of object is guaranteed to remain consistent in such case.
303              
304             B Cache is reset, even if no data was actually inserted.
305              
306             B It is possible to add infinite values to data pool.
307             The module will try and calculate whatever can still be calculated.
308             However, no portable way of serializing such values is done yet.
309              
310             =cut
311              
312             sub add_data {
313 51217     51217 1 150318 my $self = shift;
314              
315 51217         64994 delete $self->{cache};
316 51217         73172 foreach (@_) {
317 90167         143648 $self->{data}{ $self->_round($_) }++;
318 90164         156371 $self->{count}++;
319             };
320 51214         85781 $self;
321             };
322              
323             =head2 count
324              
325             Returns number of data points.
326              
327             =cut
328              
329             sub count {
330 69     69 1 9123 my $self = shift;
331 69         291 return $self->{count};
332             };
333              
334             =head2 min
335              
336             =head2 max
337              
338             Values of minimal and maximal bins.
339              
340             NOTE: Due to rounding, some of the actual inserted values may fall outside
341             of the min..max range. This may change in the future.
342              
343             =cut
344              
345             sub min {
346             my $self = shift;
347             return $self->_sort->[0];
348             };
349              
350             sub max {
351             my $self = shift;
352             return $self->_sort->[-1];
353             };
354              
355             =head2 sample_range
356              
357             Return sample range of the dataset, i.e. max() - min().
358              
359             =cut
360              
361             sub sample_range {
362 5     5 1 934 my $self = shift;
363 5 100       29 return $self->count ? $self->max - $self->min : undef;
364             };
365              
366             =head2 sum
367              
368             Return sum of all data points.
369              
370             =cut
371              
372             sub sum {
373             my $self = shift;
374             return $self->sum_of(sub { $_[0] });
375             };
376              
377             =head2 sumsq
378              
379             Return sum of squares of all datapoints.
380              
381             =cut
382              
383             sub sumsq {
384             my $self = shift;
385             return $self->sum_of(sub { $_[0] * $_[0] });
386             };
387              
388             =head2 mean
389              
390             Return mean, or average value, i.e. sum()/count().
391              
392             =cut
393              
394             sub mean {
395             my $self = shift;
396             return $self->{count} ? $self->sum / $self->{count} : undef;
397             };
398              
399             =head2 variance
400              
401             =head2 variance( $correction )
402              
403             Return data variance, i.e. E((x - E(x)) ** 2).
404              
405             Bessel's correction (division by n-1 instead of n) is used by default.
406             This may be changed by specifying $correction explicitly.
407              
408             B The binning strategy used here should also introduce variance bias.
409             This is not yet accounted for.
410              
411             =cut
412              
413             sub variance {
414             my $self = shift;
415             my $correction = shift;
416              
417             # in fact we'll receive correction='' because of how cache works
418             $correction = 1 unless defined $correction and length $correction;
419              
420             return 0 if ($self->{count} < 1 + $correction);
421              
422             my $var = $self->sumsq - $self->sum**2 / $self->{count};
423             return $var <= 0 ? 0 : $var / ( $self->{count} - $correction );
424             };
425              
426             =head2 standard_deviation
427              
428             =head2 standard_deviation( $correction )
429              
430             =head2 std_dev
431              
432             =head2 stdev
433              
434             Return standard deviation, i.e. square root of variance.
435              
436             Bessel's correction (division by n-1 instead of n) is used by default.
437             This may be changed by specifying $correction explicitly.
438              
439             B The binning strategy used here should also introduce variance bias.
440             This is not yet accounted for.
441              
442             =cut
443              
444             sub standard_deviation {
445             my $self = shift;
446              
447             return sqrt($self->variance(@_));
448             };
449              
450             =head2 cdf ($x)
451              
452             Cumulative distribution function. Returns estimated probability of
453             random data point from the sample being less than C<$x>.
454              
455             As a special case, C accounts for I of zeroth bin count (if any).
456              
457             Not present in Statistics::Descriptive::Full, but appears in
458             L.
459              
460             =head2 cdf ($x, $y)
461              
462             Returns probability of a value being between C<$x> and C<$y> ($x <= $y).
463             This is essentially C.
464              
465             =cut
466              
467             sub cdf {
468 4     4 1 8 my $self = shift;
469 4 50       12 return unless $self->{count};
470 4         9 return $self->_count(@_) / $self->{count};
471             };
472              
473             =head2 percentile( $n )
474              
475             Find $n-th percentile, i.e. a value below which lies $n % of the data.
476              
477             0-th percentile is by definition -inf and is returned as undef
478             (see Statistics::Descriptive).
479              
480             $n is a real number, not necessarily integer.
481              
482             =cut
483              
484             sub percentile {
485 48     48 1 4949 my $self = shift;
486 48         99 my $x = shift;
487              
488             # assert 0<=$x<=100
489 48 50 33     270 croak __PACKAGE__.": percentile() argument must be between 0 and 100"
490             unless 0<= $x and $x <= 100;
491              
492 48         158 my $need = $x * $self->{count} / 100;
493 48 100       191 return if $need < 1;
494              
495             # dichotomize
496             # $i is lowest value >= needed
497             # $need doesnt exceed last bin!
498 42         154 my $i = _bin_search_ge( $self->_probability, $need );
499 42         123 return $self->_sort->[ $i ];
500             };
501              
502             =head2 quantile( 0..4 )
503              
504             From Statistics::Descriptive manual:
505              
506             0 => zero quartile (Q0) : minimal value
507             1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
508             2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
509             3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
510             4 => fourth quartile (Q4) : maximal value
511              
512             =cut
513              
514             sub quantile {
515             my $self = shift;
516             my $t = shift;
517              
518             croak (__PACKAGE__.": quantile() argument must be one of 0..4")
519             unless $t =~ /^[0-4]$/;
520              
521             $t or return $self->min;
522             return $self->percentile($t * 100 / 4);
523             };
524              
525             =head2 median
526              
527             Return median of data, a value that divides the sample in half.
528             Same as percentile(50).
529              
530             =cut
531              
532             sub median {
533 16     16 1 1389 my $self = shift;
534 16         40 return $self->percentile(50);
535             };
536              
537             =head2 trimmed_mean( $ltrim, [ $utrim ] )
538              
539             Return mean of sample with $ltrim and $utrim fraction of data points
540             remover from lower and upper ends respectively.
541              
542             ltrim defaults to 0, and rtrim to ltrim.
543              
544             =cut
545              
546             sub trimmed_mean {
547 1     1 1 2 my $self = shift;
548 1         4 my ($lower, $upper) = @_;
549 1   50     3 $lower ||= 0;
550 1 50       4 $upper = $lower unless defined $upper;
551              
552 1         6 my $min = $self->percentile($lower * 100);
553 1         5 my $max = $self->percentile(100 - $upper * 100);
554              
555 1 50       4 return unless $min < $max;
556              
557 1     5   9 return $self->mean_of(sub{$_[0]}, $min, $max);
  5         8  
558             };
559              
560             =head2 harmonic_mean
561              
562             Return harmonic mean of the data, i.e. 1/E(1/x).
563              
564             Return undef if division by zero occurs (see Statistics::Descriptive).
565              
566             =cut
567              
568             sub harmonic_mean {
569 2     2 1 988 my $self = shift;
570              
571 2         4 my $ret;
572 2         4 eval {
573 2     5   6 $ret = $self->count / $self->sum_of(sub { 1/$_[0] });
  5         10  
574             };
575 2 50 66     33 if ($@ and $@ !~ /division.*zero/) {
576 0         0 die $@; # rethrow ALL BUT 1/0 which yields undef
577             };
578 2         13 return $ret;
579             };
580              
581             =head2 geometric_mean
582              
583             Return geometric mean of the data, that is, exp(E(log x)).
584              
585             Dies unless all data points are of the same sign.
586              
587             =cut
588              
589             sub geometric_mean {
590 2     2 1 1050 my $self = shift;
591              
592 2 100       8 return unless $self->count;
593 1 50       4 croak __PACKAGE__.": geometric_mean() called on mixed sign sample"
594             if $self->min * $self->max < 0;
595              
596 1 50       4 return 0 if $self->{data}{0};
597             # this must be dog slow, but we already log() too much at this point.
598 1     5   6 my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} );
  5         13  
599 1 50       5 return $self->min < 0 ? -$ret : $ret;
600             };
601              
602             =head2 skewness
603              
604             Return skewness of the distribution, calculated as
605             n/(n-1)(n-2) * E((x-E(x))**3)/std_dev**3 (this is consistent with Excel).
606              
607             =cut
608              
609             sub skewness {
610 20     20 1 2017 my $self = shift;
611              
612 20         38 my $n = $self->{count};
613 20 100       63 return unless $n > 2;
614              
615             # code stolen from Statistics::Descriptive
616 18         57 my $skew = $n * $self->std_moment(3);
617 18         44 my $correction = $n / ( ($n-1) * ($n-2) );
618 18         96 return $correction * $skew;
619             };
620              
621             =head2 kurtosis
622              
623             Return kurtosis of the distribution, that is 4-th standardized moment - 3.
624             The exact formula used here is consistent with that of Excel and
625             Statistics::Descriptive.
626              
627             =cut
628              
629             sub kurtosis {
630 22     22 1 1036 my $self = shift;
631              
632 22         47 my $n = $self->{count};
633 22 100       67 return unless $n > 3;
634              
635             # code stolen from Statistics::Descriptive
636 20         55 my $kurt = $n * $self->std_moment(4);
637 20         74 my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
638 20         56 my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
639              
640 20         99 return $correction1 * $kurt - $correction2;
641             };
642              
643             =head2 central_moment( $n )
644              
645             Return $n-th central moment, that is, E((x - E(x))^$n).
646              
647             Not present in Statistics::Descriptive::Full.
648              
649             =cut
650              
651             sub central_moment {
652             my $self = shift;
653             my $n = shift;
654              
655             my $mean = $self->mean;
656             return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count};
657             };
658              
659             =head2 std_moment( $n )
660              
661             Return $n-th standardized moment, that is,
662             E((x - E(x))**$n) / std_dev(x)**$n.
663              
664             Not present in Statistics::Descriptive::Full.
665              
666             =cut
667              
668             sub std_moment {
669             my $self = shift;
670             my $n = shift;
671              
672             my $mean = $self->mean;
673             my $dev = $self->std_dev;
674             return $self->sum_of(sub{ ($_[0] - $mean) ** $n })
675             / ( $dev**$n * $self->{count} );
676             };
677              
678             =head2 abs_moment( $power, [$offset] )
679              
680             Return $n-th moment of absolute value, that is, C.
681              
682             Default value for offset if E(x).
683             Power may be fractional.
684              
685             B Experimental. Not present in Statistics::Descriptive::Full.
686              
687             =cut
688              
689             sub abs_moment {
690             my ($self, $power, $offset) = @_;
691              
692             $offset = $self->mean unless defined $offset;
693             return $self->sum_of(sub{ return abs($_[0] - $offset) ** $power })
694             / $self->{count};
695             };
696              
697             =head2 std_abs_moment( $power, [$offset] )
698              
699             Returns standardized absolute moment - like above, but scaled
700             down by a factor of to standard deviation to n-th power.
701              
702             That is, C
703              
704             Default value for offset if E(x).
705             Power may be fractional.
706              
707             B Experimental. Not present in Statistics::Descriptive::Full.
708              
709             =cut
710              
711             sub std_abs_moment {
712 14     14 1 51 my ($self, $power, $offset) = @_;
713              
714 14         43 return $self->abs_moment($power, $offset)
715             /
716             ($self->abs_moment(2, $offset) ** ($power/2));
717             };
718              
719             =head2 mode
720              
721             Mode of a distribution is the most common value for a discrete distribution,
722             or maximum of probability density for continuous one.
723              
724             For now we assume that the distribution IS discrete, and return the bin with
725             the biggest hit count.
726              
727             NOTE A better algorithm is still wanted. Experimental.
728             Behavior may change in the future.
729              
730             =cut
731              
732             # Naive implementation
733             # Find bin w/greatest count and return it
734             sub mode {
735             my $self = shift;
736              
737             return if !$self->count;
738              
739             my $index = $self->_sort;
740             return $index->[0] if @$index == 1;
741              
742             my @count = map { $self->{data}{$_} } @$index;
743              
744             my $max_index;
745             my $max_growth = 0;
746             for (my $i = 0; $i<@count; $i++) {
747             $count[$i] > $max_growth or next;
748             $max_index = $i;
749             $max_growth = $count[$i];
750             };
751              
752             return $index->[$max_index];
753             };
754              
755             =head2 frequency_distribution_ref( \@index )
756              
757             =head2 frequency_distribution_ref( $n )
758              
759             =head2 frequency_distribution_ref
760              
761             Return numbers of data point counts below each number in @index as hashref.
762              
763             If a number is given instead of arrayref, @index is created
764             by dividing [min, max] into $n intervals.
765              
766             If no parameters are given, return previous result, if any.
767              
768             =cut
769              
770             sub frequency_distribution_ref {
771 3     3 1 989 my $self = shift;
772 3         6 my $index = shift;
773              
774 3 100       9 return unless $self->count;
775             # ah, compatibility - return last value
776             return $self->{cache}{frequency_distribution_ref}
777 2 50       4 unless defined $index;
778             # make index if number given
779 2 100       5 if (!ref $index) {
780 1 50       3 croak __PACKAGE__.": frequency_distribution_ref(): ".
781             "argument must be array, of number > 2, not $index"
782             unless $index > 2;
783 1         4 my $min = $self->_lower($self->min);
784 1         7 my $max = $self->_upper($self->max);
785 1         3 my $step = ($max - $min) / $index;
786 1         4 $index = [ map { $min + $_ * $step } 1..$index ];
  5         9  
787             };
788              
789 2         10 @$index = (-$INF, sort { $a <=> $b } @$index);
  12         20  
790              
791 2         8 my @count;
792 2         7 for (my $i = 0; $i<@$index-1; $i++) {
793 9         22 push @count, $self->_count( $index->[$i], $index->[$i+1] );
794             };
795 2         4 shift @$index; # remove -inf
796              
797 2         3 my %hash;
798 2         19 @hash{@$index} = @count;
799 2         6 $self->{cache}{frequency_distribution_ref} = \%hash;
800 2         11 return \%hash;
801             };
802              
803             =head1 Specific methods
804              
805             The folowing methods only apply to this module, or are experimental.
806              
807             =cut
808              
809             =head2 bucket_width
810              
811             Get bin width (relative to center of bin). Percentiles are off
812             by no more than half of this. DEPRECATED.
813              
814             =cut
815              
816             sub bucket_width {
817 5     5 1 10 my $self = shift;
818 5         30 return $self->{base} - 1;
819             };
820              
821             =head2 log_base
822              
823             Get upper/lower bound ratio for logarithmic bins.
824             This represents relative precision of sample.
825              
826             =head2 linear_width
827              
828             Get width of linear buckets.
829             This represents absolute precision of sample.
830              
831             =head2 linear_threshold
832              
833             Get absolute value threshold below which interpolation is switched to linear.
834              
835             =cut
836              
837             sub log_base {
838 3     3 1 6 my $self = shift;
839 3         11 return $self->{base};
840             };
841              
842             sub linear_width {
843 3     3 1 7 my $self = shift;
844 3         17 return $self->{linear_width};
845             };
846              
847             sub linear_threshold {
848 8     8 1 27 my $self = shift;
849 8         38 return $self->{linear_thresh};
850             };
851              
852             =head2 add_data_hash ( { value => weight, ... } )
853              
854             Add values with counts/weights.
855             This can be used to import data from other
856             Statistics::Descriptive::LogScale object.
857              
858             Returns self, so that methods can be chained.
859              
860             Negative counts are allowed and treated as "forgetting" data.
861             If a bin count goes below zero, such bin is simply discarded.
862             Minus infinity weight is allowed and has the same effect.
863             Data is guaranteed to remain consistent.
864              
865             If incorrect data is given (i.e. non-numeric, undef, or +infinity),
866             an exception is thrown and nothing changes.
867              
868             B Cache may be reset, even if no data was actually inserted.
869              
870             B It is possible to add infinite values to data pool.
871             The module will try and calculate whatever can still be calculated.
872             However, no portable way of serializing such values is done yet.
873              
874             =cut
875              
876             sub add_data_hash {
877 20     20 1 56 my $self = shift;
878 20         30 my $hash = shift;
879              
880             # check incoming data for being numeric, and no +inf values
881 20         35 eval {
882 25     25   57971 use warnings FATAL => qw(numeric);
  25         84  
  25         45637  
883 20         100 while (my ($k, $v) = each %$hash) {
884 294 100 33     1437 $k == 0+$k and $v == 0+$v and $v < $INF
      66        
885             or die "Infinite count for $k\n";
886             }
887             };
888 20 100       282 croak __PACKAGE__.": add_data_hash failed: $@"
889             if $@;
890              
891 18         42 delete $self->{cache};
892              
893             # update our counters
894 18         70 foreach (keys %$hash) {
895 291 50       453 next unless $hash->{$_};
896 291         454 my $key = $self->_round($_);
897              
898             # Insert data. Make sure -Inf doesn't corrupt our counter.
899 291   100     950 my $newcount = ($self->{data}{$key} || 0) + $hash->{$_};
900 291 100       444 if ($newcount > 0) {
901             # normal insert
902 279         650 $self->{data}{$key} = $newcount;
903 279         469 $self->{count} += $hash->{$_};
904             } else {
905             # We're "forgetting" data, AND the bin got empty
906 12   100     50 $self->{count} -= delete $self->{data}{$key} || 0;
907             };
908             };
909 18         49 $self;
910             };
911              
912             =head2 get_data_hash( %options )
913              
914             Return distribution hashref {value => number of occurances}.
915              
916             This is inverse of add_data_hash.
917              
918             Options may include:
919              
920             =over
921              
922             =item * min - ignore values below this. (See find_boundaries)
923              
924             =item * max - ignore values above this. (See find_boundaries)
925              
926             =item * ltrim - ignore this % of values on lower end. (See find_boundaries)
927              
928             =item * rtrim - ignore this % of values on upper end. (See find_boundaries)
929              
930             =item * noise_thresh - strip bins with count below this.
931              
932             =back
933              
934             =cut
935              
936             sub get_data_hash {
937 65     65 1 11750 my ($self, %opt) = @_;
938              
939             # shallow copy of data if no options given
940 65 100       168 return {%{ $self->{data} }} unless %opt;
  63         682  
941              
942 2         8 my ($min, $max) = $self->find_boundaries( %opt );
943 2   100     6 my $noize = $opt{noize_thresh} || 0;
944              
945 2         4 my $data = $self->{data};
946 2         4 my %hash;
947 2         5 foreach (keys %$data ) {
948 13 100       30 $_ < $min and next;
949 8 50       12 $_ > $max and next;
950 8 100       16 $data->{$_} < $noize and next;
951 6         10 $hash{$_} = $data->{$_};
952             };
953              
954 2         9 return \%hash;
955             };
956              
957             =head2 TO_JSON()
958              
959             Return enough data to recreate the whole object as an unblessed hashref.
960              
961             This routine conforms with C, hence the name.
962             Can be called as
963              
964             my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this );
965              
966             B This module DOES NOT require JSON::XS or serialize to JSON.
967             It just deals with data.
968             Use C, C, C or any serializer of choice.
969              
970             my $raw_data = $stat->TO_JSON;
971             Statistics::Descriptive::LogScale->new( %$raw_data );
972              
973             Would generate an exact copy of C<$stat> object
974             (provided it's S::D::L and not a subclass).
975              
976             =head2 clone( [ %options ] )
977              
978             Copy constructor - returns copy of an existing object.
979             Cache is not preserved.
980              
981             Constructor options may be given to override existing data. See new().
982              
983             Trim options may be given to get partial data. See get_data_hash().
984              
985             =cut
986              
987             sub clone {
988 8     8 1 981 my ($self, %opt) = @_;
989              
990 8         26 my $raw = $self->TO_JSON;
991 8 100       28 if (%opt) {
992             $raw->{data} = $self->get_data_hash( %opt )
993 4 100       12 unless exists $opt{data};
994             exists $opt{$_} and $raw->{$_} = $opt{$_}
995 4   66     30 for @new_keys;
996             };
997              
998 8         41 return (ref $self)->new( %$raw );
999             };
1000              
1001             sub TO_JSON {
1002 32     32 1 173 my $self = shift;
1003             # UGLY HACK Increase linear_thresh by a factor of base ** 1/10
1004             # so that it's rounded down to present value
1005             return {
1006             CLASS => ref $self,
1007             VERSION => $VERSION,
1008             base => $self->{base},
1009             linear_width => $self->{linear_width},
1010             linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10,
1011             only_linear => $self->{only_linear},
1012 32         132 data => $self->get_data_hash,
1013             };
1014             };
1015              
1016             =head2 scale_sample( $scale )
1017              
1018             Multiply all bins' counts by given value. This can be used to adjust
1019             significance of previous data before adding new data (e.g. gradually
1020             "forgetting" past data in a long-running application).
1021              
1022             =cut
1023              
1024             sub scale_sample {
1025 1     1 1 9 my $self = shift;
1026 1         3 my $factor = shift;
1027 1 50       7 $factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive");
1028              
1029 1         3 delete $self->{cache};
1030 1         3 foreach (@{ $self->_sort }) {
  1         5  
1031 100         135 $self->{data}{$_} *= $factor;
1032             };
1033 1         2 $self->{count} *= $factor;
1034 1         3 return $self;
1035             };
1036              
1037             =head2 mean_of( $code, [$min, $max] )
1038              
1039             Return expectation of $code over sample within given range.
1040              
1041             $code is expected to be a pure function (i.e. depending only on its input
1042             value, and having no side effects).
1043              
1044             The underlying integration mechanism only calculates $code once per bin,
1045             so $code should be stable as in not vary wildly over small intervals.
1046              
1047             =cut
1048              
1049             sub mean_of {
1050 24     24 1 97 my $self = shift;
1051 24         82 my ($code, $min, $max) = @_;
1052              
1053 24     429   131 my $weight = $self->sum_of( sub {1}, $min, $max );
  429         1091  
1054 24 100       125 return unless $weight;
1055 23         106 return $self->sum_of($code, $min, $max) / $weight;
1056             };
1057              
1058             =head1 Experimental methods
1059              
1060             These methods may be subject to change in the future, or stay, if they
1061             are good.
1062              
1063             =head2 sum_of ( $code, [ $min, $max ] )
1064              
1065             Integrate arbitrary function over the sample within the [ $min, $max ] interval.
1066             Default values for both limits are infinities of appropriate sign.
1067              
1068             Values in the edge bins are cut using interpolation if needed.
1069              
1070             NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points
1071             between $a and $b.
1072              
1073             EXPERIMENTAL. The method name may change in the future.
1074              
1075             =cut
1076              
1077             sub sum_of {
1078 189     189 1 1081 my $self = shift;
1079 189         487 my ($code, $realmin, $realmax) = @_;
1080              
1081             # Just app up stuff
1082 189 100 100     853 if (!defined $realmin and !defined $realmax) {
1083 166         338 my $sum = 0;
1084 166         298 while (my ($val, $count) = each %{ $self->{data} }) {
  63889         149935  
1085 63723         90845 $sum += $count * $code->( $val );
1086             };
1087 166         1035 return $sum;
1088             };
1089              
1090 23 100       58 $realmin = -$INF unless defined $realmin;
1091 23 100       58 $realmax = $INF unless defined $realmax;
1092 23 50       70 return 0 if( $realmin >= $realmax );
1093              
1094             # correct limits. $min, $max are indices; $left, $right are limits
1095 23         63 my $min = $self->_round($realmin);
1096 23         62 my $max = $self->_round($realmax);
1097 23         65 my $left = $self->_lower($realmin);
1098 23         67 my $right = $self->_upper($realmax);
1099              
1100             # find first bin that's above $left
1101 23         60 my $keys = $self->_sort;
1102 23         72 my $i = _bin_search_ge($keys, $left);
1103              
1104             # warn "sum_of [$min, $max]";
1105             # add up bins
1106 23         44 my $sum = 0;
1107 23         53 for (; $i < @$keys; $i++) {
1108 284         921 my $val = $keys->[$i];
1109 284 100       598 last if $val > $right;
1110 264         564 $sum += $self->{data}{$val} * $code->( $val );
1111             };
1112              
1113             # cut edges: the hard part
1114             # min and max are now used as indices
1115             # if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-)
1116             # warn "Add up, sum_of = $sum";
1117 23 100       132 if ($self->{data}{$max}) {
1118 20         53 my $width = $self->_upper($max) - $self->_lower($max);
1119 20 100       67 my $part = $width
1120             ? ($self->_upper($max) - $realmax) / $width
1121             : 0.5;
1122 20         112 $sum -= $self->{data}{$max} * $code->($max) * $part;
1123             };
1124             # warn "Cut R, sum_of = $sum";
1125 23 100       120 if ($self->{data}{$min}) {
1126 20         50 my $width = $self->_upper($min) - $self->_lower($min);
1127 20 100       60 my $part = $width
1128             ? ($realmin - $self->_lower($min)) / $width
1129             : 0.5;
1130 20         68 $sum -= $self->{data}{$min} * $code->($min) * $part;
1131             };
1132             # warn "Cut L, sum_of = $sum";
1133              
1134 23         150 return $sum;
1135             }; # end sum_of
1136              
1137             =head2 histogram ( %options )
1138              
1139             Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ]
1140             where countX_Y is number of data points between X and Y.
1141              
1142             Options may include:
1143              
1144             =over
1145              
1146             =item * count (+) - number of intervals to divide sample into.
1147              
1148             =item * index (+) - interval borders as array. Will be sorted before processing.
1149              
1150             =item * min - ignore values below this. Default = $self->min - epsilon.
1151              
1152             =item * max - ignore values above this. Default = $self->max + epsilon.
1153              
1154             =item * ltrim - ignore this % of values on lower end.
1155              
1156             =item * rtrim - ignore this % of values on upper end.
1157              
1158             =item * normalize_to - adjust counts so that max number becomes nnn.
1159             This may be useful if one intends to draw pictures.
1160              
1161             =back
1162              
1163             Either count or index must be present.
1164              
1165             NOTE: this is equivalent to frequency_distribution_ref but better suited
1166             for omitting sample tails and outputting pretty pictures.
1167              
1168             =cut
1169              
1170             sub histogram {
1171 5     5 1 2338 my ($self, %opt) = @_;
1172              
1173 5 100       15 return unless $self->count;
1174 4         12 my ($min, $max) = $self->find_boundaries( %opt );
1175             # build/check index
1176 4 100       7 my @index = @{ $opt{index} || [] };
  4         19  
1177 4 100       11 if (!@index) {
1178 3         4 my $n = $opt{count};
1179 3 50       7 $n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )");
1180 3         7 my $step = ($max - $min) / $n;
1181 3         9 for (my $x = $min; $x <= $max; $x += $step) {
1182 16         31 push @index, $x;
1183             };
1184             } else {
1185             # sort & uniq raw index
1186 1         3 my %known;
1187 1         2 @index = grep { !$known{$_}++ } @index;
  5         14  
1188 1         3 @index = sort { $a <=> $b } @index;
  8         12  
1189 1 50       4 @index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)");
1190             };
1191              
1192             # finally: estimated counts between indices!
1193 4         7 my @ret;
1194 4         9 for ( my $i = 0; $i<@index-1; $i++) {
1195 17         37 my $count = $self->_count( $index[$i], $index[$i+1] );
1196 17         61 push @ret, [ $count, $index[$i], $index[$i+1] ];
1197             };
1198              
1199             # if normalize - find maximum & divide by it
1200 4 100       11 if (my $norm = $opt{normalize_to}) {
1201 1         2 my $max = 0;
1202             $max < $_->[0] and $max = $_->[0]
1203 1   66     7 for @ret;
1204 1         2 $norm /= $max;
1205 1         4 $_->[0] *= $norm for @ret;
1206             };
1207              
1208 4         17 return \@ret;
1209             };
1210              
1211             =head2 find_boundaries( %opt )
1212              
1213             Return ($min, $max) of part of sample denoted by options.
1214              
1215             Options may include:
1216              
1217             =over
1218              
1219             =item * min - ignore values below this. default = min() - epsilon.
1220              
1221             =item * max - ignore values above this. default = max() + epsilon.
1222              
1223             =item * ltrim - ignore this % of values on lower end.
1224              
1225             =item * rtrim - ignore this % of values on upper end.
1226              
1227             =back
1228              
1229             If no options are given, the whole sample is guaranteed to reside between
1230             returned values.
1231              
1232             =cut
1233              
1234             sub find_boundaries {
1235 9     9 1 1600 my $self = shift;
1236 9         22 my %opt = @_;
1237              
1238 9 100       52 return unless $self->count;
1239              
1240             # preprocess boundaries
1241 8 100       31 my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min );
1242 8 100       31 my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max );
1243              
1244 8 100       22 if ($opt{ltrim}) {
1245 1         5 my $newmin = $self->percentile( $opt{ltrim} );
1246 1 50 33     7 defined $newmin and $newmin > $min and $min = $newmin;
1247             };
1248 8 100       19 if ($opt{utrim}) {
1249 1         4 my $newmax = $self->percentile( 100-$opt{utrim} );
1250 1 50 33     6 defined $newmax and $newmax < $max and $max = $newmax;
1251             };
1252              
1253 8         25 return ($min, $max);
1254             };
1255              
1256             =head2 format( "printf-like expression", ... )
1257              
1258             Returns a summary as requested by format string.
1259             Just as with printf and sprintf, a placeholder starts with a C<%>,
1260             followed by formatting options and a
1261              
1262             The following placeholders are supported:
1263              
1264             =over
1265              
1266             =item * % - a literal %
1267              
1268             =item * s, f, g - a normal printf acting on an extra argument.
1269             The number of extra arguments MUST match the number of such placeholders,
1270             or this function dies.
1271              
1272             =item * n - count;
1273              
1274             =item * m - min;
1275              
1276             =item * M - max,
1277              
1278             =item * a - mean,
1279              
1280             =item * d - standard deviation,
1281              
1282             =item * S - skewness,
1283              
1284             =item * K - kurtosis,
1285              
1286             =item * q(x) - x-th quantile (requires argument),
1287              
1288             =item * p(x) - x-th percentile (requires argument),
1289              
1290             =item * P(x) - cdf - the inferred cumulative distribution function (x)
1291             (requires argument),
1292              
1293             =item * e(n) - central_moment - central moment of n-th power
1294             (requires argument),
1295              
1296             =item * E(n) - std_moment - standard moment of n-th power (requires argument),
1297              
1298             =item * A(n) - abs_moment - absolute moment of n-th power (requires argument).
1299              
1300             =back
1301              
1302             For example,
1303              
1304             $stat->format( "99%% results lie between %p(0.5) and %p(99.5)" );
1305              
1306             Or
1307              
1308             for( my $i = 0; $i < @stats; $i++ ) {
1309             print $stats[$i]->format( "%s-th average value is %a +- %d", $i );
1310             };
1311              
1312             =cut
1313              
1314             my %format = (
1315             # percent literal
1316             '%' => '%',
1317             # placeholders without parameters
1318             n => 'count',
1319             m => 'min',
1320             M => 'max',
1321             a => 'mean',
1322             d => 'std_dev',
1323             S => 'skewness',
1324             K => 'kurtosis',
1325             # placeholders with 1 parameter
1326             q => 'quantile?',
1327             p => 'percentile?',
1328             P => 'cdf?',
1329             e => 'central_moment?',
1330             E => 'std_moment?',
1331             A => 'abs_moment?',
1332             );
1333              
1334             my %printf = (
1335             s => 1,
1336             f => 1,
1337             g => 1,
1338             );
1339              
1340             my $re_format = join "|", keys %format, keys %printf;
1341             $re_format = qr((?:$re_format));
1342              
1343             sub format {
1344 5     5 1 18 my ($self, $format, @extra) = @_;
1345              
1346             # FIXME this accepts %m(5), then dies - UGLY
1347             # TODO rewrite this as a giant sprintf... one day...
1348 5         114 $format =~ s <%([0-9.\-+ #]*)($re_format)(?:\(($re_num)?\)){0,1}>
  7         21  
1349             < _format_dispatch($self, $2, $1, $3, \@extra) >ge;
1350 5 50       17  
1351             croak __PACKAGE__.": Extra arguments in format()"
1352 5         35 if @extra;
1353             return $format;
1354             };
1355              
1356 7     7   30 sub _format_dispatch {
1357             my ($obj, $method, $float, $arg, $extra) = @_;
1358              
1359 7 100       24 # Handle % escapes
1360 1         4 if ($method !~ /^[a-zA-Z]/) {
1361             return $method;
1362             };
1363 6 100       19 # Handle printf built-in formats
1364 1 50       3 if (!$format{$method}) {
1365             croak __PACKAGE__.": Not enough arguments in format()"
1366 1         9 unless @$extra;
1367             return sprintf "%${float}${method}", shift @$extra;
1368             };
1369              
1370 5         10 # Now we know it's LogScale's own method
1371 5 100       18 $method = $format{$method};
1372 2 50       9 if ($method =~ s/\?$//) {
1373             die "Missing argument in method $method" if !defined $arg;
1374 3 50       7 } else {
1375             die "Extra argument in method $method" if defined $arg;
1376 5         21 };
1377             my $result = $obj->$method($arg);
1378              
1379 5 100 100     21 # work around S::D::Full's convention that "-inf == undef"
1380             $result = -9**9**9
1381 5         42 if ($method eq 'percentile' and !defined $result);
1382             return sprintf "%${float}f", $result;
1383             };
1384              
1385             ################################################################
1386             # No more public methods please
1387              
1388             # MEMOIZE
1389             # We'll keep methods' returned values under {cache}.
1390             # All setters destroy said cache altogether.
1391             # PLEASE replace this with a ready-made module if there's one.
1392              
1393             # Sorry for this black magic, but it's too hard to write //= in EVERY method
1394             # Long story short
1395             # The next sub replaces $self->foo() with
1396             # sub { $self->{cache}{foo} //= $self->originnal_foo }
1397             # All setter methods are EXPECTED to destroy {cache} altogether.
1398              
1399             # NOTE if you plan subclassing the method, re-memoize methods you change.
1400 300     300   636 sub _memoize_method {
1401             my ($class, $name, $arg) = @_;
1402 300         1083  
1403 300 50       781 my $orig_code = $class->can($name);
1404             die "Error in memoizer section ($name)"
1405             unless ref $orig_code eq 'CODE';
1406              
1407             # begin long conditional
1408             my $cached_code = !$arg
1409 253 100   253   8570 ? sub {
1410 111         323 if (!exists $_[0]->{cache}{$name}) {
1411             $_[0]->{cache}{$name} = $orig_code->($_[0]);
1412 253         1142 };
1413             return $_[0]->{cache}{$name};
1414             }
1415 242     242   2660 : sub {
1416 242         420 my $self = shift;
1417 25     25   240 my $arg = do {
  25         63  
  25         3424  
1418 242         697 no warnings 'uninitialized'; ## no critic
1419             join ':', @_;
1420             };
1421 242 100       852  
1422 143         411 if (!exists $self->{cache}{"$name:$arg"}) {
1423             $self->{cache}{"$name:$arg"} = $orig_code->($self, @_);
1424 241         1390 };
1425 300 100       1250 return $self->{cache}{"$name:$arg"};
1426             };
1427             # conditional ends here
1428 25     25   181  
  25         58  
  25         902  
1429 25     25   179 no strict 'refs'; ## no critic
  25         69  
  25         2587  
1430 300         519 no warnings 'redefine'; ## no critic
  300         1137  
1431             *{$class."::".$name} = $cached_code;
1432             }; # end of _memoize_method
1433              
1434             # Memoize all the methods w/o arguments
1435             foreach ( qw(sum sumsq mean min max mode) ) {
1436             __PACKAGE__->_memoize_method($_);
1437             };
1438              
1439             # Memoize methods with 1 argument
1440             foreach (qw(quantile central_moment std_moment abs_moment standard_deviation
1441             variance)) {
1442             __PACKAGE__->_memoize_method($_, 1);
1443             };
1444              
1445             # add shorter alias of standard_deviation (this must happen AFTER memoization)
1446 25     25   177 {
  25         69  
  25         8698  
1447             no warnings 'once'; ## no critic
1448             *std_dev = \&standard_deviation;
1449             *stdev = \&standard_deviation;
1450             };
1451              
1452             # Get number of values below $x
1453             # Like sum_of(sub{1}, undef, $x), but faster.
1454             # Used by cdf()
1455 105     105   144 sub _count {
1456 105 100       214 my $self = shift;
1457 74         95 @_>1 and return $self->_count($_[1]) - $self->_count($_[0]);
1458             my $x = shift;
1459 74         130  
1460 74         137 my $upper = $self->_upper($x);
1461 74 100       148 my $i = _bin_search_gt( $self->_sort, $upper );
1462 65         110 !$i-- and return 0;
1463             my $count = $self->_probability->[$i];
1464              
1465 65         115 # interpolate
1466 65 100       273 my $bin = $self->_round( $x );
1467 34         57 if (my $val = $self->{data}{$bin}) {
1468 34 100       71 my $width = ($upper - $bin) * 2;
1469 34         50 my $part = $width ? ( ($upper - $x) / $width) : 1/2;
1470             $count -= $part * $val;
1471 65         178 };
1472             return $count;
1473             };
1474              
1475             # BINARY SEARCH
1476             # Not a method, just a function
1477             # Takes sorted \@array and a $num
1478             # Return lowest $i such that $array[$i] >= $num
1479             # Return (scalar @array) if no such $i exists
1480 147     147   865 sub _bin_search_ge {
1481             my ($array, $x) = @_;
1482 147 100 100     640  
1483 126         226 return 0 unless @$array and $array->[0] < $x;
1484 126         200 my $l = 0;
1485 126         286 my $r = @$array;
1486 405         774 while ($l+1 < $r) {
1487 405 100       1045 my $m = int( ($l + $r) /2);
1488             $array->[$m] < $x ? $l = $m : $r = $m;
1489 126         280 };
1490             return $l+1;
1491             };
1492 74     74   130 sub _bin_search_gt {
1493 74         112 my ($array, $x) = @_;
1494 74 100 100     216 my $i = _bin_search_ge(@_);
1495 74         108 $i++ if defined $array->[$i] and $array->[$i] == $x;
1496             return $i;
1497             };
1498              
1499             # THE CORE
1500             # Here come the number=>bin functions
1501             # round() generates bin center
1502             # upper() and lower() are respective boundaries.
1503             # Here's the algorithm:
1504             # 1) determine whether bin is linear or logarithmic
1505             # 2) for linear bins, return bin# * bucket_width (==2*absolute error)
1506             # add/subtract 0.5 to get edges.
1507             # 3) for logarithmic bins, return base ** bin# with appropriate sign
1508             # multiply by precalculated constant to get edges
1509             # note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2
1510             # This part is fragile and can be written better
1511              
1512             # center of bin containing x
1513 90924     90924   121697 sub _round {
1514 90924         114147 my $self = shift;
1515             my $x = shift;
1516 25     25   189  
  25         73  
  25         3763  
1517             use warnings FATAL => qw(numeric uninitialized);
1518 90924 100       168333  
1519             if (abs($x) <= $self->{linear_thresh}) {
1520 1792   100     5680 return $self->{linear_width}
1521             && $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 );
1522 89129         178971 };
1523 89129         149352 my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase});
1524 89129 100       316496 my $value = $self->{base} ** $i;
1525             return $x < 0 ? -$value : $value;
1526             };
1527              
1528             # lower, upper limits of bin containing x
1529 887     887   52542 sub _lower {
1530 887         1353 my $self = shift;
1531             my $x = shift;
1532 25     25   199  
  25         65  
  25         8028  
1533             use warnings FATAL => qw(numeric uninitialized);
1534 887 100       2322  
1535             if (abs($x) <= $self->{linear_thresh}) {
1536 188   66     1784 return $self->{linear_width}
1537             && $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5);
1538 699         2178 };
1539 699 100       1490 my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase});
1540 310         1294 if ($x > 0) {
1541             return $self->{floor} * $self->{base}**($i);
1542 389         1624 } else {
1543             return -$self->{floor} * $self->{base}**($i+1);
1544             };
1545             };
1546              
1547 462     462   5227 sub _upper {
1548             return -$_[0]->_lower(-$_[1]);
1549             };
1550              
1551             # build bin index
1552 191     191   308 sub _sort {
1553             my $self = shift;
1554 191   100     758 return $self->{cache}{sorted}
  1234         2257  
  32         292  
1555             ||= [ sort { $a <=> $b } keys %{ $self->{data} } ];
1556             };
1557              
1558             # build cumulative bin counts index
1559 107     107   170 sub _probability {
1560 107   66     330 my $self = shift;
1561 20         54 return $self->{cache}{probability} ||= do {
1562 20         41 my @array;
1563 20         43 my $sum = 0;
  20         89  
1564 165         324 foreach (@{ $self->_sort }) {
1565 165         391 $sum += $self->{data}{$_};
1566             push @array, $sum;
1567 20         115 };
1568             \@array;
1569             };
1570             };
1571              
1572             =head1 AUTHOR
1573              
1574             Konstantin S. Uvarin, C<< >>
1575              
1576             =head1 BUGS
1577              
1578             The module is currently under development. There may be bugs.
1579              
1580             C only works for discrete distributions, and simply returns
1581             the first bin with largest bin count.
1582             A better algorithm is wanted.
1583              
1584             C should have been made a private method.
1585             Its signature and/or name may change in the future.
1586              
1587             See the TODO file in the distribution package.
1588              
1589             Please feel free to post bugs and/or feature requests to github:
1590             L
1591              
1592             Alternatively, you can use CPAN RT
1593             via e-mail C,
1594             or through the web interface at
1595             L.
1596              
1597             Your contribution is appreciated.
1598              
1599             =head1 SUPPORT
1600              
1601             You can find documentation for this module with the perldoc command.
1602              
1603             perldoc Statistics::Descriptive::LogScale
1604              
1605             You can also look for information at:
1606              
1607             =over 4
1608              
1609             =item * GitHub:
1610              
1611             L
1612              
1613             =item * RT: CPAN's request tracker (report bugs here)
1614              
1615             L
1616              
1617             =item * AnnoCPAN: Annotated CPAN documentation
1618              
1619             L
1620              
1621             =item * CPAN Ratings
1622              
1623             L
1624              
1625             =item * Search CPAN
1626              
1627             L
1628              
1629             =back
1630              
1631             =head1 ACKNOWLEDGEMENTS
1632              
1633             This module was inspired by a talk that Andrew Aksyonoff, author of
1634             L,
1635             has given at HighLoad++ conference in Moscow, 2012.
1636              
1637             L was and is used as reference when in doubt.
1638             Several code snippets were shamelessly stolen from there.
1639              
1640             C and C parameter names were suggested by
1641             CountZero from http://perlmonks.org
1642              
1643             =head1 LICENSE AND COPYRIGHT
1644              
1645             Copyright 2013-2015 Konstantin S. Uvarin.
1646              
1647             This program is free software; you can redistribute it and/or modify it
1648             under the terms of either: the GNU General Public License as published
1649             by the Free Software Foundation; or the Artistic License.
1650              
1651             See http://dev.perl.org/licenses/ for more information.
1652              
1653             =cut
1654              
1655             1; # End of Statistics::Descriptive::LogScale