File Coverage

blib/lib/Statistics/Descriptive/LogScale.pm
Criterion Covered Total %
statement 363 364 99.7
branch 139 164 84.7
condition 52 68 76.4
subroutine 56 56 100.0
pod 28 28 100.0
total 638 680 93.8


line stmt bran cond sub pod time code
1 23     23   1406756 use 5.006;
  23         334  
2 23     23   147 use strict;
  23         52  
  23         623  
3 23     23   125 use warnings;
  23         61  
  23         1692  
4              
5             package Statistics::Descriptive::LogScale;
6              
7             =head1 NAME
8              
9             Statistics::Descriptive::LogScale - Memory-efficient approximate univariate
10             descriptive statistics class.
11              
12             =head1 VERSION
13              
14             Version 0.10
15              
16             =cut
17              
18             our $VERSION = 0.10;
19              
20             =head1 SYNOPSIS
21              
22             =head2 Basic usage
23              
24             The basic usage is roughly the same as that of L.
25              
26             use Statistics::Descriptive::LogScale;
27             my $stat = Statistics::Descriptive::LogScale->new ();
28              
29             while(<>) {
30             chomp;
31             $stat->add_data($_);
32             };
33              
34             # This can also be done in O(1) memory, precisely
35             printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation;
36             # This requires storing actual data, or approximating
37             printf "25%% : %f\n", $stat->percentile(25);
38             printf "Median: %f\n", $stat->median;
39             printf "75%% : %f\n", $stat->percentile(75);
40              
41             =head2 Save/load
42              
43             This is not present in L.
44             The save/load interface is designed compatible with JSON::XS.
45             However, any other serializer can be used.
46             The C method is I to return unblessed hashref
47             with enough information to restore the original object.
48              
49             use Statistics::Descriptive::LogScale;
50             my $stat = Statistics::Descriptive::LogScale->new ();
51              
52             # ..... much later
53             # Save
54             print $fd encoder_of_choice( $stat->TO_JSON )
55             or die "Failed to save: $!";
56              
57             # ..... and even later
58             # Load
59             my $plain_hash = decoder_of_choice( $raw_data );
60             my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash );
61              
62             # Import into existing LogScale instance
63             my $plain_hash = decoder_of_choice( $more_raw_data );
64             $copy_of_stat->add_data_hash( $plain_hash->{data} );
65              
66             =head2 Histograms
67              
68             Both L and L
69             offer C method for querying data point counts.
70             However, there's also C method for making pretty pictures.
71             Here's a simple text-based histogram.
72             A proper GD example was too long to fit into this margin.
73              
74             use strict;
75             use warnings;
76              
77             use Statistics::Descriptive::LogScale;
78             my $stat = Statistics::Descriptive::LogScale->new ();
79              
80             # collect/load data ...
81             my $re_float = qr([-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?);
82             while (<>) {
83             $stat->add_data($_) for m/($re_float)/g;
84             };
85             die "Empty set"
86             unless $stat->count;
87              
88             # get data in [ count, lower_bound, upper_bound ] format as arrayref
89             my $hist = $stat->histogram( count => 20 );
90              
91             # find maximum value to use as a scale factor
92             my $scale = $hist->[0][0];
93             $scale < $_->[0] and $scale = $_->[0] for @$hist;
94              
95             foreach (@$hist) {
96             printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale);
97             };
98             printf "%10f\n", $hist->[-1][2];
99              
100             =head1 DESCRIPTION
101              
102             This module aims at providing some advanced statistical functions without
103             storing all data in memory, at the cost of certain (predictable) precision loss.
104              
105             Data is represented by a set of bins that only store counts of fitting
106             data points.
107             Most bins are logarithmic, i.e. lower end / upper end ratio is constant.
108             However, around zero linear approximation may be user instead
109             (see "linear_width" and "linear_thresh" parameters in new()).
110              
111             All operations are then performed on the bins, introducing relative error
112             which does not, however, exceed the bins' relative width ("base").
113              
114             =head1 METHODS
115              
116             =cut
117              
118             ########################################################################
119             # == HOW IT WORKS ==
120             # Buckets are stored in a hash: { $value => $count, ... }
121             # {base} is bin width, {logbase} == log {base} (cache)
122             # {linear_thresh} is where we switch to equal bin approximation
123             # {linear_width} is width of bin around zero (==linear_thresh if not given)
124             # {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor}
125             # Nearly all meaningful subs have to scan all the bins, which is bad,
126             # but anyway better than scanning full sample.
127              
128 23     23   160 use Carp;
  23         56  
  23         1504  
129 23     23   7634 use POSIX qw(floor ceil);
  23         136190  
  23         165  
130              
131             # Fields are NOT used internally for now, so this is just a declaration
132 23         104 use fields qw(
133             data count
134             linear_width base logbase floor linear_thresh logfloor
135             cache
136 23     23   41360 );
  23         34367  
137              
138             # Some internal constants
139             # This is for infinite portability^W^W portable infinity
140             my $INF = 9**9**9;
141             my $re_num = qr/(?:[-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?)/;
142              
143             =head2 new( %options )
144              
145             %options may include:
146              
147             =over
148              
149             =item * base - ratio of adjacent bins. Default is 10^(1/232), which gives
150             1% precision and exact decimal powers.
151             This value represents acceptable relative error in analysis results.
152              
153             B Actual value may be slightly less than requested one.
154             This is done so to avoid troubles with future rounding in (de)serialization.
155              
156             =item * linear_width - width of linear bins around zero.
157             This value represents precision of incoming data.
158             Default is zero, i.e. we assume that the measurement is precise.
159              
160             B Actual value may be less (by no more than a factor of C)
161             so that borders of linear and logarithmic bins fit nicely.
162              
163             =item * linear_thresh - where to switch to linear approximation.
164             If only one of C and C is given,
165             the other will be calculated.
166             However, user may want to specify both in some cases.
167              
168             B Actual value may be less (by no more than a factor of C)
169             so that borders of linear and logarithmic bins fit nicely.
170              
171             =item * data - hashref with C<{ value => weight }> for initializing data.
172             Used for cloning.
173             See C.
174              
175             =item * zero_thresh - absolute value threshold below which everything is
176             considered zero.
177             DEPRECATED, C and C override this if given.
178              
179             =back
180              
181             =cut
182              
183             my @new_keys = qw( base linear_thresh linear_width zero_thresh data );
184             # TODO Throw if extra options given?
185             sub new {
186 42     42 1 24214 my $class = shift;
187 42         187 my %opt = @_;
188              
189             # base for logarithmic bins, sane default: +-1%, exact decimal powers
190             # UGLY HACK number->string->number to avoid
191             # future serialization inconsistencies
192 42   100     275 $opt{base} ||= 10**(1/232);
193 42         376 $opt{base} = 0 + "$opt{base}";
194 42 50       170 $opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1";
195              
196             # calculate where to switch to linear approximation
197             # the condition is: linear bin( thresh ) ~~ log bin( thresh )
198             # i.e. thresh * base - thresh ~~ absolute error * 2
199             # i.e. thresh ~~ absolute_error * 2 / (base - 1)
200             # also support legacy API (zero_thresh)
201 42 100       150 if (defined $opt{linear_thresh} ) {
202 13   100     61 $opt{linear_width} ||= $opt{linear_thresh} * ($opt{base}-1);
203             } else {
204 29         83 $opt{linear_thresh} = $opt{zero_thresh};
205             };
206             $opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1)
207 42 100 100     217 if $opt{linear_width} and !$opt{linear_thresh};
208 42   100     206 $opt{linear_thresh} ||= 0;
209 42 50       172 $opt{linear_thresh} >= 0
210             or croak __PACKAGE__.": new(): linear_thresh must be >= 0";
211              
212             # Can't use fields::new anymore
213             # due to JSON::XS incompatibility with restricted hashes
214 42         135 my $self = bless {}, $class;
215              
216 42         198 $self->{base} = $opt{base};
217             # cache values to ease calculations
218             # floor = (lower end of bin) / (center of bin)
219 42         187 $self->{floor} = 2/(1+$opt{base});
220 42         186 $self->{logbase} = log $opt{base};
221 42         127 $self->{logfloor} = log $self->{floor};
222              
223             # bootstrap linear_thresh - make it fit bin edge
224 42         109 $self->{linear_width} = $self->{linear_thresh} = 0;
225 42         192 $self->{linear_thresh} = $self->_lower( $opt{linear_thresh} );
226              
227             # divide anything below linear_thresh into odd number of bins
228             # not exceeding requested linear_width
229 42 100       146 if ($self->{linear_thresh}) {
230 15   66     64 my $linear_width = $opt{linear_width} || 2 * $self->{linear_thresh};
231 15         76 my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width));
232 15 100       63 $n_linear++ unless $n_linear % 2;
233 15         48 $self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear);
234             };
235              
236 42         177 $self->clear;
237 42 100       139 if ($opt{data}) {
238 10         37 $self->add_data_hash($opt{data});
239             };
240 42         206 return $self;
241             };
242              
243             =head1 General statistical methods
244              
245             These methods are used to query the distribution properties. They generally
246             follow the interface of L and co,
247             with minor additions.
248              
249             All methods return C on empty data set, except for
250             C, C, C, C and C
251             which all return 0.
252              
253             B This module caches whatever it calculates very agressively.
254             Don't hesitate to use statistical functions (except for sum_of/mean_of)
255             more than once. The cache is deleted upon data entry.
256              
257             =head2 clear
258              
259             Destroy all stored data.
260              
261             =cut
262              
263             sub clear {
264 56     56 1 8319 my $self = shift;
265 56         2266 $self->{data} = {};
266 56         181 $self->{count} = 0;
267 56         150 delete $self->{cache};
268 56         115 return $self;
269             };
270              
271             =head2 add_data( @data )
272              
273             Add numbers to the data pool.
274              
275             Returns self, so that methods can be chained.
276              
277             If incorrect data is given (i.e. non-numeric, undef),
278             an exception is thrown and only partial data gets inserted.
279             The state of object is guaranteed to remain consistent in such case.
280              
281             B Cache is reset, even if no data was actually inserted.
282              
283             B It is possible to add infinite values to data pool.
284             The module will try and calculate whatever can still be calculated.
285             However, no portable way of serializing such values is done yet.
286              
287             =cut
288              
289             sub add_data {
290 51213     51213 1 162300 my $self = shift;
291              
292 51213         71115 delete $self->{cache};
293 51213         80032 foreach (@_) {
294 90141         152375 $self->{data}{ $self->_round($_) }++;
295 90138         167178 $self->{count}++;
296             };
297 51210         92938 $self;
298             };
299              
300             =head2 count
301              
302             Returns number of data points.
303              
304             =cut
305              
306             sub count {
307 69     69 1 10188 my $self = shift;
308 69         284 return $self->{count};
309             };
310              
311             =head2 min
312              
313             =head2 max
314              
315             Values of minimal and maximal bins.
316              
317             NOTE: Due to rounding, some of the actual inserted values may fall outside
318             of the min..max range. This may change in the future.
319              
320             =cut
321              
322             sub min {
323             my $self = shift;
324             return $self->_sort->[0];
325             };
326              
327             sub max {
328             my $self = shift;
329             return $self->_sort->[-1];
330             };
331              
332             =head2 sample_range
333              
334             Return sample range of the dataset, i.e. max() - min().
335              
336             =cut
337              
338             sub sample_range {
339 5     5 1 739 my $self = shift;
340 5 100       16 return $self->count ? $self->max - $self->min : undef;
341             };
342              
343             =head2 sum
344              
345             Return sum of all data points.
346              
347             =cut
348              
349             sub sum {
350             my $self = shift;
351             return $self->sum_of(sub { $_[0] });
352             };
353              
354             =head2 sumsq
355              
356             Return sum of squares of all datapoints.
357              
358             =cut
359              
360             sub sumsq {
361             my $self = shift;
362             return $self->sum_of(sub { $_[0] * $_[0] });
363             };
364              
365             =head2 mean
366              
367             Return mean, or average value, i.e. sum()/count().
368              
369             =cut
370              
371             sub mean {
372             my $self = shift;
373             return $self->{count} ? $self->sum / $self->{count} : undef;
374             };
375              
376             =head2 variance
377              
378             =head2 variance( $correction )
379              
380             Return data variance, i.e. E((x - E(x)) ** 2).
381              
382             Bessel's correction (division by n-1 instead of n) is used by default.
383             This may be changed by specifying $correction explicitly.
384              
385             B The binning strategy used here should also introduce variance bias.
386             This is not yet accounted for.
387              
388             =cut
389              
390             sub variance {
391             my $self = shift;
392             my $correction = shift;
393              
394             # in fact we'll receive correction='' because of how cache works
395             $correction = 1 unless defined $correction and length $correction;
396              
397             return 0 if ($self->{count} < 1 + $correction);
398              
399             my $var = $self->sumsq - $self->sum**2 / $self->{count};
400             return $var <= 0 ? 0 : $var / ( $self->{count} - $correction );
401             };
402              
403             =head2 standard_deviation
404              
405             =head2 standard_deviation( $correction )
406              
407             =head2 std_dev
408              
409             Return standard deviation, i.e. square root of variance.
410              
411             Bessel's correction (division by n-1 instead of n) is used by default.
412             This may be changed by specifying $correction explicitly.
413              
414             B The binning strategy used here should also introduce variance bias.
415             This is not yet accounted for.
416              
417             =cut
418              
419             sub standard_deviation {
420             my $self = shift;
421              
422             return sqrt($self->variance(@_));
423             };
424              
425             =head2 cdf ($x)
426              
427             Cumulative distribution function. Returns estimated probability of
428             random data point from the sample being less than C<$x>.
429              
430             As a special case, C accounts for I of zeroth bin count (if any).
431              
432             Not present in Statistics::Descriptive::Full, but appears in
433             L.
434              
435             =head2 cdf ($x, $y)
436              
437             Returns probability of a value being between C<$x> and C<$y> ($x <= $y).
438             This is essentially C.
439              
440             =cut
441              
442             sub cdf {
443 4     4 1 9 my $self = shift;
444 4 50       11 return unless $self->{count};
445 4         10 return $self->_count(@_) / $self->{count};
446             };
447              
448             =head2 percentile( $n )
449              
450             Find $n-th percentile, i.e. a value below which lies $n % of the data.
451              
452             0-th percentile is by definition -inf and is returned as undef
453             (see Statistics::Descriptive).
454              
455             $n is a real number, not necessarily integer.
456              
457             =cut
458              
459             sub percentile {
460 48     48 1 3068 my $self = shift;
461 48         80 my $x = shift;
462              
463             # assert 0<=$x<=100
464 48 50 33     218 croak __PACKAGE__.": percentile() argument must be between 0 and 100"
465             unless 0<= $x and $x <= 100;
466              
467 48         124 my $need = $x * $self->{count} / 100;
468 48 100       137 return if $need < 1;
469              
470             # dichotomize
471             # $i is lowest value >= needed
472             # $need doesnt exceed last bin!
473 42         99 my $i = _bin_search_ge( $self->_probability, $need );
474 42         94 return $self->_sort->[ $i ];
475             };
476              
477             =head2 quantile( 0..4 )
478              
479             From Statistics::Descriptive manual:
480              
481             0 => zero quartile (Q0) : minimal value
482             1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
483             2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
484             3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
485             4 => fourth quartile (Q4) : maximal value
486              
487             =cut
488              
489             sub quantile {
490             my $self = shift;
491             my $t = shift;
492              
493             croak (__PACKAGE__.": quantile() argument must be one of 0..4")
494             unless $t =~ /^[0-4]$/;
495              
496             $t or return $self->min;
497             return $self->percentile($t * 100 / 4);
498             };
499              
500             =head2 median
501              
502             Return median of data, a value that divides the sample in half.
503             Same as percentile(50).
504              
505             =cut
506              
507             sub median {
508 16     16 1 1171 my $self = shift;
509 16         46 return $self->percentile(50);
510             };
511              
512             =head2 trimmed_mean( $ltrim, [ $utrim ] )
513              
514             Return mean of sample with $ltrim and $utrim fraction of data points
515             remover from lower and upper ends respectively.
516              
517             ltrim defaults to 0, and rtrim to ltrim.
518              
519             =cut
520              
521             sub trimmed_mean {
522 1     1 1 3 my $self = shift;
523 1         2 my ($lower, $upper) = @_;
524 1   50     3 $lower ||= 0;
525 1 50       4 $upper = $lower unless defined $upper;
526              
527 1         4 my $min = $self->percentile($lower * 100);
528 1         4 my $max = $self->percentile(100 - $upper * 100);
529              
530 1 50       4 return unless $min < $max;
531              
532 1     5   6 return $self->mean_of(sub{$_[0]}, $min, $max);
  5         12  
533             };
534              
535             =head2 harmonic_mean
536              
537             Return harmonic mean of the data, i.e. 1/E(1/x).
538              
539             Return undef if division by zero occurs (see Statistics::Descriptive).
540              
541             =cut
542              
543             sub harmonic_mean {
544 2     2 1 708 my $self = shift;
545              
546 2         4 my $ret;
547 2         4 eval {
548 2     5   6 $ret = $self->count / $self->sum_of(sub { 1/$_[0] });
  5         22  
549             };
550 2 50 66     23 if ($@ and $@ !~ /division.*zero/) {
551 0         0 die $@; # rethrow ALL BUT 1/0 which yields undef
552             };
553 2         12 return $ret;
554             };
555              
556             =head2 geometric_mean
557              
558             Return geometric mean of the data, that is, exp(E(log x)).
559              
560             Dies unless all data points are of the same sign.
561              
562             =cut
563              
564             sub geometric_mean {
565 2     2 1 719 my $self = shift;
566              
567 2 100       28 return unless $self->count;
568 1 50       3 croak __PACKAGE__.": geometric_mean() called on mixed sign sample"
569             if $self->min * $self->max < 0;
570              
571 1 50       3 return 0 if $self->{data}{0};
572             # this must be dog slow, but we already log() too much at this point.
573 1     5   6 my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} );
  5         13  
574 1 50       5 return $self->min < 0 ? -$ret : $ret;
575             };
576              
577             =head2 skewness
578              
579             Return skewness of the distribution, calculated as
580             n/(n-1)(n-2) * E((x-E(x))**3)/std_dev**3 (this is consistent with Excel).
581              
582             =cut
583              
584             sub skewness {
585 20     20 1 1567 my $self = shift;
586              
587 20         50 my $n = $self->{count};
588 20 100       73 return unless $n > 2;
589              
590             # code stolen from Statistics::Descriptive
591 18         59 my $skew = $n * $self->std_moment(3);
592 18         49 my $correction = $n / ( ($n-1) * ($n-2) );
593 18         95 return $correction * $skew;
594             };
595              
596             =head2 kurtosis
597              
598             Return kurtosis of the distribution, that is 4-th standardized moment - 3.
599             The exact formula used here is consistent with that of Excel and
600             Statistics::Descriptive.
601              
602             =cut
603              
604             sub kurtosis {
605 22     22 1 805 my $self = shift;
606              
607 22         56 my $n = $self->{count};
608 22 100       85 return unless $n > 3;
609              
610             # code stolen from Statistics::Descriptive
611 20         53 my $kurt = $n * $self->std_moment(4);
612 20         73 my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
613 20         58 my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
614              
615 20         99 return $correction1 * $kurt - $correction2;
616             };
617              
618             =head2 central_moment( $n )
619              
620             Return $n-th central moment, that is, E((x - E(x))^$n).
621              
622             Not present in Statistics::Descriptive::Full.
623              
624             =cut
625              
626             sub central_moment {
627             my $self = shift;
628             my $n = shift;
629              
630             my $mean = $self->mean;
631             return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count};
632             };
633              
634             =head2 std_moment( $n )
635              
636             Return $n-th standardized moment, that is,
637             E((x - E(x))**$n) / std_dev(x)**$n.
638              
639             Not present in Statistics::Descriptive::Full.
640              
641             =cut
642              
643             sub std_moment {
644             my $self = shift;
645             my $n = shift;
646              
647             my $mean = $self->mean;
648             my $dev = $self->std_dev;
649             return $self->sum_of(sub{ ($_[0] - $mean) ** $n })
650             / ( $dev**$n * $self->{count} );
651             };
652              
653             =head2 mode
654              
655             Mode of a distribution is the most common value for a discrete distribution,
656             or maximum of probability density for continuous one.
657              
658             For now we assume that the distribution IS discrete, and return the bin with
659             the biggest hit count.
660              
661             NOTE A better algorithm is still wanted. Experimental.
662             Behavior may change in the future.
663              
664             =cut
665              
666             # Naive implementation
667             # Find bin w/greatest count and return it
668             sub mode {
669             my $self = shift;
670              
671             return if !$self->count;
672              
673             my $index = $self->_sort;
674             return $index->[0] if @$index == 1;
675              
676             my @count = map { $self->{data}{$_} } @$index;
677              
678             my $max_index;
679             my $max_growth = 0;
680             for (my $i = 0; $i<@count; $i++) {
681             $count[$i] > $max_growth or next;
682             $max_index = $i;
683             $max_growth = $count[$i];
684             };
685              
686             return $index->[$max_index];
687             };
688              
689             =head2 frequency_distribution_ref( \@index )
690              
691             =head2 frequency_distribution_ref( $n )
692              
693             =head2 frequency_distribution_ref
694              
695             Return numbers of data point counts below each number in @index as hashref.
696              
697             If a number is given instead of arrayref, @index is created
698             by dividing [min, max] into $n intervals.
699              
700             If no parameters are given, return previous result, if any.
701              
702             =cut
703              
704             sub frequency_distribution_ref {
705 3     3 1 1106 my $self = shift;
706 3         5 my $index = shift;
707              
708 3 100       9 return unless $self->count;
709             # ah, compatibility - return last value
710             return $self->{cache}{frequency_distribution_ref}
711 2 50       5 unless defined $index;
712             # make index if number given
713 2 100       8 if (!ref $index) {
714 1 50       3 croak __PACKAGE__.": frequency_distribution_ref(): ".
715             "argument must be array, of number > 2, not $index"
716             unless $index > 2;
717 1         8 my $min = $self->_lower($self->min);
718 1         4 my $max = $self->_upper($self->max);
719 1         3 my $step = ($max - $min) / $index;
720 1         4 $index = [ map { $min + $_ * $step } 1..$index ];
  5         9  
721             };
722              
723 2         10 @$index = (-$INF, sort { $a <=> $b } @$index);
  12         24  
724              
725 2         5 my @count;
726 2         9 for (my $i = 0; $i<@$index-1; $i++) {
727 9         27 push @count, $self->_count( $index->[$i], $index->[$i+1] );
728             };
729 2         5 shift @$index; # remove -inf
730              
731 2         9 my %hash;
732 2         16 @hash{@$index} = @count;
733 2         7 $self->{cache}{frequency_distribution_ref} = \%hash;
734 2         9 return \%hash;
735             };
736              
737             =head1 Specific methods
738              
739             The folowing methods only apply to this module, or are experimental.
740              
741             =cut
742              
743             =head2 bucket_width
744              
745             Get bin width (relative to center of bin). Percentiles are off
746             by no more than half of this. DEPRECATED.
747              
748             =cut
749              
750             sub bucket_width {
751 5     5 1 10 my $self = shift;
752 5         32 return $self->{base} - 1;
753             };
754              
755             =head2 log_base
756              
757             Get upper/lower bound ratio for logarithmic bins.
758             This represents relative precision of sample.
759              
760             =head2 linear_width
761              
762             Get width of linear buckets.
763             This represents absolute precision of sample.
764              
765             =head2 linear_threshold
766              
767             Get absolute value threshold below which interpolation is switched to linear.
768              
769             =cut
770              
771             sub log_base {
772 3     3 1 11 my $self = shift;
773 3         34 return $self->{base};
774             };
775              
776             sub linear_width {
777 3     3 1 6 my $self = shift;
778 3         26 return $self->{linear_width};
779             };
780              
781             sub linear_threshold {
782 8     8 1 33 my $self = shift;
783 8         41 return $self->{linear_thresh};
784             };
785              
786             =head2 add_data_hash ( { value => weight, ... } )
787              
788             Add values with counts/weights.
789             This can be used to import data from other
790             Statistics::Descriptive::LogScale object.
791              
792             Returns self, so that methods can be chained.
793              
794             Negative counts are allowed and treated as "forgetting" data.
795             If a bin count goes below zero, such bin is simply discarded.
796             Minus infinity weight is allowed and has the same effect.
797             Data is guaranteed to remain consistent.
798              
799             If incorrect data is given (i.e. non-numeric, undef, or +infinity),
800             an exception is thrown and nothing changes.
801              
802             B Cache may be reset, even if no data was actually inserted.
803              
804             B It is possible to add infinite values to data pool.
805             The module will try and calculate whatever can still be calculated.
806             However, no portable way of serializing such values is done yet.
807              
808             =cut
809              
810             sub add_data_hash {
811 19     19 1 62 my $self = shift;
812 19         41 my $hash = shift;
813              
814             # check incoming data for being numeric, and no +inf values
815 19         34 eval {
816 23     23   51696 use warnings FATAL => qw(numeric);
  23         64  
  23         46706  
817 19         114 while (my ($k, $v) = each %$hash) {
818 289 100 33     1546 $k == 0+$k and $v == 0+$v and $v < $INF
      66        
819             or die "Infinite count for $k\n";
820             }
821             };
822 19 100       278 croak __PACKAGE__.": add_data_hash failed: $@"
823             if $@;
824              
825 17         40 delete $self->{cache};
826              
827             # update our counters
828 17         78 foreach (keys %$hash) {
829 286 50       517 next unless $hash->{$_};
830 286         522 my $key = $self->_round($_);
831              
832             # Insert data. Make sure -Inf doesn't corrupt our counter.
833 286   100     1016 my $newcount = ($self->{data}{$key} || 0) + $hash->{$_};
834 286 100       513 if ($newcount > 0) {
835             # normal insert
836 274         663 $self->{data}{$key} = $newcount;
837 274         479 $self->{count} += $hash->{$_};
838             } else {
839             # We're "forgetting" data, AND the bin got empty
840 12   100     44 $self->{count} -= delete $self->{data}{$key} || 0;
841             };
842             };
843 17         55 $self;
844             };
845              
846             =head2 get_data_hash( %options )
847              
848             Return distribution hashref {value => number of occurances}.
849              
850             This is inverse of add_data_hash.
851              
852             Options may include:
853              
854             =over
855              
856             =item * min - ignore values below this. (See find_boundaries)
857              
858             =item * max - ignore values above this. (See find_boundaries)
859              
860             =item * ltrim - ignore this % of values on lower end. (See find_boundaries)
861              
862             =item * rtrim - ignore this % of values on upper end. (See find_boundaries)
863              
864             =item * noise_thresh - strip bins with count below this.
865              
866             =back
867              
868             =cut
869              
870             sub get_data_hash {
871 63     63 1 12106 my ($self, %opt) = @_;
872              
873             # shallow copy of data if no options given
874 63 100       197 return {%{ $self->{data} }} unless %opt;
  61         739  
875              
876 2         8 my ($min, $max) = $self->find_boundaries( %opt );
877 2   100     10 my $noize = $opt{noize_thresh} || 0;
878              
879 2         5 my $data = $self->{data};
880 2         4 my %hash;
881 2         8 foreach (keys %$data ) {
882 13 100       38 $_ < $min and next;
883 8 50       24 $_ > $max and next;
884 8 100       18 $data->{$_} < $noize and next;
885 6         10 $hash{$_} = $data->{$_};
886             };
887              
888 2         13 return \%hash;
889             };
890              
891             =head2 TO_JSON()
892              
893             Return enough data to recreate the whole object as an unblessed hashref.
894              
895             This routine conforms with C, hence the name.
896             Can be called as
897              
898             my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this );
899              
900             B This module DOES NOT require JSON::XS or serialize to JSON.
901             It just deals with data.
902             Use C, C, C or any serializer of choice.
903              
904             my $raw_data = $stat->TO_JSON;
905             Statistics::Descriptive::LogScale->new( %$raw_data );
906              
907             Would generate an exact copy of C<$stat> object
908             (provided it's S::D::L and not a subclass).
909              
910             =head2 clone( [ %options ] )
911              
912             Copy constructor - returns copy of an existing object.
913             Cache is not preserved.
914              
915             Constructor options may be given to override existing data. See new().
916              
917             Trim options may be given to get partial data. See get_data_hash().
918              
919             =cut
920              
921             sub clone {
922 7     7 1 1099 my ($self, %opt) = @_;
923              
924 7         19 my $raw = $self->TO_JSON;
925 7 100       22 if (%opt) {
926             $raw->{data} = $self->get_data_hash( %opt )
927 4 100       10 unless exists $opt{data};
928             exists $opt{$_} and $raw->{$_} = $opt{$_}
929 4   66     34 for @new_keys;
930             };
931              
932 7         39 return (ref $self)->new( %$raw );
933             };
934              
935             sub TO_JSON {
936 31     31 1 219 my $self = shift;
937             # UGLY HACK Increase linear_thresh by a factor of base ** 1/10
938             # so that it's rounded down to present value
939             return {
940             CLASS => ref $self,
941             VERSION => $VERSION,
942             base => $self->{base},
943             linear_width => $self->{linear_width},
944 31         148 linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10,
945             data => $self->get_data_hash,
946             };
947             };
948              
949             =head2 scale_sample( $scale )
950              
951             Multiply all bins' counts by given value. This can be used to adjust
952             significance of previous data before adding new data (e.g. gradually
953             "forgetting" past data in a long-running application).
954              
955             =cut
956              
957             sub scale_sample {
958 1     1 1 7 my $self = shift;
959 1         3 my $factor = shift;
960 1 50       5 $factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive");
961              
962 1         3 delete $self->{cache};
963 1         3 foreach (@{ $self->_sort }) {
  1         4  
964 100         141 $self->{data}{$_} *= $factor;
965             };
966 1         2 $self->{count} *= $factor;
967 1         3 return $self;
968             };
969              
970             =head2 mean_of( $code, [$min, $max] )
971              
972             Return expectation of $code over sample within given range.
973              
974             $code is expected to be a pure function (i.e. depending only on its input
975             value, and having no side effects).
976              
977             The underlying integration mechanism only calculates $code once per bin,
978             so $code should be stable as in not vary wildly over small intervals.
979              
980             =cut
981              
982             sub mean_of {
983 24     24 1 80 my $self = shift;
984 24         67 my ($code, $min, $max) = @_;
985              
986 24     429   119 my $weight = $self->sum_of( sub {1}, $min, $max );
  429         906  
987 24 100       112 return unless $weight;
988 23         70 return $self->sum_of($code, $min, $max) / $weight;
989             };
990              
991             =head1 Experimental methods
992              
993             These methods may be subject to change in the future, or stay, if they
994             are good.
995              
996             =head2 sum_of ( $code, [ $min, $max ] )
997              
998             Integrate arbitrary function over the sample within the [ $min, $max ] interval.
999             Default values for both limits are infinities of appropriate sign.
1000              
1001             Values in the edge bins are cut using interpolation if needed.
1002              
1003             NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points
1004             between $a and $b.
1005              
1006             EXPERIMENTAL. The method name may change in the future.
1007              
1008             =cut
1009              
1010             sub sum_of {
1011 147     147 1 945 my $self = shift;
1012 147         361 my ($code, $realmin, $realmax) = @_;
1013              
1014             # Just app up stuff
1015 147 100 100     614 if (!defined $realmin and !defined $realmax) {
1016 124         223 my $sum = 0;
1017 124         186 while (my ($val, $count) = each %{ $self->{data} }) {
  63236         162918  
1018 63112         100733 $sum += $count * $code->( $val );
1019             };
1020 124         679 return $sum;
1021             };
1022              
1023 23 100       69 $realmin = -$INF unless defined $realmin;
1024 23 100       61 $realmax = $INF unless defined $realmax;
1025 23 50       70 return 0 if( $realmin >= $realmax );
1026              
1027             # correct limits. $min, $max are indices; $left, $right are limits
1028 23         71 my $min = $self->_round($realmin);
1029 23         64 my $max = $self->_round($realmax);
1030 23         68 my $left = $self->_lower($realmin);
1031 23         72 my $right = $self->_upper($realmax);
1032              
1033             # find first bin that's above $left
1034 23         66 my $keys = $self->_sort;
1035 23         79 my $i = _bin_search_ge($keys, $left);
1036              
1037             # warn "sum_of [$min, $max]";
1038             # add up bins
1039 23         53 my $sum = 0;
1040 23         72 for (; $i < @$keys; $i++) {
1041 284         1000 my $val = $keys->[$i];
1042 284 100       684 last if $val > $right;
1043 264         653 $sum += $self->{data}{$val} * $code->( $val );
1044             };
1045              
1046             # cut edges: the hard part
1047             # min and max are now used as indices
1048             # if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-)
1049             # warn "Add up, sum_of = $sum";
1050 23 100       153 if ($self->{data}{$max}) {
1051 20         60 my $width = $self->_upper($max) - $self->_lower($max);
1052 20 100       65 my $part = $width
1053             ? ($self->_upper($max) - $realmax) / $width
1054             : 0.5;
1055 20         106 $sum -= $self->{data}{$max} * $code->($max) * $part;
1056             };
1057             # warn "Cut R, sum_of = $sum";
1058 23 100       131 if ($self->{data}{$min}) {
1059 20         56 my $width = $self->_upper($min) - $self->_lower($min);
1060 20 100       66 my $part = $width
1061             ? ($realmin - $self->_lower($min)) / $width
1062             : 0.5;
1063 20         81 $sum -= $self->{data}{$min} * $code->($min) * $part;
1064             };
1065             # warn "Cut L, sum_of = $sum";
1066              
1067 23         174 return $sum;
1068             }; # end sum_of
1069              
1070             =head2 histogram ( %options )
1071              
1072             Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ]
1073             where countX_Y is number of data points between X and Y.
1074              
1075             Options may include:
1076              
1077             =over
1078              
1079             =item * count (+) - number of intervals to divide sample into.
1080              
1081             =item * index (+) - interval borders as array. Will be sorted before processing.
1082              
1083             =item * min - ignore values below this. Default = $self->min - epsilon.
1084              
1085             =item * max - ignore values above this. Default = $self->max + epsilon.
1086              
1087             =item * ltrim - ignore this % of values on lower end.
1088              
1089             =item * rtrim - ignore this % of values on upper end.
1090              
1091             =item * normalize_to - adjust counts so that max number becomes nnn.
1092             This may be useful if one intends to draw pictures.
1093              
1094             =back
1095              
1096             Either count or index must be present.
1097              
1098             NOTE: this is equivalent to frequency_distribution_ref but better suited
1099             for omitting sample tails and outputting pretty pictures.
1100              
1101             =cut
1102              
1103             sub histogram {
1104 5     5 1 3170 my ($self, %opt) = @_;
1105              
1106 5 100       21 return unless $self->count;
1107 4         21 my ($min, $max) = $self->find_boundaries( %opt );
1108             # build/check index
1109 4 100       10 my @index = @{ $opt{index} || [] };
  4         25  
1110 4 100       13 if (!@index) {
1111 3         7 my $n = $opt{count};
1112 3 50       9 $n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )");
1113 3         10 my $step = ($max - $min) / $n;
1114 3         10 for (my $x = $min; $x <= $max; $x += $step) {
1115 16         45 push @index, $x;
1116             };
1117             } else {
1118             # sort & uniq raw index
1119 1         4 my %known;
1120 1         5 @index = grep { !$known{$_}++ } @index;
  5         24  
1121 1         7 @index = sort { $a <=> $b } @index;
  8         19  
1122 1 50       7 @index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)");
1123             };
1124              
1125             # finally: estimated counts between indices!
1126 4         9 my @ret;
1127 4         15 for ( my $i = 0; $i<@index-1; $i++) {
1128 17         51 my $count = $self->_count( $index[$i], $index[$i+1] );
1129 17         84 push @ret, [ $count, $index[$i], $index[$i+1] ];
1130             };
1131              
1132             # if normalize - find maximum & divide by it
1133 4 100       13 if (my $norm = $opt{normalize_to}) {
1134 1         3 my $max = 0;
1135             $max < $_->[0] and $max = $_->[0]
1136 1   66     8 for @ret;
1137 1         3 $norm /= $max;
1138 1         4 $_->[0] *= $norm for @ret;
1139             };
1140              
1141 4         25 return \@ret;
1142             };
1143              
1144             =head2 find_boundaries( %opt )
1145              
1146             Return ($min, $max) of part of sample denoted by options.
1147              
1148             Options may include:
1149              
1150             =over
1151              
1152             =item * min - ignore values below this. default = min() - epsilon.
1153              
1154             =item * max - ignore values above this. default = max() + epsilon.
1155              
1156             =item * ltrim - ignore this % of values on lower end.
1157              
1158             =item * rtrim - ignore this % of values on upper end.
1159              
1160             =back
1161              
1162             If no options are given, the whole sample is guaranteed to reside between
1163             returned values.
1164              
1165             =cut
1166              
1167             sub find_boundaries {
1168 9     9 1 1492 my $self = shift;
1169 9         28 my %opt = @_;
1170              
1171 9 100       26 return unless $self->count;
1172              
1173             # preprocess boundaries
1174 8 100       35 my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min );
1175 8 100       35 my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max );
1176              
1177 8 100       27 if ($opt{ltrim}) {
1178 1         5 my $newmin = $self->percentile( $opt{ltrim} );
1179 1 50 33     7 defined $newmin and $newmin > $min and $min = $newmin;
1180             };
1181 8 100       25 if ($opt{utrim}) {
1182 1         3 my $newmax = $self->percentile( 100-$opt{utrim} );
1183 1 50 33     5 defined $newmax and $newmax < $max and $max = $newmax;
1184             };
1185              
1186 8         33 return ($min, $max);
1187             };
1188              
1189             =head2 format( "printf-like expression", ... )
1190              
1191             Returns a summary as requested by format string.
1192             Just as with printf and sprintf, a placeholder starts with a C<%>,
1193             followed by formatting options and a
1194              
1195             The following placeholders are supported:
1196              
1197             =over
1198              
1199             =item * % - a literal %
1200              
1201             =item * s, f, g - a normal printf acting on an extra argument.
1202             The number of extra arguments MUST match the number of such placeholders,
1203             or this function dies.
1204              
1205             =item * n - count;
1206              
1207             =item * m - min;
1208              
1209             =item * M - max,
1210              
1211             =item * a - mean,
1212              
1213             =item * d - standard deviation,
1214              
1215             =item * S - skewness,
1216              
1217             =item * K - kurtosis,
1218              
1219             =item * q(x) - x-th quantile (requires argument),
1220              
1221             =item * p(x) - x-th percentile (requires argument),
1222              
1223             =item * P(x) - cdf - the inferred cumulative distribution function (x)
1224             (requires argument),
1225              
1226             =item * e(n) - central_moment - central moment of n-th power
1227             (requires argument),
1228              
1229             =item * E(n) - std_moment - standard moment of n-th power (requires argument),
1230              
1231             =back
1232              
1233             For example,
1234              
1235             $stat->format( "99%% results lie between %p(0.5) and %p(99.5)" );
1236              
1237             Or
1238              
1239             for( my $i = 0; $i < @stats; $i++ ) {
1240             print $stats[$i]->format( "%s-th average value is %a +- %d", $i );
1241             };
1242              
1243             =cut
1244              
1245             my %format = (
1246             # percent literal
1247             '%' => '%',
1248             # placeholders without parameters
1249             n => 'count',
1250             m => 'min',
1251             M => 'max',
1252             a => 'mean',
1253             d => 'std_dev',
1254             S => 'skewness',
1255             K => 'kurtosis',
1256             # placeholders with 1 parameter
1257             q => 'quantile?',
1258             p => 'percentile?',
1259             P => 'cdf?',
1260             e => 'central_moment?',
1261             E => 'std_moment?',
1262             );
1263              
1264             my %printf = (
1265             s => 1,
1266             f => 1,
1267             g => 1,
1268             );
1269              
1270             my $re_format = join "|", keys %format, keys %printf;
1271             $re_format = qr((?:$re_format));
1272              
1273             sub format {
1274 5     5 1 20 my ($self, $format, @extra) = @_;
1275              
1276              
1277             # FIXME this accepts %m(5), then dies - UGLY
1278             # TODO rewrite this as a giant sprintf... one day...
1279 5         115 $format =~ s <%([0-9.\-+ #]*)($re_format)(?:\(($re_num)?\)){0,1}>
  7         24  
1280             < _format_dispatch($self, $2, $1, $3, \@extra) >ge;
1281 5 50       14  
1282             croak __PACKAGE__.": Extra arguments in format()"
1283 5         31 if @extra;
1284             return $format;
1285             };
1286              
1287 7     7   32 sub _format_dispatch {
1288             my ($obj, $method, $float, $arg, $extra) = @_;
1289              
1290 7 100       29 # Handle % escapes
1291 1         4 if ($method !~ /^[a-zA-Z]/) {
1292             return $method;
1293             };
1294 6 100       49 # Handle printf built-in formats
1295 1 50       6 if (!$format{$method}) {
1296             croak __PACKAGE__.": Not enough arguments in format()"
1297 1         30 unless @$extra;
1298             return sprintf "%${float}${method}", shift @$extra;
1299             };
1300              
1301 5         10 # Now we know it's LogScale's own method
1302 5 100       20 $method = $format{$method};
1303 2 50       7 if ($method =~ s/\?$//) {
1304             die "Missing argument in method $method" if !defined $arg;
1305 3 50       10 } else {
1306             die "Extra argument in method $method" if defined $arg;
1307 5         21 };
1308             my $result = $obj->$method($arg);
1309              
1310 5 100 100     17 # work around S::D::Full's convention that "-inf == undef"
1311             $result = -9**9**9
1312 5         44 if ($method eq 'percentile' and !defined $result);
1313             return sprintf "%${float}f", $result;
1314             };
1315              
1316             ################################################################
1317             # No more public methods please
1318              
1319             # MEMOIZE
1320             # We'll keep methods' returned values under {cache}.
1321             # All setters destroy said cache altogether.
1322             # PLEASE replace this with a ready-made module if there's one.
1323              
1324             # Sorry for this black magic, but it's too hard to write //= in EVERY method
1325             # Long story short
1326             # The next sub replaces $self->foo() with
1327             # sub { $self->{cache}{foo} //= $self->originnal_foo }
1328             # All setter methods are EXPECTED to destroy {cache} altogether.
1329              
1330             # NOTE if you plan subclassing the method, re-memoize methods you change.
1331 253     253   567 sub _memoize_method {
1332             my ($class, $name, $arg) = @_;
1333 253         960  
1334 253 50       704 my $orig_code = $class->can($name);
1335             die "Error in memoizer section ($name)"
1336             unless ref $orig_code eq 'CODE';
1337              
1338             # begin long conditional
1339             my $cached_code = !$arg
1340 208 100   208   7283 ? sub {
1341 103         284 if (!exists $_[0]->{cache}{$name}) {
1342             $_[0]->{cache}{$name} = $orig_code->($_[0]);
1343 208         816 };
1344             return $_[0]->{cache}{$name};
1345             }
1346 171     171   2088 : sub {
1347 171         281 my $self = shift;
1348 171 100       412 my $arg = shift;
1349             $arg = '' unless defined $arg;
1350 171 100       526  
1351 102         253 if (!exists $self->{cache}{"$name:$arg"}) {
1352             $self->{cache}{"$name:$arg"} = $orig_code->($self, $arg);
1353 170         773 };
1354 253 100       1041 return $self->{cache}{"$name:$arg"};
1355             };
1356             # conditional ends here
1357 23     23   222  
  23         59  
  23         847  
1358 23     23   145 no strict 'refs'; ## no critic
  23         56  
  23         2697  
1359 253         450 no warnings 'redefine'; ## no critic
  253         1028  
1360             *{$class."::".$name} = $cached_code;
1361             }; # end of _memoize_method
1362              
1363             # Memoize all the methods w/o arguments
1364             foreach ( qw(sum sumsq mean min max mode) ) {
1365             __PACKAGE__->_memoize_method($_);
1366             };
1367              
1368             # Memoize methods with 1 argument
1369             foreach (qw(quantile central_moment std_moment standard_deviation variance)) {
1370             __PACKAGE__->_memoize_method($_, 1);
1371             };
1372              
1373             # add shorter alias of standard_deviation (this must happen AFTER memoization)
1374 23     23   170 {
  23         62  
  23         8005  
1375             no warnings 'once'; ## no critic
1376             *std_dev = \&standard_deviation;
1377             };
1378              
1379             # Get number of values below $x
1380             # Like sum_of(sub{1}, undef, $x), but faster.
1381             # Used by cdf()
1382 105     105   192 sub _count {
1383 105 100       277 my $self = shift;
1384 74         120 @_>1 and return $self->_count($_[1]) - $self->_count($_[0]);
1385             my $x = shift;
1386 74         175  
1387 74         171 my $upper = $self->_upper($x);
1388 74 100       189 my $i = _bin_search_gt( $self->_sort, $upper );
1389 65         144 !$i-- and return 0;
1390             my $count = $self->_probability->[$i];
1391              
1392 65         140 # interpolate
1393 65 100       325 my $bin = $self->_round( $x );
1394 34         65 if (my $val = $self->{data}{$bin}) {
1395 34 100       138 my $width = ($upper - $bin) * 2;
1396 34         74 my $part = $width ? ( ($upper - $x) / $width) : 1/2;
1397             $count -= $part * $val;
1398 65         208 };
1399             return $count;
1400             };
1401              
1402             # BINARY SEARCH
1403             # Not a method, just a function
1404             # Takes sorted \@array and a $num
1405             # Return lowest $i such that $array[$i] >= $num
1406             # Return (scalar @array) if no such $i exists
1407 147     147   1102 sub _bin_search_ge {
1408             my ($array, $x) = @_;
1409 147 100 100     692  
1410 126         248 return 0 unless @$array and $array->[0] < $x;
1411 126         199 my $l = 0;
1412 126         315 my $r = @$array;
1413 405         805 while ($l+1 < $r) {
1414 405 100       1180 my $m = int( ($l + $r) /2);
1415             $array->[$m] < $x ? $l = $m : $r = $m;
1416 126         276 };
1417             return $l+1;
1418             };
1419 74     74   162 sub _bin_search_gt {
1420 74         148 my ($array, $x) = @_;
1421 74 100 100     273 my $i = _bin_search_ge(@_);
1422 74         126 $i++ if defined $array->[$i] and $array->[$i] == $x;
1423             return $i;
1424             };
1425              
1426             # THE CORE
1427             # Here come the number=>bin functions
1428             # round() generates bin center
1429             # upper() and lower() are respective boundaries.
1430             # Here's the algorithm:
1431             # 1) determine whether bin is linear or logarithmic
1432             # 2) for linear bins, return bin# * bucket_width (==2*absolute error)
1433             # add/subtract 0.5 to get edges.
1434             # 3) for logarithmic bins, return base ** bin# with appropriate sign
1435             # multiply by precalculated constant to get edges
1436             # note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2
1437             # This part is fragile and can be written better
1438              
1439             # center of bin containing x
1440 90893     90893   127270 sub _round {
1441 90893         123324 my $self = shift;
1442             my $x = shift;
1443 23     23   197  
  23         58  
  23         3313  
1444             use warnings FATAL => qw(numeric uninitialized);
1445 90893 100       182093  
1446             if (abs($x) <= $self->{linear_thresh}) {
1447 1761   100     5531 return $self->{linear_width}
1448             && $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 );
1449 89129         201463 };
1450 89129         163434 my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase});
1451 89129 100       344638 my $value = $self->{base} ** $i;
1452             return $x < 0 ? -$value : $value;
1453             };
1454              
1455             # lower, upper limits of bin containing x
1456 883     883   55043 sub _lower {
1457 883         1363 my $self = shift;
1458             my $x = shift;
1459 23     23   170  
  23         61  
  23         7518  
1460             use warnings FATAL => qw(numeric uninitialized);
1461 883 100       2795  
1462             if (abs($x) <= $self->{linear_thresh}) {
1463 184   66     1449 return $self->{linear_width}
1464             && $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5);
1465 699         2433 };
1466 699 100       1602 my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase});
1467 310         1325 if ($x > 0) {
1468             return $self->{floor} * $self->{base}**($i);
1469 389         1727 } else {
1470             return -$self->{floor} * $self->{base}**($i+1);
1471             };
1472             };
1473              
1474 462     462   6588 sub _upper {
1475             return -$_[0]->_lower(-$_[1]);
1476             };
1477              
1478             # build bin index
1479 191     191   324 sub _sort {
1480             my $self = shift;
1481 191   100     703 return $self->{cache}{sorted}
  1247         2033  
  32         298  
1482             ||= [ sort { $a <=> $b } keys %{ $self->{data} } ];
1483             };
1484              
1485             # build cumulative bin counts index
1486 107     107   166 sub _probability {
1487 107   66     313 my $self = shift;
1488 20         36 return $self->{cache}{probability} ||= do {
1489 20         33 my @array;
1490 20         48 my $sum = 0;
  20         70  
1491 165         247 foreach (@{ $self->_sort }) {
1492 165         339 $sum += $self->{data}{$_};
1493             push @array, $sum;
1494 20         123 };
1495             \@array;
1496             };
1497             };
1498              
1499             =head1 AUTHOR
1500              
1501             Konstantin S. Uvarin, C<< >>
1502              
1503             =head1 BUGS
1504              
1505             The module is currently under development. There may be bugs.
1506              
1507             C only works for discrete distributions, and simply returns
1508             the first bin with largest bin count.
1509             A better algorithm is wanted.
1510              
1511             C should have been made a private method.
1512             Its signature and/or name may change in the future.
1513              
1514             See the TODO file in the distribution package.
1515              
1516             Please feel free to post bugs and/or feature requests to github:
1517             L
1518              
1519             Alternatively, you can use CPAN RT
1520             via e-mail C,
1521             or through the web interface at
1522             L.
1523              
1524             Your contribution is appreciated.
1525              
1526             =head1 SUPPORT
1527              
1528             You can find documentation for this module with the perldoc command.
1529              
1530             perldoc Statistics::Descriptive::LogScale
1531              
1532             You can also look for information at:
1533              
1534             =over 4
1535              
1536             =item * GitHub:
1537              
1538             L
1539              
1540             =item * RT: CPAN's request tracker (report bugs here)
1541              
1542             L
1543              
1544             =item * AnnoCPAN: Annotated CPAN documentation
1545              
1546             L
1547              
1548             =item * CPAN Ratings
1549              
1550             L
1551              
1552             =item * Search CPAN
1553              
1554             L
1555              
1556             =back
1557              
1558             =head1 ACKNOWLEDGEMENTS
1559              
1560             This module was inspired by a talk that Andrew Aksyonoff, author of
1561             L,
1562             has given at HighLoad++ conference in Moscow, 2012.
1563              
1564             L was and is used as reference when in doubt.
1565             Several code snippets were shamelessly stolen from there.
1566              
1567             C and C parameter names were suggested by
1568             CountZero from http://perlmonks.org
1569              
1570             =head1 LICENSE AND COPYRIGHT
1571              
1572             Copyright 2013-2015 Konstantin S. Uvarin.
1573              
1574             This program is free software; you can redistribute it and/or modify it
1575             under the terms of either: the GNU General Public License as published
1576             by the Free Software Foundation; or the Artistic License.
1577              
1578             See http://dev.perl.org/licenses/ for more information.
1579              
1580             =cut
1581              
1582             1; # End of Statistics::Descriptive::LogScale