File Coverage

blib/lib/Statistics/Descriptive/LogScale.pm

Criterion	Covered	Total	%
statement	345	346	99.7
branch	127	148	85.8
condition	48	65	73.8
subroutine	54	54	100.0
pod	27	27	100.0
total	601	640	93.9

line	stmt	bran	cond	sub	pod	time	code
1	22			22		533900	use 5.006;
	22					85
2	22			22		121	use strict;
	22					39
	22					496
3	22			22		106	use warnings;
	22					45
	22					1893
4
5							package Statistics::Descriptive::LogScale;
6
7							=head1 NAME
8
9							Statistics::Descriptive::LogScale - Memory-efficient approximate univariate
10							descriptive statistics class.
11
12							=head1 VERSION
13
14							Version 0.09
15
16							=cut
17
18							our $VERSION = 0.09;
19
20							=head1 SYNOPSIS
21
22							=head2 Basic usage
23
24							The basic usage is roughly the same as that of L.
25
26							use Statistics::Descriptive::LogScale;
27							my $stat = Statistics::Descriptive::LogScale->new ();
28
29							while(<>) {
30							chomp;
31							$stat->add_data($_);
32							};
33
34							# This can also be done in O(1) memory, precisely
35							printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation;
36							# This requires storing actual data, or approximating
37							printf "25%% : %f\n", $stat->percentile(25);
38							printf "Median: %f\n", $stat->median;
39							printf "75%% : %f\n", $stat->percentile(75);
40
41							=head2 Save/load
42
43							This is not present in L.
44							The save/load interface is designed compatible with JSON::XS.
45							However, any other serializer can be used.
46							The C method is I to return unblessed hashref
47							with enough information to restore the original object.
48
49							use Statistics::Descriptive::LogScale;
50							my $stat = Statistics::Descriptive::LogScale->new ();
51
52							# ..... much later
53							# Save
54							print $fd encoder_of_choice( $stat->TO_JSON )
55							or die "Failed to save: $!";
56
57							# ..... and even later
58							# Load
59							my $plain_hash = decoder_of_choice( $raw_data );
60							my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash );
61
62							# Import into existing LogScale instance
63							my $plain_hash = decoder_of_choice( $more_raw_data );
64							$copy_of_stat->add_data_hash( $plain_hash->{data} );
65
66							=head2 Histograms
67
68							Both L and L
69							offer C method for querying data point counts.
70							However, there's also C method for making pretty pictures.
71							Here's a simple text-based histogram.
72							A proper GD example was too long to fit into this margin.
73
74							use strict;
75							use warnings;
76
77							use Statistics::Descriptive::LogScale;
78							my $stat = Statistics::Descriptive::LogScale->new ();
79
80							# collect/load data ...
81							my $re_float = qr([-+]?(?:\d+\.?\d*\|\.\d+)(?:[Ee][-+]?\d+)?);
82							while (<>) {
83							$stat->add_data($_) for m/($re_float)/g;
84							};
85							die "Empty set"
86							unless $stat->count;
87
88							# get data in [ count, lower_bound, upper_bound ] format as arrayref
89							my $hist = $stat->histogram( count => 20 );
90
91							# find maximum value to use as a scale factor
92							my $scale = $hist->[0][0];
93							$scale < $_->[0] and $scale = $_->[0] for @$hist;
94
95							foreach (@$hist) {
96							printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale);
97							};
98							printf "%10f\n", $hist->[-1][2];
99
100							=head1 DESCRIPTION
101
102							This module aims at providing some advanced statistical functions without
103							storing all data in memory, at the cost of certain (predictable) precision loss.
104
105							Data is represented by a set of bins that only store counts of fitting
106							data points.
107							Most bins are logarithmic, i.e. lower end / upper end ratio is constant.
108							However, around zero linear approximation may be user instead
109							(see "linear_width" and "linear_thresh" parameters in new()).
110
111							All operations are then performed on the bins, introducing relative error
112							which does not, however, exceed the bins' relative width ("base").
113
114							=head1 METHODS
115
116							=cut
117
118							########################################################################
119							# == HOW IT WORKS ==
120							# Buckets are stored in a hash: { $value => $count, ... }
121							# {base} is bin width, {logbase} == log {base} (cache)
122							# {linear_thresh} is where we switch to equal bin approximation
123							# {linear_width} is width of bin around zero (==linear_thresh if not given)
124							# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor}
125							# Nearly all meaningful subs have to scan all the bins, which is bad,
126							# but anyway better than scanning full sample.
127
128	22			22		120	use Carp;
	22					40
	22					1794
129	22			22		17822	use POSIX qw(floor ceil);
	22					175957
	22					122
130
131							# Fields are NOT used internally for now, so this is just a declaration
132	22					128	use fields qw(
133							data count
134							linear_width base logbase floor linear_thresh logfloor
135							cache
136	22			22		3422996	);
	22					35081
137
138							# This is for infinite portability^W^W portable infinity
139							my $INF = 999;
140
141							=head2 new( %options )
142
143							%options may include:
144
145							=over
146
147							=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives
148							1% precision and exact decimal powers.
149							This value represents acceptable relative error in analysis results.
150
151							B Actual value may be slightly less than requested one.
152							This is done so to avoid troubles with future rounding in (de)serialization.
153
154							=item * linear_width - width of linear bins around zero.
155							This value represents precision of incoming data.
156							Default is zero, i.e. we assume that the measurement is precise.
157
158							B Actual value may be less (by no more than a factor of C)
159							so that borders of linear and logarithmic bins fit nicely.
160
161							=item * linear_thresh - where to switch to linear approximation.
162							If only one of C and C is given,
163							the other will be calculated.
164							However, user may want to specify both in some cases.
165
166							B Actual value may be less (by no more than a factor of C)
167							so that borders of linear and logarithmic bins fit nicely.
168
169							=item * data - hashref with C<{ value => weight }> for initializing data.
170							Used for cloning.
171							See C.
172
173							=item * zero_thresh - absolute value threshold below which everything is
174							considered zero.
175							DEPRECATED, C and C override this if given.
176
177							=back
178
179							=cut
180
181							my @new_keys = qw( base linear_thresh linear_width zero_thresh data );
182							# TODO Throw if extra options given?
183							sub new {
184	41			41	1	8290	my $class = shift;
185	41					149	my %opt = @_;
186
187							# base for logarithmic bins, sane default: +-1%, exact decimal powers
188							# UGLY HACK number->string->number to avoid
189							# future serialization inconsistencies
190	41		100			194	$opt{base} \|\|= 10**(1/232);
191	41					403	$opt{base} = 0 + "$opt{base}";
192	41	50				221	$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1";
193
194							# calculate where to switch to linear approximation
195							# the condition is: linear bin( thresh ) ~~ log bin( thresh )
196							# i.e. thresh * base - thresh ~~ absolute error * 2
197							# i.e. thresh ~~ absolute_error * 2 / (base - 1)
198							# also support legacy API (zero_thresh)
199	41	100				143	if (defined $opt{linear_thresh} ) {
200	13		100			78	$opt{linear_width} \|\|= $opt{linear_thresh} * ($opt{base}-1);
201							} else {
202	28					424	$opt{linear_thresh} = $opt{zero_thresh};
203							};
204							$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1)
205	41	100	100			222	if $opt{linear_width} and !$opt{linear_thresh};
206	41		100			181	$opt{linear_thresh} \|\|= 0;
207	41	50				137	$opt{linear_thresh} >= 0
208							or croak __PACKAGE__.": new(): linear_thresh must be >= 0";
209
210							# Can't use fields::new anymore
211							# due to JSON::XS incompatibility with restricted hashes
212	41					130	my $self = bless {}, $class;
213
214	41					171	$self->{base} = $opt{base};
215							# cache values to ease calculations
216							# floor = (lower end of bin) / (center of bin)
217	41					125	$self->{floor} = 2/(1+$opt{base});
218	41					263	$self->{logbase} = log $opt{base};
219	41					132	$self->{logfloor} = log $self->{floor};
220
221							# bootstrap linear_thresh - make it fit bin edge
222	41					91	$self->{linear_width} = $self->{linear_thresh} = 0;
223	41					158	$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} );
224
225							# divide anything below linear_thresh into odd number of bins
226							# not exceeding requested linear_width
227	41	100				139	if ($self->{linear_thresh}) {
228	15		66			57	my $linear_width = $opt{linear_width} \|\| 2 * $self->{linear_thresh};
229	15					62	my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width));
230	15	100				53	$n_linear++ unless $n_linear % 2;
231	15					41	$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear);
232							};
233
234	41					142	$self->clear;
235	41	100				133	if ($opt{data}) {
236	10					31	$self->add_data_hash($opt{data});
237							};
238	41					181	return $self;
239							};
240
241							=head1 General statistical methods
242
243							These methods are used to query the distribution properties. They generally
244							follow the interface of L and co,
245							with minor additions.
246
247							All methods return C on empty data set, except for
248							C, C, C, C and C
249							which all return 0.
250
251							B This module caches whatever it calculates very agressively.
252							Don't hesitate to use statistical functions (except for sum_of/mean_of)
253							more than once. The cache is deleted upon data entry.
254
255							=head2 clear
256
257							Destroy all stored data.
258
259							=cut
260
261							sub clear {
262	55			55	1	10259	my $self = shift;
263	55					120	$self->{data} = {};
264	55					3445	$self->{count} = 0;
265	55					118	delete $self->{cache};
266	55					114	return $self;
267							};
268
269							=head2 add_data( @data )
270
271							Add numbers to the data pool.
272
273							Returns self, so that methods can be chained.
274
275							If incorrect data is given (i.e. non-numeric, undef),
276							an exception is thrown and only partial data gets inserted.
277							The state of object is guaranteed to remain consistent in such case.
278
279							B Cache is reset, even if no data was actually inserted.
280
281							B It is possible to add infinite values to data pool.
282							The module will try and calculate whatever can still be calculated.
283							However, no portable way of serializing such values is done yet.
284
285							=cut
286
287							sub add_data {
288	51212			51212	1	208194	my $self = shift;
289
290	51212					58987	delete $self->{cache};
291	51212					81457	foreach (@_) {
292	90136					174826	$self->{data}{ $self->_round($_) }++;
293	90133					164623	$self->{count}++;
294							};
295	51209					97184	$self;
296							};
297
298							=head2 count
299
300							Returns number of data points.
301
302							=cut
303
304							sub count {
305	69			69	1	10577	my $self = shift;
306	69					314	return $self->{count};
307							};
308
309							=head2 min
310
311							=head2 max
312
313							Values of minimal and maximal bins.
314
315							NOTE: Due to rounding, some of the actual inserted values may fall outside
316							of the min..max range. This may change in the future.
317
318							=cut
319
320							sub min {
321							my $self = shift;
322							return $self->_sort->[0];
323							};
324
325							sub max {
326							my $self = shift;
327							return $self->_sort->[-1];
328							};
329
330							=head2 sample_range
331
332							Return sample range of the dataset, i.e. max() - min().
333
334							=cut
335
336							sub sample_range {
337	5			5	1	597	my $self = shift;
338	5	100				14	return $self->count ? $self->max - $self->min : undef;
339							};
340
341							=head2 sum
342
343							Return sum of all data points.
344
345							=cut
346
347							sub sum {
348							my $self = shift;
349							return $self->sum_of(sub { $_[0] });
350							};
351
352							=head2 sumsq
353
354							Return sum of squares of all datapoints.
355
356							=cut
357
358							sub sumsq {
359							my $self = shift;
360							return $self->sum_of(sub { $_[0] * $_[0] });
361							};
362
363							=head2 mean
364
365							Return mean, or average value, i.e. sum()/count().
366
367							=cut
368
369							sub mean {
370							my $self = shift;
371							return $self->{count} ? $self->sum / $self->{count} : undef;
372							};
373
374							=head2 variance
375
376							=head2 variance( $correction )
377
378							Return data variance, i.e. E((x - E(x)) ** 2).
379
380							Bessel's correction (division by n-1 instead of n) is used by default.
381							This may be changed by specifying $correction explicitly.
382
383							B The binning strategy used here should also introduce variance bias.
384							This is not yet accounted for.
385
386							=cut
387
388							sub variance {
389							my $self = shift;
390							my $correction = shift;
391
392							# in fact we'll receive correction='' because of how cache works
393							$correction = 1 unless defined $correction and length $correction;
394
395							return 0 if ($self->{count} < 1 + $correction);
396
397							my $var = $self->sumsq - $self->sum**2 / $self->{count};
398							return $var <= 0 ? 0 : $var / ( $self->{count} - $correction );
399							};
400
401							=head2 standard_deviation
402
403							=head2 standard_deviation( $correction )
404
405							=head2 std_dev
406
407							Return standard deviation, i.e. square root of variance.
408
409							Bessel's correction (division by n-1 instead of n) is used by default.
410							This may be changed by specifying $correction explicitly.
411
412							B The binning strategy used here should also introduce variance bias.
413							This is not yet accounted for.
414
415							=cut
416
417							sub standard_deviation {
418							my $self = shift;
419
420							return sqrt($self->variance(@_));
421							};
422
423							=head2 cdf ($x)
424
425							Cumulative distribution function. Returns estimated probability of
426							random data point from the sample being less than C<$x>.
427
428							As a special case, C accounts for I of zeroth bin count (if any).
429
430							Not present in Statistics::Descriptive::Full, but appears in
431							L.
432
433							=head2 cdf ($x, $y)
434
435							Returns probability of a value being between C<$x> and C<$y> ($x <= $y).
436							This is essentially C.
437
438							=cut
439
440							sub cdf {
441	4			4	1	7	my $self = shift;
442	4	50				12	return unless $self->{count};
443	4					9	return $self->_count(@_) / $self->{count};
444							};
445
446							=head2 percentile( $n )
447
448							Find $n-th percentile, i.e. a value below which lies $n % of the data.
449
450							0-th percentile is by definition -inf and is returned as undef
451							(see Statistics::Descriptive).
452
453							$n is a real number, not necessarily integer.
454
455							=cut
456
457							sub percentile {
458	46			46	1	8715	my $self = shift;
459	46					77	my $x = shift;
460
461							# assert 0<=$x<=100
462	46	50	33			294	croak __PACKAGE__.": percentile() argument must be between 0 and 100"
463							unless 0<= $x and $x <= 100;
464
465	46					126	my $need = $x * $self->{count} / 100;
466	46	100				161	return if $need < 1;
467
468							# dichotomize
469							# $i is lowest value >= needed
470							# $need doesnt exceed last bin!
471	41					124	my $i = _bin_search_ge( $self->_probability, $need );
472	41					113	return $self->_sort->[ $i ];
473							};
474
475							=head2 quantile( 0..4 )
476
477							From Statistics::Descriptive manual:
478
479							0 => zero quartile (Q0) : minimal value
480							1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
481							2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
482							3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
483							4 => fourth quartile (Q4) : maximal value
484
485							=cut
486
487							sub quantile {
488							my $self = shift;
489							my $t = shift;
490
491							croak (__PACKAGE__.": quantile() argument must be one of 0..4")
492							unless $t =~ /^[0-4]$/;
493
494							$t or return $self->min;
495							return $self->percentile($t * 100 / 4);
496							};
497
498							=head2 median
499
500							Return median of data, a value that divides the sample in half.
501							Same as percentile(50).
502
503							=cut
504
505							sub median {
506	16			16	1	993	my $self = shift;
507	16					52	return $self->percentile(50);
508							};
509
510							=head2 trimmed_mean( $ltrim, [ $utrim ] )
511
512							Return mean of sample with $ltrim and $utrim fraction of data points
513							remover from lower and upper ends respectively.
514
515							ltrim defaults to 0, and rtrim to ltrim.
516
517							=cut
518
519							sub trimmed_mean {
520	1			1	1	3	my $self = shift;
521	1					2	my ($lower, $upper) = @_;
522	1		50			4	$lower \|\|= 0;
523	1	50				4	$upper = $lower unless defined $upper;
524
525	1					5	my $min = $self->percentile($lower * 100);
526	1					5	my $max = $self->percentile(100 - $upper * 100);
527
528	1	50				4	return unless $min < $max;
529
530	1			5		7	return $self->mean_of(sub{$_[0]}, $min, $max);
	5					13
531							};
532
533							=head2 harmonic_mean
534
535							Return harmonic mean of the data, i.e. 1/E(1/x).
536
537							Return undef if division by zero occurs (see Statistics::Descriptive).
538
539							=cut
540
541							sub harmonic_mean {
542	2			2	1	614	my $self = shift;
543
544	2					4	my $ret;
545	2					4	eval {
546	2			5		7	$ret = $self->count / $self->sum_of(sub { 1/$_[0] });
	5					14
547							};
548	2	50	66			23	if ($@ and $@ !~ /division.*zero/) {
549	0					0	die $@; # rethrow ALL BUT 1/0 which yields undef
550							};
551	2					10	return $ret;
552							};
553
554							=head2 geometric_mean
555
556							Return geometric mean of the data, that is, exp(E(log x)).
557
558							Dies unless all data points are of the same sign.
559
560							=cut
561
562							sub geometric_mean {
563	2			2	1	618	my $self = shift;
564
565	2	100				8	return unless $self->count;
566	1	50				5	croak __PACKAGE__.": geometric_mean() called on mixed sign sample"
567							if $self->min * $self->max < 0;
568
569	1	50				5	return 0 if $self->{data}{0};
570							# this must be dog slow, but we already log() too much at this point.
571	1			5		6	my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} );
	5					15
572	1	50				5	return $self->min < 0 ? -$ret : $ret;
573							};
574
575							=head2 skewness
576
577							Return skewness of the distribution, calculated as
578							n/(n-1)(n-2) * E((x-E(x))3)/std_dev3 (this is consistent with Excel).
579
580							=cut
581
582							sub skewness {
583	20			20	1	1307	my $self = shift;
584
585	20					40	my $n = $self->{count};
586	20	100				65	return unless $n > 2;
587
588							# code stolen from Statistics::Descriptive
589	18					51	my $skew = $n * $self->std_moment(3);
590	18					42	my $correction = $n / ( ($n-1) * ($n-2) );
591	18					89	return $correction * $skew;
592							};
593
594							=head2 kurtosis
595
596							Return kurtosis of the distribution, that is 4-th standardized moment - 3.
597							The exact formula used here is consistent with that of Excel and
598							Statistics::Descriptive.
599
600							=cut
601
602							sub kurtosis {
603	22			22	1	639	my $self = shift;
604
605	22					47	my $n = $self->{count};
606	22	100				70	return unless $n > 3;
607
608							# code stolen from Statistics::Descriptive
609	20					50	my $kurt = $n * $self->std_moment(4);
610	20					65	my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
611	20					48	my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
612
613	20					115	return $correction1 * $kurt - $correction2;
614							};
615
616							=head2 central_moment( $n )
617
618							Return $n-th central moment, that is, E((x - E(x))^$n).
619
620							Not present in Statistics::Descriptive::Full.
621
622							=cut
623
624							sub central_moment {
625							my $self = shift;
626							my $n = shift;
627
628							my $mean = $self->mean;
629							return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count};
630							};
631
632							=head2 std_moment( $n )
633
634							Return $n-th standardized moment, that is,
635							E((x - E(x))$n) / std_dev(x)$n.
636
637							Not present in Statistics::Descriptive::Full.
638
639							=cut
640
641							sub std_moment {
642							my $self = shift;
643							my $n = shift;
644
645							my $mean = $self->mean;
646							my $dev = $self->std_dev;
647							return $self->sum_of(sub{ ($_[0] - $mean) ** $n })
648							/ ( $dev*$n $self->{count} );
649							};
650
651							=head2 mode
652
653							Mode of a distribution is the most common value for a discrete distribution,
654							or maximum of probability density for continuous one.
655
656							For now we assume that the distribution IS discrete, and return the bin with
657							the biggest hit count.
658
659							NOTE A better algorithm is still wanted. Experimental.
660							Behavior may change in the future.
661
662							=cut
663
664							# Naive implementation
665							# Find bin w/greatest count and return it
666							sub mode {
667							my $self = shift;
668
669							return if !$self->count;
670
671							my $index = $self->_sort;
672							return $index->[0] if @$index == 1;
673
674							my @count = map { $self->{data}{$_} } @$index;
675
676							my $max_index;
677							my $max_growth = 0;
678							for (my $i = 0; $i<@count; $i++) {
679							$count[$i] > $max_growth or next;
680							$max_index = $i;
681							$max_growth = $count[$i];
682							};
683
684							return $index->[$max_index];
685							};
686
687							=head2 frequency_distribution_ref( \@index )
688
689							=head2 frequency_distribution_ref( $n )
690
691							=head2 frequency_distribution_ref
692
693							Return numbers of data point counts below each number in @index as hashref.
694
695							If a number is given instead of arrayref, @index is created
696							by dividing [min, max] into $n intervals.
697
698							If no parameters are given, return previous result, if any.
699
700							=cut
701
702							sub frequency_distribution_ref {
703	3			3	1	628	my $self = shift;
704	3					6	my $index = shift;
705
706	3	100				11	return unless $self->count;
707							# ah, compatibility - return last value
708							return $self->{cache}{frequency_distribution_ref}
709	2	50				6	unless defined $index;
710							# make index if number given
711	2	100				5	if (!ref $index) {
712	1	50				8	croak __PACKAGE__.": frequency_distribution_ref(): ".
713							"argument must be array, of number > 2, not $index"
714							unless $index > 2;
715	1					5	my $min = $self->_lower($self->min);
716	1					4	my $max = $self->_upper($self->max);
717	1					3	my $step = ($max - $min) / $index;
718	1					2	$index = [ map { $min + $_ * $step } 1..$index ];
	5					11
719							};
720
721	2					8	@$index = (-$INF, sort { $a <=> $b } @$index);
	12					25
722
723	2					3	my @count;
724	2					10	for (my $i = 0; $i<@$index-1; $i++) {
725	9					26	push @count, $self->_count( $index->[$i], $index->[$i+1] );
726							};
727	2					4	shift @$index; # remove -inf
728
729	2					3	my %hash;
730	2					22	@hash{@$index} = @count;
731	2					5	$self->{cache}{frequency_distribution_ref} = \%hash;
732	2					9	return \%hash;
733							};
734
735							=head1 Specific methods
736
737							The folowing methods only apply to this module, or are experimental.
738
739							=cut
740
741							=head2 bucket_width
742
743							Get bin width (relative to center of bin). Percentiles are off
744							by no more than half of this. DEPRECATED.
745
746							=cut
747
748							sub bucket_width {
749	5			5	1	9	my $self = shift;
750	5					29	return $self->{base} - 1;
751							};
752
753							=head2 log_base
754
755							Get upper/lower bound ratio for logarithmic bins.
756							This represents relative precision of sample.
757
758							=head2 linear_width
759
760							Get width of linear buckets.
761							This represents absolute precision of sample.
762
763							=head2 linear_threshold
764
765							Get absolute value threshold below which interpolation is switched to linear.
766
767							=cut
768
769							sub log_base {
770	3			3	1	7	my $self = shift;
771	3					14	return $self->{base};
772							};
773
774							sub linear_width {
775	3			3	1	5	my $self = shift;
776	3					21	return $self->{linear_width};
777							};
778
779							sub linear_threshold {
780	8			8	1	30	my $self = shift;
781	8					44	return $self->{linear_thresh};
782							};
783
784							=head2 add_data_hash ( { value => weight, ... } )
785
786							Add values with counts/weights.
787							This can be used to import data from other
788							Statistics::Descriptive::LogScale object.
789
790							Returns self, so that methods can be chained.
791
792							Negative counts are allowed and treated as "forgetting" data.
793							If a bin count goes below zero, such bin is simply discarded.
794							Minus infinity weight is allowed and has the same effect.
795							Data is guaranteed to remain consistent.
796
797							If incorrect data is given (i.e. non-numeric, undef, or +infinity),
798							an exception is thrown and nothing changes.
799
800							B Cache may be reset, even if no data was actually inserted.
801
802							B It is possible to add infinite values to data pool.
803							The module will try and calculate whatever can still be calculated.
804							However, no portable way of serializing such values is done yet.
805
806							=cut
807
808							sub add_data_hash {
809	19			19	1	49	my $self = shift;
810	19					36	my $hash = shift;
811
812							# check incoming data for being numeric, and no +inf values
813	19					30	eval {
814	22			22		56346	use warnings FATAL => qw(numeric);
	22					57
	22					42584
815	19					99	while (my ($k, $v) = each %$hash) {
816	289	100	33			2487	$k == 0+$k and $v == 0+$v and $v < $INF
			66
817							or die "Infinite count for $k\n";
818							}
819							};
820	19	100				290	croak __PACKAGE__.": add_data_hash failed: $@"
821							if $@;
822
823	17					29	delete $self->{cache};
824
825							# update our counters
826	17					82	foreach (keys %$hash) {
827	286	50				630	next unless $hash->{$_};
828	286					555	my $key = $self->_round($_);
829
830							# Insert data. Make sure -Inf doesn't corrupt our counter.
831	286		100			1381	my $newcount = ($self->{data}{$key} \|\| 0) + $hash->{$_};
832	286	100				530	if ($newcount > 0) {
833							# normal insert
834	274					781	$self->{data}{$key} = $newcount;
835	274					565	$self->{count} += $hash->{$_};
836							} else {
837							# We're "forgetting" data, AND the bin got empty
838	12		100			58	$self->{count} -= delete $self->{data}{$key} \|\| 0;
839							};
840							};
841	17					49	$self;
842							};
843
844							=head2 get_data_hash( %options )
845
846							Return distribution hashref {value => number of occurances}.
847
848							This is inverse of add_data_hash.
849
850							Options may include:
851
852							=over
853
854							=item * min - ignore values below this. (See find_boundaries)
855
856							=item * max - ignore values above this. (See find_boundaries)
857
858							=item * ltrim - ignore this % of values on lower end. (See find_boundaries)
859
860							=item * rtrim - ignore this % of values on upper end. (See find_boundaries)
861
862							=item * noise_thresh - strip bins with count below this.
863
864							=back
865
866							=cut
867
868							sub get_data_hash {
869	63			63	1	13235	my ($self, %opt) = @_;
870
871							# shallow copy of data if no options given
872	63	100				196	return {%{ $self->{data} }} unless %opt;
	61					820
873
874	2					30	my ($min, $max) = $self->find_boundaries( %opt );
875	2		100			15	my $noize = $opt{noize_thresh} \|\| 0;
876
877	2					4	my $data = $self->{data};
878	2					4	my %hash;
879	2					11	foreach (keys %$data ) {
880	13	100				42	$_ < $min and next;
881	8	50				17	$_ > $max and next;
882	8	100				20	$data->{$_} < $noize and next;
883	6					14	$hash{$_} = $data->{$_};
884							};
885
886	2					11	return \%hash;
887							};
888
889							=head2 TO_JSON()
890
891							Return enough data to recreate the whole object as an unblessed hashref.
892
893							This routine conforms with C, hence the name.
894							Can be called as
895
896							my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this );
897
898							B This module DOES NOT require JSON::XS or serialize to JSON.
899							It just deals with data.
900							Use C, C, C or any serializer of choice.
901
902							my $raw_data = $stat->TO_JSON;
903							Statistics::Descriptive::LogScale->new( %$raw_data );
904
905							Would generate an exact copy of C<$stat> object
906							(provided it's S::D::L and not a subclass).
907
908							=head2 clone( [ %options ] )
909
910							Copy constructor - returns copy of an existing object.
911							Cache is not preserved.
912
913							Constructor options may be given to override existing data. See new().
914
915							Trim options may be given to get partial data. See get_data_hash().
916
917							=cut
918
919							sub clone {
920	7			7	1	3397	my ($self, %opt) = @_;
921
922	7					18	my $raw = $self->TO_JSON;
923	7	100				23	if (%opt) {
924							$raw->{data} = $self->get_data_hash( %opt )
925	4	100				18	unless exists $opt{data};
926							exists $opt{$_} and $raw->{$_} = $opt{$_}
927	4		66			51	for @new_keys;
928							};
929
930	7					35	return (ref $self)->new( %$raw );
931							};
932
933							sub TO_JSON {
934	31			31	1	152	my $self = shift;
935							# UGLY HACK Increase linear_thresh by a factor of base ** 1/10
936							# so that it's rounded down to present value
937							return {
938							CLASS => ref $self,
939							VERSION => $VERSION,
940							base => $self->{base},
941							linear_width => $self->{linear_width},
942	31					150	linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10,
943							data => $self->get_data_hash,
944							};
945							};
946
947							=head2 scale_sample( $scale )
948
949							Multiply all bins' counts by given value. This can be used to adjust
950							significance of previous data before adding new data (e.g. gradually
951							"forgetting" past data in a long-running application).
952
953							=cut
954
955							sub scale_sample {
956	1			1	1	8	my $self = shift;
957	1					2	my $factor = shift;
958	1	50				4	$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive");
959
960	1					3	delete $self->{cache};
961	1					2	foreach (@{ $self->_sort }) {
	1					9
962	100					175	$self->{data}{$_} *= $factor;
963							};
964	1					7	$self->{count} *= $factor;
965	1					3	return $self;
966							};
967
968							=head2 mean_of( $code, [$min, $max] )
969
970							Return expectation of $code over sample within given range.
971
972							$code is expected to be a pure function (i.e. depending only on its input
973							value, and having no side effects).
974
975							The underlying integration mechanism only calculates $code once per bin,
976							so $code should be stable as in not vary wildly over small intervals.
977
978							=cut
979
980							sub mean_of {
981	24			24	1	55	my $self = shift;
982	24					40	my ($code, $min, $max) = @_;
983
984	24			429		108	my $weight = $self->sum_of( sub {1}, $min, $max );
	429					861
985	24	100				114	return unless $weight;
986	23					52	return $self->sum_of($code, $min, $max) / $weight;
987							};
988
989							=head1 Experimental methods
990
991							These methods may be subject to change in the future, or stay, if they
992							are good.
993
994							=head2 sum_of ( $code, [ $min, $max ] )
995
996							Integrate arbitrary function over the sample within the [ $min, $max ] interval.
997							Default values for both limits are infinities of appropriate sign.
998
999							Values in the edge bins are cut using interpolation if needed.
1000
1001							NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points
1002							between $a and $b.
1003
1004							EXPERIMENTAL. The method name may change in the future.
1005
1006							=cut
1007
1008							sub sum_of {
1009	145			145	1	764	my $self = shift;
1010	145					292	my ($code, $realmin, $realmax) = @_;
1011
1012							# Just app up stuff
1013	145	100	66			546	if (!defined $realmin and !defined $realmax) {
1014	122					165	my $sum = 0;
1015	122					166	while (my ($val, $count) = each %{ $self->{data} }) {
	63224					168850
1016	63102					100217	$sum += $count * $code->( $val );
1017							};
1018	122					691	return $sum;
1019							};
1020
1021	23	100				52	$realmin = -$INF unless defined $realmin;
1022	23	100				50	$realmax = $INF unless defined $realmax;
1023	23	50				53	return 0 if( $realmin >= $realmax );
1024
1025							# correct limits. $min, $max are indices; $left, $right are limits
1026	23					46	my $min = $self->_round($realmin);
1027	23					49	my $max = $self->_round($realmax);
1028	23					52	my $left = $self->_lower($realmin);
1029	23					51	my $right = $self->_upper($realmax);
1030
1031							# find first bin that's above $left
1032	23					52	my $keys = $self->_sort;
1033	23					52	my $i = _bin_search_ge($keys, $left);
1034
1035							# warn "sum_of [$min, $max]";
1036							# add up bins
1037	23					31	my $sum = 0;
1038	23					51	for (; $i < @$keys; $i++) {
1039	284					855	my $val = $keys->[$i];
1040	284	100				510	last if $val > $right;
1041	264					569	$sum += $self->{data}{$val} * $code->( $val );
1042							};
1043
1044							# cut edges: the hard part
1045							# min and max are now used as indices
1046							# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-)
1047							# warn "Add up, sum_of = $sum";
1048	23	100				134	if ($self->{data}{$max}) {
1049	20					45	my $width = $self->_upper($max) - $self->_lower($max);
1050	20	100				61	my $part = $width
1051							? ($self->_upper($max) - $realmax) / $width
1052							: 0.5;
1053	20					99	$sum -= $self->{data}{$max} * $code->($max) * $part;
1054							};
1055							# warn "Cut R, sum_of = $sum";
1056	23	100				113	if ($self->{data}{$min}) {
1057	20					40	my $width = $self->_upper($min) - $self->_lower($min);
1058	20	100				56	my $part = $width
1059							? ($realmin - $self->_lower($min)) / $width
1060							: 0.5;
1061	20					71	$sum -= $self->{data}{$min} * $code->($min) * $part;
1062							};
1063							# warn "Cut L, sum_of = $sum";
1064
1065	23					129	return $sum;
1066							}; # end sum_of
1067
1068							=head2 histogram ( %options )
1069
1070							Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ]
1071							where countX_Y is number of data points between X and Y.
1072
1073							Options may include:
1074
1075							=over
1076
1077							=item * count (+) - number of intervals to divide sample into.
1078
1079							=item * index (+) - interval borders as array. Will be sorted before processing.
1080
1081							=item * min - ignore values below this. Default = $self->min - epsilon.
1082
1083							=item * max - ignore values above this. Default = $self->max + epsilon.
1084
1085							=item * ltrim - ignore this % of values on lower end.
1086
1087							=item * rtrim - ignore this % of values on upper end.
1088
1089							=item * normalize_to - adjust counts so that max number becomes nnn.
1090							This may be useful if one intends to draw pictures.
1091
1092							=back
1093
1094							Either count or index must be present.
1095
1096							NOTE: this is equivalent to frequency_distribution_ref but better suited
1097							for omitting sample tails and outputting pretty pictures.
1098
1099							=cut
1100
1101							sub histogram {
1102	5			5	1	1977	my ($self, %opt) = @_;
1103
1104	5	100				13	return unless $self->count;
1105	4					14	my ($min, $max) = $self->find_boundaries( %opt );
1106							# build/check index
1107	4	100				6	my @index = @{ $opt{index} \|\| [] };
	4					22
1108	4	100				12	if (!@index) {
1109	3					6	my $n = $opt{count};
1110	3	50				8	$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )");
1111	3					33	my $step = ($max - $min) / $n;
1112	3					12	for (my $x = $min; $x <= $max; $x += $step) {
1113	16					38	push @index, $x;
1114							};
1115							} else {
1116							# sort & uniq raw index
1117	1					3	my %known;
1118	1					2	@index = grep { !$known{$_}++ } @index;
	5					17
1119	1					4	@index = sort { $a <=> $b } @index;
	8					11
1120	1	50				5	@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)");
1121							};
1122
1123							# finally: estimated counts between indices!
1124	4					6	my @ret;
1125	4					14	for ( my $i = 0; $i<@index-1; $i++) {
1126	17					41	my $count = $self->_count( $index[$i], $index[$i+1] );
1127	17					78	push @ret, [ $count, $index[$i], $index[$i+1] ];
1128							};
1129
1130							# if normalize - find maximum & divide by it
1131	4	100				11	if (my $norm = $opt{normalize_to}) {
1132	1					3	my $max = 0;
1133							$max < $_->[0] and $max = $_->[0]
1134	1		66			9	for @ret;
1135	1					2	$norm /= $max;
1136	1					5	$_->[0] *= $norm for @ret;
1137							};
1138
1139	4					18	return \@ret;
1140							};
1141
1142							=head2 find_boundaries( %opt )
1143
1144							Return ($min, $max) of part of sample denoted by options.
1145
1146							Options may include:
1147
1148							=over
1149
1150							=item * min - ignore values below this. default = min() - epsilon.
1151
1152							=item * max - ignore values above this. default = max() + epsilon.
1153
1154							=item * ltrim - ignore this % of values on lower end.
1155
1156							=item * rtrim - ignore this % of values on upper end.
1157
1158							=back
1159
1160							If no options are given, the whole sample is guaranteed to reside between
1161							returned values.
1162
1163							=cut
1164
1165							sub find_boundaries {
1166	9			9	1	1187	my $self = shift;
1167	9					26	my %opt = @_;
1168
1169	9	100				33	return unless $self->count;
1170
1171							# preprocess boundaries
1172	8	100				39	my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min );
1173	8	100				39	my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max );
1174
1175	8	100				33	if ($opt{ltrim}) {
1176	1					5	my $newmin = $self->percentile( $opt{ltrim} );
1177	1	50	33			7	defined $newmin and $newmin > $min and $min = $newmin;
1178							};
1179	8	100				23	if ($opt{utrim}) {
1180	1					3	my $newmax = $self->percentile( 100-$opt{utrim} );
1181	1	50	33			14	defined $newmax and $newmax < $max and $max = $newmax;
1182							};
1183
1184	8					30	return ($min, $max);
1185							};
1186
1187							################################################################
1188							# No more public methods please
1189
1190							# MEMOIZE
1191							# We'll keep methods' returned values under {cache}.
1192							# All setters destroy said cache altogether.
1193							# PLEASE replace this with a ready-made module if there's one.
1194
1195							# Sorry for this black magic, but it's too hard to write //= in EVERY method
1196							# Long story short
1197							# The next sub replaces $self->foo() with
1198							# sub { $self->{cache}{foo} //= $self->originnal_foo }
1199							# All setter methods are EXPECTED to destroy {cache} altogether.
1200
1201							# NOTE if you plan subclassing the method, re-memoize methods you change.
1202							sub _memoize_method {
1203	242			242		438	my ($class, $name, $arg) = @_;
1204
1205	242					1041	my $orig_code = $class->can($name);
1206	242	50				646	die "Error in memoizer section ($name)"
1207							unless ref $orig_code eq 'CODE';
1208
1209							# begin long conditional
1210							my $cached_code = !$arg
1211							? sub {
1212	203	100		203		5906	if (!exists $_[0]->{cache}{$name}) {
1213	100					277	$_[0]->{cache}{$name} = $orig_code->($_[0]);
1214							};
1215	203					859	return $_[0]->{cache}{$name};
1216							}
1217							: sub {
1218	169			169		1657	my $self = shift;
1219	169					236	my $arg = shift;
1220	169	100				409	$arg = '' unless defined $arg;
1221
1222	169	100				593	if (!exists $self->{cache}{"$name:$arg"}) {
1223	100					239	$self->{cache}{"$name:$arg"} = $orig_code->($self, $arg);
1224							};
1225	168					798	return $self->{cache}{"$name:$arg"};
1226	242	100				994	};
1227							# conditional ends here
1228
1229	22			22		133	no strict 'refs'; ## no critic
	22					38
	22					760
1230	22			22		118	no warnings 'redefine'; ## no critic
	22					47
	22					2694
1231	242					312	*{$class."::".$name} = $cached_code;
	242					997
1232							}; # end of _memoize_method
1233
1234							# Memoize all the methods w/o arguments
1235							foreach ( qw(sum sumsq mean min max mode) ) {
1236							__PACKAGE__->_memoize_method($_);
1237							};
1238
1239							# Memoize methods with 1 argument
1240							foreach (qw(quantile central_moment std_moment standard_deviation variance)) {
1241							__PACKAGE__->_memoize_method($_, 1);
1242							};
1243
1244							# add shorter alias of standard_deviation (this must happen AFTER memoization)
1245							{
1246	22			22		329	no warnings 'once'; ## no critic
	22					38
	22					9137
1247							*std_dev = \&standard_deviation;
1248							};
1249
1250							# Get number of values below $x
1251							# Like sum_of(sub{1}, undef, $x), but faster.
1252							# Used by cdf()
1253							sub _count {
1254	105			105		145	my $self = shift;
1255	105	100				259	@_>1 and return $self->_count($_[1]) - $self->_count($_[0]);
1256	74					88	my $x = shift;
1257
1258	74					134	my $upper = $self->_upper($x);
1259	74					156	my $i = _bin_search_gt( $self->_sort, $upper );
1260	74	100				180	!$i-- and return 0;
1261	65					140	my $count = $self->_probability->[$i];
1262
1263							# interpolate
1264	65					130	my $bin = $self->_round( $x );
1265	65	100				325	if (my $val = $self->{data}{$bin}) {
1266	34					57	my $width = ($upper - $bin) * 2;
1267	34	100				67	my $part = $width ? ( ($upper - $x) / $width) : 1/2;
1268	34					56	$count -= $part * $val;
1269							};
1270	65					210	return $count;
1271							};
1272
1273							# BINARY SEARCH
1274							# Not a method, just a function
1275							# Takes sorted \@array and a $num
1276							# Return lowest $i such that $array[$i] >= $num
1277							# Return (scalar @array) if no such $i exists
1278							sub _bin_search_ge {
1279	146			146		370	my ($array, $x) = @_;
1280
1281	146	100	100			796	return 0 unless @$array and $array->[0] < $x;
1282	125					160	my $l = 0;
1283	125					183	my $r = @$array;
1284	125					308	while ($l+1 < $r) {
1285	403					632	my $m = int( ($l + $r) /2);
1286	403	100				1252	$array->[$m] < $x ? $l = $m : $r = $m;
1287							};
1288	125					347	return $l+1;
1289							};
1290							sub _bin_search_gt {
1291	74			74		103	my ($array, $x) = @_;
1292	74					133	my $i = _bin_search_ge(@_);
1293	74	100	100			326	$i++ if defined $array->[$i] and $array->[$i] == $x;
1294	74					115	return $i;
1295							};
1296
1297							# THE CORE
1298							# Here come the number=>bin functions
1299							# round() generates bin center
1300							# upper() and lower() are respective boundaries.
1301							# Here's the algorithm:
1302							# 1) determine whether bin is linear or logarithmic
1303							# 2) for linear bins, return bin# * bucket_width (==2*absolute error)
1304							# add/subtract 0.5 to get edges.
1305							# 3) for logarithmic bins, return base ** bin# with appropriate sign
1306							# multiply by precalculated constant to get edges
1307							# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2
1308							# This part is fragile and can be written better
1309
1310							# center of bin containing x
1311							sub _round {
1312	90888			90888		109040	my $self = shift;
1313	90888					109305	my $x = shift;
1314
1315	22			22		180	use warnings FATAL => qw(numeric uninitialized);
	22					37
	22					3570
1316
1317	90888	100				189372	if (abs($x) <= $self->{linear_thresh}) {
1318							return $self->{linear_width}
1319	1761		100			6796	&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 );
1320							};
1321	89124					225567	my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase});
1322	89124					152199	my $value = $self->{base} ** $i;
1323	89124	100				415819	return $x < 0 ? -$value : $value;
1324							};
1325
1326							# lower, upper limits of bin containing x
1327							sub _lower {
1328	882			882		58216	my $self = shift;
1329	882					1151	my $x = shift;
1330
1331	22			22		116	use warnings FATAL => qw(numeric uninitialized);
	22					33
	22					8391
1332
1333	882	100				2365	if (abs($x) <= $self->{linear_thresh}) {
1334							return $self->{linear_width}
1335	183		66			1419	&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5);
1336							};
1337	699					2309	my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase});
1338	699	100				1571	if ($x > 0) {
1339	310					1300	return $self->{floor} * $self->{base}**($i);
1340							} else {
1341	389					1627	return -$self->{floor} * $self->{base}**($i+1);
1342							};
1343							};
1344
1345							sub _upper {
1346	462			462		4814	return -$_[0]->_lower(-$_[1]);
1347							};
1348
1349							# build bin index
1350							sub _sort {
1351	189			189		260	my $self = shift;
1352							return $self->{cache}{sorted}
1353	189		100			863	\|\|= [ sort { $a <=> $b } keys %{ $self->{data} } ];
	1232					1909
	31					262
1354							};
1355
1356							# build cumulative bin counts index
1357							sub _probability {
1358	106			106		134	my $self = shift;
1359	106		66			425	return $self->{cache}{probability} \|\|= do {
1360	19					35	my @array;
1361	19					36	my $sum = 0;
1362	19					33	foreach (@{ $self->_sort }) {
	19					56
1363	160					255	$sum += $self->{data}{$_};
1364	160					299	push @array, $sum;
1365							};
1366	19					129	\@array;
1367							};
1368							};
1369
1370							=head1 AUTHOR
1371
1372							Konstantin S. Uvarin, C<< >>
1373
1374							=head1 BUGS
1375
1376							The module is currently under development. There may be bugs.
1377
1378							C only works for discrete distributions, and simply returns
1379							the first bin with largest bin count.
1380							A better algorithm is wanted.
1381
1382							C should have been made a private method.
1383							Its signature and/or name may change in the future.
1384
1385							See the TODO file in the distribution package.
1386
1387							Please feel free to post bugs and/or feature requests to github:
1388							L
1389
1390							Alternatively, you can use CPAN RT
1391							via e-mail C,
1392							or through the web interface at
1393							L.
1394
1395							Your contribution is appreciated.
1396
1397							=head1 SUPPORT
1398
1399							You can find documentation for this module with the perldoc command.
1400
1401							perldoc Statistics::Descriptive::LogScale
1402
1403							You can also look for information at:
1404
1405							=over 4
1406
1407							=item * GitHub:
1408
1409							L
1410
1411							=item * RT: CPAN's request tracker (report bugs here)
1412
1413							L
1414
1415							=item * AnnoCPAN: Annotated CPAN documentation
1416
1417							L
1418
1419							=item * CPAN Ratings
1420
1421							L
1422
1423							=item * Search CPAN
1424
1425							L
1426
1427							=back
1428
1429							=head1 ACKNOWLEDGEMENTS
1430
1431							This module was inspired by a talk that Andrew Aksyonoff, author of
1432							L,
1433							has given at HighLoad++ conference in Moscow, 2012.
1434
1435							L was and is used as reference when in doubt.
1436							Several code snippets were shamelessly stolen from there.
1437
1438							C and C parameter names were suggested by
1439							CountZero from http://perlmonks.org
1440
1441							=head1 LICENSE AND COPYRIGHT
1442
1443							Copyright 2013-2015 Konstantin S. Uvarin.
1444
1445							This program is free software; you can redistribute it and/or modify it
1446							under the terms of either: the GNU General Public License as published
1447							by the Free Software Foundation; or the Artistic License.
1448
1449							See http://dev.perl.org/licenses/ for more information.
1450
1451							=cut
1452
1453							1; # End of Statistics::Descriptive::LogScale