File Coverage

blib/lib/Statistics/Running/Tiny.pm
Criterion Covered Total %
statement 116 123 94.3
branch 12 16 75.0
condition 5 8 62.5
subroutine 27 28 96.4
pod 18 23 78.2
total 178 198 89.9


line stmt bran cond sub pod time code
1             package Statistics::Running::Tiny;
2              
3 2     2   55321 use 5.006;
  2         11  
4 2     2   10 use strict;
  2         4  
  2         40  
5 2     2   9 use warnings;
  2         3  
  2         143  
6              
7             our $VERSION = '0.01';
8              
9             use overload
10 2         14 '+' => \&concatenate,
11             '==' => \&equals,
12             '""' => \&stringify,
13 2     2   1948 ;
  2         1587  
14              
15 2     2   146 use constant SMALL_NUMBER_FOR_EQUALITY => 1E-10;
  2         4  
  2         2524  
16              
17             # creates an obj. There are no input params
18             sub new {
19 7     7 1 82 my $class = $_[0];
20              
21 7   100     34 my $parent = ( caller(1) )[3] || "N/A";
22 7         32 my $whoami = ( caller(0) )[3];
23              
24 7         46 my $self = {
25             # these are internal variables to store mean etc. or used to calculate Kurtosis
26             'M1' => 0.0,
27             'M2' => 0.0,
28             'M3' => 0.0,
29             'M4' => 0.0,
30             'MIN' => 0.0,
31             'MAX' => 0.0,
32             'N' => 0, # number of data items inserted
33             };
34 7         12 bless($self, $class);
35 7         18 $self->clear();
36 7         11 return $self
37             }
38             # push Data: a sample and process/update mean and all other stat measures
39             sub add {
40 505     505 1 776 my $self = $_[0];
41 505         525 my $x = $_[1];
42              
43 505         555 my $aref = ref($x);
44              
45 505 100       660 if( $aref eq '' ){
    50          
46             # a scalar input
47 502         580 my ($delta, $delta_n, $delta_n2, $term1);
48 502         553 my $n1 = $self->{'N'};
49 502 100       615 if( $n1 == 0 ){ $self->{'MIN'} = $self->{'MAX'} = $x }
  4         18  
50             else {
51 498 100       722 if( $x < $self->{'MIN'} ){ $self->{'MIN'} = $x }
  13         16  
52 498 100       717 if( $x > $self->{'MAX'} ){ $self->{'MAX'} = $x }
  112         126  
53             }
54 502         560 $self->{'N'} += 1; # increment sample size push in
55 502         556 my $n0 = $self->{'N'};
56              
57 502         594 $delta = $x - $self->{'M1'};
58 502         590 $delta_n = $delta / $n0;
59 502         551 $delta_n2 = $delta_n * $delta_n;
60 502         590 $term1 = $delta * $delta_n * $n1;
61 502         549 $self->{'M1'} += $delta_n;
62             $self->{'M4'} += $term1 * $delta_n2 * ($n0*$n0 - 3*$n0 + 3)
63             + 6 * $delta_n2 * $self->{'M2'}
64 502         832 - 4 * $delta_n * $self->{'M3'}
65             ;
66             $self->{'M3'} += $term1 * $delta_n * ($n0 - 2)
67 502         704 - 3 * $delta_n * $self->{'M2'}
68             ;
69 502         690 $self->{'M2'} += $term1;
70             } elsif( $aref eq 'ARRAY' ){
71             # an array input
72 3         7 foreach (@$x){ $self->add($_) }
  302         381  
73             } else {
74 0         0 die "add(): only ARRAY and SCALAR can be handled (input was type '$aref')."
75             }
76             }
77             # copies input(=src) Running obj into current/self overwriting our data, this is not a clone()!
78             sub copy_from {
79 1     1 1 4 my $self = $_[0];
80 1         2 my $src = $_[1];
81 1         9 $self->{'M1'} = $src->M1();
82 1         3 $self->{'M2'} = $src->M2();
83 1         1 $self->{'M3'} = $src->M3();
84 1         2 $self->{'M4'} = $src->M4();
85 1         2 $self->set_N($src->get_N());
86             }
87             # clones current obj into a new Running obj with same values
88             sub clone {
89 1     1 1 3 my $self = $_[0];
90 1         3 my $newO = Statistics::Running::Tiny->new();
91 1         4 $newO->{'M1'} = $self->M1();
92 1         3 $newO->{'M2'} = $self->M2();
93 1         3 $newO->{'M3'} = $self->M3();
94 1         3 $newO->{'M4'} = $self->M4();
95 1         3 $newO->set_N($self->get_N());
96 1         2 return $newO
97             }
98             # clears all data entered/calculated including histogram
99             sub clear {
100 9     9 1 13 my $self = $_[0];
101 9         33 $self->{'M1'} = 0.0;
102 9         12 $self->{'M2'} = 0.0;
103 9         12 $self->{'M3'} = 0.0;
104 9         11 $self->{'M4'} = 0.0;
105 9         14 $self->{'N'} = 0;
106             }
107             # return the mean of the data entered so far
108 4     4 1 22 sub mean { return $_[0]->{'M1'} }
109 4     4 1 11 sub min { return $_[0]->{'MIN'} }
110 4     4 1 10 sub max { return $_[0]->{'MAX'} }
111             # get number of total elements entered so far
112 18     18 1 46 sub get_N { return $_[0]->{'N'} }
113             sub variance {
114 4     4 1 6 my $self = $_[0];
115 4         4 my $m = $self->{'N'};
116 4 50       11 if( $m == 1 ){ return 0 }
  0         0  
117 4         17 return $self->{'M2'}/($m-1.0)
118             }
119 4     4 1 11 sub standard_deviation { return sqrt($_[0]->variance()) }
120             sub skewness {
121 3     3 1 5 my $self = $_[0];
122 3         11 my $m = $self->{'M2'};
123 3 50       6 if( $m == 0 ){ return 0 }
  3         57  
124             return sqrt($self->{'N'})
125 0         0 * $self->{'M3'} / ($m ** 1.5)
126             ;
127             }
128             sub kurtosis {
129 4     4 1 6 my $self = $_[0];
130 4         6 my $m = $self->{'M2'};
131 4 50       9 if( $m == 0 ){ return 0 }
  4         11  
132             return $self->{'N'}
133 0         0 * $self->{'M4'}
134             / ($m * $m)
135             - 3.0
136             ;
137             }
138             # concatenates another Running obj with current
139             # returns a new Running obj with concatenated stats
140             # input objs are not modified.
141             sub concatenate {
142 2     2 1 8 my $self = $_[0]; # us
143 2         3 my $other = $_[1]; # another Running obj
144              
145 2         6 my $combined = Statistics::Running::Tiny->new();
146              
147 2         5 my $selfN = $self->get_N();
148 2         4 my $otherN = $other->get_N();
149 2         4 my $selfM2 = $self->M2();
150 2         3 my $otherM2 = $other->M2();
151 2         4 my $selfM3 = $self->M3();
152 2         3 my $otherM3 = $other->M3();
153              
154 2         3 my $combN = $selfN + $otherN;
155 2         5 $combined->set_N($combN);
156            
157 2         3 my $delta = $other->M1() - $self->M1();
158 2         4 my $delta2 = $delta*$delta;
159 2         3 my $delta3 = $delta*$delta2;
160 2         2 my $delta4 = $delta2*$delta2;
161              
162 2         4 $combined->{'M1'} = ($selfN*$self->M1() + $otherN*$other->M1()) / $combN;
163              
164 2         5 $combined->{'M2'} = $selfM2 + $otherM2 +
165             $delta2 * $selfN * $otherN / $combN;
166            
167 2         8 $combined->{'M3'} = $selfM3 + $otherM3 +
168             $delta3 * $selfN * $otherN * ($selfN - $otherN)/($combN*$combN) +
169             3.0*$delta * ($selfN*$otherM2 - $otherN*$selfM2) / $combN
170             ;
171            
172 2         10 $combined->{'M4'} = $self->{'M4'} + $other->{'M4'}
173             + $delta4*$selfN*$otherN * ($selfN*$selfN - $selfN*$otherN + $otherN*$otherN) /
174             ($combN*$combN*$combN)
175             + 6.0*$delta2 * ($selfN*$selfN*$otherM2 + $otherN*$otherN*$selfM2)/($combN*$combN) +
176             4.0*$delta*($selfN*$otherM3 - $otherN*$selfM3) / $combN
177             ;
178            
179 2         11 return $combined;
180             }
181             # appends another Running obj INTO current
182             # current obj (self) IS MODIFIED
183             sub append {
184 0     0 1 0 my $self = $_[0]; # us
185 0         0 my $other = $_[1]; # another Running obj
186 0         0 $self->copy_from($self+$other);
187             }
188             # equality only wrt to stats BUT NOT histogram
189             sub equals {
190 4     4 1 14 my $self = $_[0]; # us
191 4         6 my $other = $_[1]; # another Running obj
192             return
193 4   66     5 $self->get_N() == $other->get_N() &&
194             $self->equals_statistics($other)
195             }
196             sub equals_statistics {
197 5     5 1 16 my $self = $_[0]; # us
198 5         6 my $other = $_[1]; # another Running obj
199             return
200 5   33     7 abs($self->M1()-$other->M1()) < Statistics::Running::Tiny::SMALL_NUMBER_FOR_EQUALITY &&
201             abs($self->M2()-$other->M2()) < Statistics::Running::Tiny::SMALL_NUMBER_FOR_EQUALITY &&
202             abs($self->M3()-$other->M3()) < Statistics::Running::Tiny::SMALL_NUMBER_FOR_EQUALITY &&
203             abs($self->M4()-$other->M4()) < Statistics::Running::Tiny::SMALL_NUMBER_FOR_EQUALITY
204             }
205             # print object as a string, string concat/printing is overloaded on this method
206             sub stringify {
207 3     3 1 11 my $self = $_[0];
208 3         6 return "N: ".$self->get_N()
209             .", mean: ".$self->mean()
210             .", range: ".$self->min()." to ".$self->max()
211             .", standard deviation: ".$self->standard_deviation()
212             .", kurtosis: ".$self->kurtosis()
213             .", skewness: ".$self->skewness()
214             }
215             # internal methods, no need for anyone to know or use externally
216 4     4 0 7 sub set_N { $_[0]->{'N'} = $_[1] }
217 20     20 0 46 sub M1 { return $_[0]->{'M1'} }
218 16     16 0 61 sub M2 { return $_[0]->{'M2'} }
219 16     16 0 37 sub M3 { return $_[0]->{'M3'} }
220 12     12 0 34 sub M4 { return $_[0]->{'M4'} }
221              
222             =head1 NAME
223              
224             Statistics::Running::Tiny - Basic descriptive statistics (incl. min/max/skew/kurtosis) without the need to store data points, statistics are updated every time a new data point is added in
225              
226             Calculate basic descriptive statistics (mean, variance, standard deviation, skewness, kurtosis)
227             without the need to store any data point/sample. Statistics are
228             updated each time a new data point/sample comes in.
229              
230             =head1 VERSION
231              
232             Version 0.01
233              
234             =cut
235              
236              
237             =head1 SYNOPSIS
238              
239              
240             There are three amazing things about B.P.Welford's algorithm implemented here:
241              
242             =over 4
243              
244             =item 1. It calculates and keeps updating mean/standard-deviation etc. on
245             data without the need to store that data. As new data comes in, the
246             statistics are updated based on the state of a few variables (mean, number
247             of data points, etc.) but not the past data points. This includes the
248             calculation of standard deviation which most of us knew (wrongly) that
249             it requires a second pass on the data points, after the mean is calculated.
250             Well, B.P.Welford found a way to avoid this.
251              
252             =item 2. The standard formula for standard deviation requires to sum
253             the square of the difference of each sample from the mean.
254             If samples are large numbers then you are summing differences of large
255             numbers. If further there is little difference between samples, and the
256             discrepancy from the mean is small, then you are prone to
257             precision errors which accumulate to destructive effect if the number of
258             samples is large. In contrast, B.P.Welford's algorithm does
259             not suffer from this, it is stable and accurate.
260              
261             =item 3. B.P.Welford's online statistics algorithm
262             is quite a revolutionary idea and why is not an obligatory subject
263             in first-year programming courses is beyond comprehension.
264             Here is a way to decrease those CO2 emissions.
265              
266             =back
267              
268             The basis for the code in this module is from
269             L
270              
271             use Statistics::Running::Tiny;
272             my $ru = Statistics::Running::Tiny->new();
273             for(1..100){
274             $ru->add(rand());
275             }
276             print "mean: ".$ru->mean()."\n";
277             $ru->add(12345);
278             print "mean: ".$ru->mean()."\n";
279              
280             my $ru2 = Statistics::Running::Tiny->new();
281             for(1..100){
282             $ru2->add(rand());
283             }
284             my $ru3 = $ru + $ru2;
285             print "mean of concatenated data: ".$ru3->mean()."\n";
286              
287             $ru += $ru2;
288             print "mean after appending data: ".$ru->mean()."\n";
289              
290             print "stats: ".$ru->stringify()."\n";
291              
292             =head1 EXPORT
293              
294             =head1 SUBROUTINES/METHODS
295              
296             =head2 new
297              
298             Constructor, initialises internal variables.
299              
300             =head2 add
301             Update our statistics after one more data point/sample (or an
302             array of them) is presented to us.
303              
304             my $ru1 = Statistics::Running::Tiny->new();
305             for(1..100){
306             $ru1->add(rand());
307             print $ru1->stringify()."\n";
308             }
309              
310             Input can be a single data point (a scalar) or a reference
311             to an array of data points.
312              
313             =head2 copy_from
314             Copy state of input object into current effectively making us like
315             them. Our previous state is forgotten. After that adding a new data point into
316             us will be with the new state copied.
317              
318             my $ru1 = Statistics::Running::Tiny->new();
319             for(1..100){
320             $ru1->add(rand());
321             }
322             my $ru2 = Statistics::Running::Tiny->new();
323             for(1..100){
324             $ru2->add(rand(1000000));
325             }
326             # copy the state of ru1 into ru2. state of ru1 is forgotten.
327             $ru2->copy_from($ru1);
328              
329             =head2 clone
330             Clone state of our object into a newly created object which is returned.
331             Our object and returned object are identical at the time of cloning.
332              
333             my $ru1 = Statistics::Running::Tiny->new();
334             for(1..100){
335             $ru1->add(rand(1000000));
336             }
337             my $ru2 = $ru1->clone();
338              
339             =head2 clear
340             Clear our internal state as if no data points have ever added into us.
341             As if we were just created. All state is forgotten and reset to zero.
342              
343             =head2 min
344             Returns the minimum data sample added in us
345              
346             =head2 max
347             Returns the maximum data sample added in us
348              
349             =head2 get_N
350             Returns the number of data points/samples processed (added onto us) so far.
351              
352             =head2 variance
353             Returns the variance of the data points/samples added onto us so far.
354              
355             =head2 standard_deviation
356             Returns the standard deviation of the data points/samples added onto us so far. This is the square root of the variance.
357              
358             =head2 skewness
359             Returns the skewness of the data points/samples added onto us so far.
360              
361             =head2 kurtosis
362             Returns the kurtosis of the data points/samples added onto us so far.
363              
364             =head2 concatenate
365             Concatenates our state with the input object's state and returns
366             a newly created object with the combined state. Our object and
367             input object are not modified. The overloaded symbol '+' points
368             to this sub.
369              
370             =head2 append
371             Appends input object's state into ours.
372             Our state is modified. (input object's state is not modified)
373             The overloaded symbol '+=' points
374             to this sub.
375              
376             =head2 equals
377             Check if our state (number of samples and all internal state) is
378             the same with input object's state. Equality here implies that
379             ALL statistics are equal (within a small number Statistics::Running::Tiny::SMALL_NUMBER_FOR_EQUALITY)
380              
381             =head2 equals_statistics
382             Check if our statistics only (and not sample size)
383             are the same with input object. E.g. it checks mean, variance etc.
384             but not sample size (as with the real equals()).
385             It returns 0 on non-equality. 1 if equal.
386              
387             =head2 stringify
388             Returns a string description of descriptive statistics we know about
389             (mean, standard deviation, kurtosis, skewness) as well as the
390             number of data points/samples added onto us so far.
391              
392             =cut
393              
394             =head1 BENCHMARKS
395              
396             Check B<< make bench >> for benchmarks
397              
398             =head1 SEE ALSO
399              
400             =over 4
401              
402             =item 1. L
403              
404             =item 2. L
405              
406             =item 3. L This module does not provide B<< kurtosis() >> and B<< skewness() >> which current module does.
407              
408             =item 4. L This is the exact same module with the addition of
409             a histogram logging each inserted data point. The histogram is in effect
410             a discrete approximation of the Probability Distribution of the input data
411             points. The current module is the same as that bar the histogram. That
412             makes it a bit faster. Check B<< make bench >> for benchmarks
413              
414             =back
415              
416             =head1 AUTHOR
417              
418             Andreas Hadjiprocopis, C<< >>
419              
420              
421             =head1 BUGS
422              
423             Please report any bugs or feature requests to C, or through
424             the web interface at L. I will be notified, and then you'll
425             automatically be notified of progress on your bug as I make changes.
426              
427              
428             =head1 SUPPORT
429              
430             You can find documentation for this module with the perldoc command.
431              
432             perldoc Statistics::Running::Tiny
433              
434              
435             You can also look for information at:
436              
437             =over 4
438              
439             =item * RT: CPAN's request tracker (report bugs here)
440              
441             L
442              
443             =item * AnnoCPAN: Annotated CPAN documentation
444              
445             L
446              
447             =item * CPAN Ratings
448              
449             L
450              
451             =item * Search CPAN
452              
453             L
454              
455             =back
456              
457              
458             =head1 ACKNOWLEDGEMENTS
459              
460              
461             =head1 LICENSE AND COPYRIGHT
462              
463             Copyright 2018 Andreas Hadjiprocopis.
464              
465             This program is free software; you can redistribute it and/or modify it
466             under the terms of the the Artistic License (2.0). You may obtain a
467             copy of the full license at:
468              
469             L
470              
471             Any use, modification, and distribution of the Standard or Modified
472             Versions is governed by this Artistic License. By using, modifying or
473             distributing the Package, you accept this license. Do not use, modify,
474             or distribute the Package, if you do not accept this license.
475              
476             If your Modified Version has been derived from a Modified Version made
477             by someone other than you, you are nevertheless required to ensure that
478             your Modified Version complies with the requirements of this license.
479              
480             This license does not grant you the right to use any trademark, service
481             mark, tradename, or logo of the Copyright Holder.
482              
483             This license includes the non-exclusive, worldwide, free-of-charge
484             patent license to make, have made, use, offer to sell, sell, import and
485             otherwise transfer the Package with respect to any patent claims
486             licensable by the Copyright Holder that are necessarily infringed by the
487             Package. If you institute patent litigation (including a cross-claim or
488             counterclaim) against any party alleging that the Package constitutes
489             direct or contributory patent infringement, then this Artistic License
490             to you shall terminate on the date that such litigation is filed.
491              
492             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
493             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
494             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
495             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
496             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
497             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
498             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
499             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
500              
501              
502             =cut
503              
504             1; # End of Statistics::Running::Tiny