File Coverage

blib/lib/Statistics/LSNoHistory.pm

Criterion	Covered	Total	%
statement	130	150	86.6
branch	42	66	63.6
condition	4	9	44.4
subroutine	23	25	92.0
pod	0	21	0.0
total	199	271	73.4

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# LSNoHistory.pm - least-squares regression without data history
3							#
4							# $Id: LSNoHistory.pm,v 1.6 2003/02/23 05:11:29 pliam Exp $
5							#
6
7							package Statistics::LSNoHistory;
8	3			3		72430	use strict;
	3					10
	3					131
9
10	3			3		19	use vars qw($VERSION);
	3					5
	3					5777
11							$VERSION = sprintf("%d.%02d", (q$Name: LSNoHist_Release_0_01 $ =~ /\d+/g));
12
13							#############################################################################
14							# top-level pod
15							#############################################################################
16
17							=pod
18
19							=head1 NAME
20
21							Statistics::LSNoHistory - Least-Squares linear regression package without
22							data history
23
24							=head1 SYNOPSIS
25
26							# construct from points
27							$reg = Statistics::LSNoHistory->new(points => [
28							1.0 => 1.0,
29							2.1 => 1.9,
30							2.8 => 3.2,
31							4.0 => 4.1,
32							5.2 => 4.9
33							]);
34
35							# other equivalent constructions
36							$reg = Statistics::LSNoHistory->new(
37							xvalues => [1.0, 2.1, 2.8, 4.0, 5.2],
38							yvalues => [1.0, 1.9, 3.2, 4.1, 4.9]
39							);
40							# or
41							$reg = Statistics::LSNoHistory->new;
42							$reg->append_arrays(
43							[1.0, 2.1, 2.8, 4.0, 5.2],
44							[1.0, 1.9, 3.2, 4.1, 4.9]
45							);
46							# or
47							$reg = Statistics::LSNoHistory->new;
48							$reg->append_points(
49							1.0 => 1.0, 2.1 => 1.9, 2.8 => 3.2, 4.0 => 4.1, 5.2 => 4.9
50							);
51
52							# You may also construct from the preliminary statistics of a
53							# previous regression:
54							$reg = Statistics::LSNoHistory->new(
55							num => 5,
56							sumx => 15.1,
57							sumy => 15.1,
58							sumxx => 56.29,
59							sumyy => 55.67,
60							sumxy => 55.83,
61							minx => 1.0,
62							maxx => 5.2,
63							miny => 1.0,
64							maxy => 4.9
65							);
66							# thus a branch may be instantiated as follows
67							$branch = Statistics::LSNoHistory->new(%{$reg->dump_stats});
68							$reg->append_point(6.1, 5.9);
69							$branch->append_point(5.8, 6.0);
70
71							# calculate regression values, print some
72							printf("Slope: %.2f\n", $reg->slope);
73							printf("Intercept %.2f\n", $reg->intercept);
74							printf("Correlation Coefficient: %.2f\n", $reg->pearson_r);
75							...
76
77
78							=head1 DESCRIPTION
79
80							This package provides standard least squares linear regression
81							functionality without the need for storing the complete data history.
82							Like any other, it finds best m,k (in least squares sense) so that
83							y = m*x + k fits data points (x_1,y_1),...,(x_n,y_n).
84
85							In many applications involving linear regression, it is desirable
86							to compute a regression based on the intermediate statistics of a
87							previous regression along with any I data points. Thus there
88							is no need to store a complete data history, but rather only a minimal
89							set of intermediate statistics, the number of which, thanks to Gauss,
90							is 6.
91
92							The user interface provides a way to instantiate a regression object
93							with either raw data or previous intermediate statistics.
94
95							=cut
96
97							#############################################################################
98							# construction
99							#############################################################################
100
101							=pod
102
103							=head1 CONSTRUCTOR ARGUMENTS
104
105							The constructor (or class method I) takes several possible
106							arguments. The initialization scenario depends on the kinds of
107							arguments passed and falls into one of the following categories:
108
109							=over 2
110
111							=item *
112
113							I S() by itself is equivalent to initializing with no
114							data. All internal statistics are set to zero.
115
116							=item *
117
118							I new(I => [x_1 => y_1, x_2 => y_2,...,
119							x_n => y_n]) processes the n specified data points. Note that
120							points expects an array reference even though we've written it
121							in "hash notation" for clarity.
122
123							=item *
124
125							I new(I => [x_1, x_2,..., x_n],
126							I => [y_1, y_2,..., y_n]) is equivalent to the above.
127
128							=item *
129
130							I new(I) requires I of the
131							following intermediate statistics:
132
133							=over 6
134
135							=item I
136
137							S<=E> Number of points.
138
139							=item I
140
141							S<=E> Sum of x values.
142
143							=item I
144
145							S<=E> Sum of y values.
146
147							=item I
148
149							S<=E> Sum of x values squared.
150
151							=item I
152
153							S<=E> Sum of y values squared.
154
155							=item I
156
157							S<=E> Sum of x*y products.
158
159							=item I
160
161							S<=E> Minimum x value.
162
163							=item I
164
165							S<=E> Maximum x value.
166
167							=item I
168
169							S<=E> Minimum y value.
170
171							=item I
172
173							S<=E> Maximum y value.
174
175							=back 6
176
177							=back 2
178
179							=cut
180
181							## new constructor
182							sub new {
183	11			11	0	2447	my $class = shift;
184	11					40	my %args = @_;
185	11					17	my $self;
186	11					35	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
187	11					28	push(@stats, qw(minx maxx miny maxy)); # min/max
188
189							# if complete set of statistics, construct from previous state
190							# if (@stats == scalar(grep {defined($args{$_})} @stats)) {
191	11	100				23	if (@stats == grep {defined($args{$_})} @stats) {
	110					281
192							# reject unsupported arguments and combinations
193	2	50				5	if (grep {defined($args{$_})} qw(points xvalues yvalues)) {
	6					15
194	0					0	die "Cannot give new data along with previous state.";
195							}
196	2	50				9	unless (@stats == keys %args) {
197	0					0	die "Unknown constructor arguments.";
198							}
199							# check the number of points for consistency
200	2	50				7	unless (abs(int($args{num})) == $args{num}) {
201	0					0	die "Bad number of points: must be positive integer.";
202							}
203	2					3	$self = \%args;
204	2					5	bless $self, $class;
205	2					9	return $self;
206							}
207							# in any other case we're starting from scratch
208	9					17	$self = {};
209	9					20	bless $self, $class;
210	9					31	$self->_init;
211							# x & y value array refs
212	9	100	66			61	if (defined($args{xvalues}) && defined($args{yvalues})) {
		100
213	3	50				632	if (defined $args{points}) {
214	0					0	die "Must give points or array values, but not both";
215							}
216	3	50				10	unless (scalar(keys %args) == 2) {
217	0					0	die "Unknown constructor arguments.";
218							}
219	3					12	$self->append_arrays($args{xvalues}, $args{yvalues});
220							}
221							# (x,y) point array ref
222							elsif (defined($args{points})) {
223	3	50				7	if (grep {defined($args{$_})} qw(xvalues yvalues)) {
	6					29
224	0					0	die "Must give points or array values, but not both";
225							}
226	3	50				15	unless (scalar(keys %args) == 1) {
227	0					0	die "Unknown constructor arguments.";
228							}
229	3					6	$self->append_points(@{$args{points}});
	3					16
230							}
231							# default constructor (already initialized above)
232							else {
233	3	50				10	if (scalar(keys %args)) {
234	0					0	die "Unknown constructor arguments.";
235							}
236							}
237	9					41	return $self;
238							}
239
240							## _init in this context really means start with state of 0's
241							sub _init {
242	9			9		14	my $self = shift;
243	9					26	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
244	9					23	push(@stats, qw(minx maxx miny maxy)); # min/max
245
246	9					90	@$self{@stats} = (0) x scalar(@stats);
247							}
248
249
250							#############################################################################
251							# other methods
252							#############################################################################
253							=pod
254
255							=head1 METHODS
256
257							=over 2
258
259							=cut
260
261							#
262							# adding data
263							#
264
265							## append_point
266							=pod
267
268							=item *
269
270							I(x,y) process an additional data point.
271
272							=cut
273							sub append_point {
274	64			64	0	1756	my $self = shift;
275	64					71	my($x,$y) = @_;
276
277							## will have to recompute regression
278	64					84	$self->{cached} = 0;
279
280							# min/max
281	64	100				110	if ($self->{num}) {
282	55	100				126	$self->{minx} = ($x < $self->{minx}) ? $x : $self->{minx};
283	55	100				120	$self->{maxx} = ($x > $self->{maxx}) ? $x : $self->{maxx};
284	55	100				112	$self->{miny} = ($y < $self->{miny}) ? $y : $self->{miny};
285	55	100				106	$self->{maxy} = ($y > $self->{maxy}) ? $y : $self->{maxy};
286							}
287							else {
288	9					17	$self->{minx} = $x;
289	9					27	$self->{maxx} = $x;
290	9					16	$self->{miny} = $y;
291	9					12	$self->{maxy} = $y;
292							}
293
294							# classic stats
295	64					71	$self->{num}++;
296	64					80	$self->{sumx} += $x;
297	64					69	$self->{sumy} += $y;
298	64					102	$self->{sumxx} += $x**2;
299	64					87	$self->{sumyy} += $y**2;
300	64					140	$self->{sumxy} += $x*$y;
301							}
302
303							## append_points
304							=pod
305
306							=item *
307
308							I(x_1 => y_1,..., x_n => y_n) process additional data points,
309							which is equivalent to calling append_point() n times.
310
311							=cut
312							sub append_points {
313	4			4	0	9	my $self = shift;
314	4					23	my @points = @_;
315	4					8	my $num = scalar(@points);
316
317	4	50				20	if ($num % 2) { die "Incomplete list of points."; }
	0					0
318
319	4					9	$num /= 2;
320	4					29	for (1..$num) { $self->append_point(splice(@points, 0, 2)); }
	30					103
321							}
322
323
324							## append_arrays
325							=pod
326
327							=item *
328
329							I([x_1, x_2,..., x_n], [y_1, y_2,..., y_n])
330							process additional data points given a pair x and y data array
331							references. Also equivalent to calling append_point() n times.
332
333							=cut
334							sub append_arrays {
335	4			4	0	11	my $self = shift;
336	4					4	my ($xr, $yr) = @_;
337	4					5	my ($xn, $yn);
338
339							# check arg type
340	4	50	33			22	unless ((ref($xr) eq 'ARRAY') && (ref($yr) eq 'ARRAY')) {
341	0					0	die "Must pass pair of array references.";
342							}
343
344							# check that sizes match
345	4					6	$xn = scalar(@$xr);
346	4					4	$yn = scalar(@$yr);
347	4	50				11	unless ($xn == $yn) { die "Incomplete list of points."; }
	0					0
348
349	4					8	for (1..$xn) { $self->append_point(shift(@$xr), shift(@$yr)); }
	30					61
350							}
351
352							#
353							# computing the regression
354							#
355
356							## _regress method -- done behind the scenes & considered private
357							sub _regress {
358	12			12		17	my $self = shift;
359	12					18	my($n) = $self->{num};
360	12					67	my($dx) = $n$self->{sumxx} - $self->{sumx}*2;
361	12					638	my($dy) = $n$self->{sumyy} - $self->{sumy}*2;
362
363							# check that we have 2 points
364	12	100				30	unless ($n >= 2) { die "Must have at least 2 points for regression."; }
	1					6
365							# check data for consistency
366	11	50	33			62	unless (($dx!=0) && ($dy!=0)) {
367	0					0	die "Inconsistent data: would divide by zero.";
368							}
369
370							# means and variances
371	11					25	$self->{avgx} = $self->{sumx}/$n;
372	11					21	$self->{avgy} = $self->{sumy}/$n;
373	11					33	$self->{varx} = $dx/$n/($n-1);
374	11					27	$self->{vary} = $dy/$n/($n-1);
375
376							# slopes and intercepts
377	11					40	$self->{mx} = ($n$self->{sumxy} - $self->{sumx}$self->{sumy})/$dx;
378	11					32	$self->{kx} = $self->{avgy} - $self->{mx}*$self->{avgx};
379	11					30	$self->{my} = ($n$self->{sumxy} - $self->{sumx}$self->{sumy})/$dy;
380	11					25	$self->{ky} = $self->{avgx} - $self->{my}*$self->{avgy};
381
382							# correlation coefficient (Pearson's r) and chi squared
383	11					51	$self->{r} = ($n$self->{sumxy} - $self->{sumx}$self->{sumy})
384							/ sqrt($dx*$dy);
385	11					49	$self->{chi2} = (1-$self->{r}*2)$dy/$n;
386
387							# flag that regression calculations are up to date
388	11					51	$self->{cached} = 1;
389							}
390
391							#
392							# presentation of stats, prediction
393							#
394
395							## average_x
396							=pod
397
398							=item *
399
400							I returns the mean of the x values.
401
402							=cut
403							sub average_x {
404	19			19	0	7041	my $self = shift;
405	19	100				64	$self->_regress unless $self->{cached};
406	18					149	return $self->{avgx}
407							}
408
409							## average_y
410							=pod
411
412							=item *
413
414							I returns the mean of the y values.
415
416							=cut
417							sub average_y {
418	18			18	0	1163	my $self = shift;
419	18	50				62	$self->_regress unless $self->{cached};
420	18					668	return $self->{avgy}
421							}
422
423							## variance_x
424							=pod
425
426							=item *
427
428							I returns the (n-1)-style variance of the x values.
429
430							=cut
431							sub variance_x {
432	18			18	0	1147	my $self = shift;
433	18	50				53	$self->_regress unless $self->{cached};
434	18					123	return $self->{varx}
435							}
436
437							## variance_y
438							=pod
439
440							=item *
441
442							I returns the (n-1)-style variance of the y values.
443
444							=cut
445							sub variance_y {
446	18			18	0	1216	my $self = shift;
447	18	50				52	$self->_regress unless $self->{cached};
448	18					155	return $self->{vary}
449							}
450
451							## slope
452							=pod
453
454							=item *
455
456							I returns the slope m so that y = m*x + k is a least squares fit.
457							Note that this is the least (y-y_avg)**2, and thus the standard slope.
458
459							=cut
460							sub slope {
461	19			19	0	2283	my $self = shift;
462	19	50				58	$self->_regress unless $self->{cached};
463	19					140	return $self->{mx}
464							}
465
466							## intercept
467							=pod
468
469							=item *
470
471							I returns the intercept k so that y = m*x + k is a least squares
472							fit. Note again that this is the least (y-y_avg)**2, and thus the
473							standard intercept.
474
475							=cut
476							sub intercept {
477	19			19	0	1172	my $self = shift;
478	19	50				55	$self->_regress unless $self->{cached};
479	19					134	return $self->{kx}
480							}
481
482							## predict - predicte a y value given an x value
483							=pod
484
485							=item *
486
487							I(x) predicts a y value, given an x value. Computes m*x + k, where
488							m, k are the standard regression slope and intercept (->slope and ->intercept,
489							respectively) for the most recent data.
490
491							=cut
492							sub predict {
493	0			0	0	0	my $self = shift;
494	0					0	my($x) = @_;
495
496	0	0				0	$self->_regress unless $self->{cached};
497	0					0	return $self->{mx}*$x + $self->{kx};
498							}
499
500							## slope_y
501							=pod
502
503							=item *
504
505							I returns the slope m' so that y = m'*x + k' is a least squares fit.
506							Note that this is the least (x-x_avg)**2, and thus I the standard slope.
507
508							=cut
509							sub slope_y {
510	18			18	0	1182	my $self = shift;
511	18	50				49	$self->_regress unless $self->{cached};
512	18					129	return $self->{my}
513							}
514
515							## intercept_y
516							=pod
517
518							=item *
519
520							I returns the intercept k' so that y = m'*x + k' is a least
521							squares fit. Note that this is the least (x-x_avg)**2, and thus I
522							the standard intercept.
523
524							=cut
525							sub intercept_y {
526	18			18	0	1120	my $self = shift;
527	18	50				46	$self->_regress unless $self->{cached};
528	18					134	return $self->{ky}
529							}
530
531							## predict_x - predicte an x value given a y value
532							=pod
533
534							=item *
535
536							I(y) predicts an x value given a y value. Computes m'*y + k',
537							where m', k' are the regression (y-reletive) slope and intercept
538							(->slope_y and ->intercept_y, respectively) for the most recent data.
539
540							=cut
541							sub predict_x {
542	0			0	0	0	my $self = shift;
543	0					0	my($y) = @_;
544
545	0	0				0	$self->_regress unless $self->{cached};
546	0					0	return $self->{my}*$y + $self->{ky};
547							}
548
549							## pearson_r
550							=pod
551
552							=item *
553
554							I returns Pearson's r correlation coefficient.
555
556							=cut
557							sub pearson_r {
558	21			21	0	1925	my $self = shift;
559	21	100				59	$self->_regress unless $self->{cached};
560	21					158	return $self->{r}
561							}
562
563							## chi_squared
564							=pod
565
566							=item *
567
568							I returns the chi squared statistic.
569
570							=cut
571							sub chi_squared {
572	17			17	0	1187	my $self = shift;
573	17	50				47	$self->_regress unless $self->{cached};
574	17					140	return $self->{chi2}
575							}
576
577							## minimum_x
578							=pod
579
580							=item *
581
582							I returns the minimum x value
583
584							=cut
585	17			17	0	1768	sub minimum_x { return shift->{minx}; }
586
587							## maximum_x
588							=pod
589
590							=item *
591
592							I returns the maximum x value
593
594							=cut
595	17			17	0	1512	sub maximum_x { return shift->{maxx}; }
596
597							## minimum_y
598							=pod
599
600							=item *
601
602							I returns the minimum y value
603
604							=cut
605	17			17	0	1527	sub minimum_y { return shift->{miny}; }
606
607							## maximum_y
608							=pod
609
610							=item *
611
612							I returns the maximum y value
613
614							=cut
615	17			17	0	1405	sub maximum_y { return shift->{maxy}; }
616
617							## dump_stats
618							=pod
619
620							=item *
621
622							I returns a hash reference of the form
623
624							{ num => ,
625							sumx => ,
626							sumy => ,
627							sumxx => ,
628							sumyy => ,
629							sumxy => ,
630							minx => ,
631							maxx => ,
632							miny => ,
633							maxy => }
634
635							in other words, containing all the stats required by the final constructor
636							above. This effectively dumps the regression history.
637
638							=cut
639							sub dump_stats {
640	11			11	0	1792	my $self = shift;
641	11					31	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
642	11					22	push(@stats, qw(minx maxx miny maxy)); # min/max
643	11					21	my %dump;
644
645	11					119	@dump{@stats} = @$self{@stats};
646	11					63	return \%dump;
647							}
648
649							1;
650
651							__END__