File Coverage

blib/lib/Statistics/OLS.pm
Criterion Covered Total %
statement 131 198 66.1
branch 25 60 41.6
condition 1 3 33.3
subroutine 16 18 88.8
pod 8 15 53.3
total 181 294 61.5


line stmt bran cond sub pod time code
1             #================---------=================================================#
2             # Statistics::OLS -- Perform Ordinary Least Squares (with statistics) 2-D #
3             # by Sanford Morton #
4             #==========================================================================#
5              
6             # Revision history for Perl module Statistics::OLS.
7             #
8             # 0.01 - 22 March 1998
9             # - original version
10             #
11             # 0.02 - 29 March 1998
12             # - corrected array bounds check bug in setData
13             # - included check for divide by zero in standard error of
14             # coefficients and t-stats
15             #
16             # 0.03 - 31 May 1998
17             # - placed module into standard format using h2xs
18             #
19             # 0.04 - 13 July 1998
20             # - changed the name from Statistics::Ols to Statistics::OLS
21             #
22             # 0.05 - 15 Sep 1999
23             # - corrected error checking bug
24             # - corrected pod documentation bug
25             #
26             # 0.06 - 4 July 2000
27             # - allowed data in scientific (exponential) notation
28             #
29             # 0.07 - 12 October 2000
30             # - _sse fix for potential precision problems
31              
32             package Statistics::OLS;
33              
34             $Statistics::OLS::VERSION = '0.07';
35              
36 1     1   650 use strict;
  1         2  
  1         2061  
37              
38             #==================#
39             # public methods #
40             #==================#
41              
42             sub new {
43 1     1 1 58 my $class = shift;
44 1         12 my $self = {};
45            
46 1         3 bless $self, $class;
47 1         5 $self->_init (@_);
48              
49 1         2 return $self;
50             }
51              
52              
53             sub setData {
54             # check for equal or non-numeric data
55             # can receive data either as \@xdata, \@ydata or as @xydata.
56             # set refs: either to $self->{'_xdata'} and $self->{'_ydata'}
57             # or to $self->{'_xydata'}
58             # then set $self->{'_flatDataArray'}
59 1     1 1 6 my $self = shift;
60 1         3 my ($arrayref1, $arrayref2) = @_;
61 1         2 my ($arrayref, $i);
62              
63 1 50       4 if (ref $arrayref2) { # passing data as two data arrays (x0 ...) (y0 ...)
64              
65 0 0       0 unless ($#$arrayref1 == $#$arrayref2) { # error checking
66 0         0 $self->{'_errorMessage'} = "The dataset does not contain an equal number of x and y values. ";
67 0         0 return 0;
68             }
69              
70 0 0       0 unless ($#$arrayref1 > 1) { # error checking
71 0         0 $self->{'_errorMessage'} = "The data set must contain at least three points. ";
72 0         0 return 0;
73             }
74              
75             # check whether data are equal and numeric
76 0         0 for ($i=0; $i<=$#$arrayref1; $i++) {
77              
78 0 0       0 unless ($$arrayref1[$i] =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
79 0         0 $self->{'_errorMessage'} = "The data element $$arrayref1[$i] is non-numeric. ";
80 0         0 return 0;
81             }
82 0 0       0 unless ($$arrayref2[$i] =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
83 0         0 $self->{'_errorMessage'} = "The data element $$arrayref2[$i] is non-numeric. ";
84 0         0 return 0;
85             }
86             }
87            
88 0         0 $self->{'_xdata'} = $arrayref1;
89 0         0 $self->{'_ydata'} = $arrayref2;
90 0         0 $self->{'_flatDataArray'} = 0; # passed as two data arrays
91              
92             } else { # passing data as a single flat data array (x0 y0 ...)
93            
94             # check whether array is unbalanced
95 1 50       5 if ($#$arrayref1 % 2 == 0) {
96 0         0 $self->{'_errorMessage'} = "The dataset does not contain an equal number of x and y values.";
97 0         0 return 0;
98             }
99              
100 1 50       4 unless ($#$arrayref1 > 4) { # error checking
101 0         0 $self->{'_errorMessage'} = "The data set must contain at least three points. ";
102 0         0 return 0;
103             }
104              
105             # check whether data are numeric
106 1         5 for ($i=0; $i<=$#$arrayref1; $i++) {
107 22 50       100 unless ($$arrayref1[$i] =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
108 0         0 $self->{'_errorMessage'} = "The data element $$arrayref1[$i] is non-numeric.";
109 0         0 return 0;
110             }
111             }
112              
113 1         4 $self->{'_xydata'} = $arrayref1;
114 1         3 $self->{'_flatDataArray'} = 1; # passed as one data array
115             }
116              
117 1         2 $self->{'_dataIsSet'} = 1;
118 1         2 $self->{'_gotMinMax'} = 0; # recalculate min-max if already calculated
119 1         2 return 1;
120             }
121              
122              
123              
124             sub error {
125             # returns the last error message as a string
126 0     0 0 0 my $self = shift;
127 0         0 return $self->{'_errorMessage'};
128             }
129              
130              
131             sub regress {
132 1     1 1 6 my $self = shift;
133              
134 1 50       6 unless ($self->{'_dataIsSet'}) {
135 0         0 $self->{'_errorMessage'} = "No datset has been registered. ";
136 0         0 return 0;
137             }
138              
139 1         3 my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = qw (0 0 0 0 0);
140 1         2 my ($n, $i, $arrayref);
141              
142 1 50       4 if ($self->{'_flatDataArray'}) {
143 1         2 $arrayref = $self->{'_xydata'};
144 1         2 $n = 1 + $#{ $arrayref };
  1         2  
145 1         4 for ($i=0; $i<$n; $i+=2) {
146 11         20 $sumX += $self->{'_xydata'}[$i];
147 11         20 $sumY += $self->{'_xydata'}[$i+1];
148 11         24 $sumXX += $self->{'_xydata'}[$i]**2;
149 11         15 $sumYY += $self->{'_xydata'}[$i+1]**2;
150 11         30 $sumXY += $self->{'_xydata'}[$i] * $self->{'_xydata'}[$i+1];
151             }
152 1         3 $n /= 2; # number of observations
153             } else {
154 0         0 $arrayref = $self->{'_xdata'};
155 0         0 $n = $#{ $arrayref };
  0         0  
156 0         0 $n++; # number of observations
157 0         0 for ( $i=0; $i<$n; $i++ ) {
158 0         0 $sumX += $self->{'_xdata'}[$i];
159 0         0 $sumY += $self->{'_ydata'}[$i];
160 0         0 $sumXX += $self->{'_xdata'}[$i]**2;
161 0         0 $sumYY += $self->{'_ydata'}[$i]**2;
162 0         0 $sumXY += $self->{'_xdata'}[$i] * $self->{'_ydata'}[$i];
163             }
164             }
165              
166             # sum of squared deviations of X and Y
167 1         4 $self->{'_ssdX'} = $sumXX - $sumX**2/$n;
168 1         4 $self->{'_ssdY'} = $sumYY - $sumY**2/$n;
169 1         2 $self->{'_ssdXY'} = $sumXY - $sumX*$sumY/$n;
170              
171             # num observations and sample averages
172 1         2 $self->{'_n'} = $n;
173 1         10 ($self->{'_avX'}, $self->{'_avY'}) = ($sumX/$n, $sumY/$n);
174              
175             # sample var's and cov's (using n-1)
176 1         4 $self->{'_varX'} = $self->{'_ssdX'} / ($n-1);
177 1         3 $self->{'_varY'} = $self->{'_ssdY'} / ($n-1);
178 1         8 $self->{'_covXY'} = $self->{'_ssdXY'} / ($n-1);
179              
180             # coefficient estimates
181 1 50       5 $self->{'_b2'} = $self->{'_ssdX'} == 0
182             ? undef
183             : $self->{'_ssdXY'} / $self->{'_ssdX'}; # slope
184 1         5 $self->{'_b1'} = ($sumY - $self->{'_b2'} * $sumX) / $n; # intercept
185              
186             # R-squared
187 1 50 33     11 $self->{'_rsq'} = ($self->{'_ssdX'} == 0 or $self->{'_ssdY'} == 0)
188             ? 1.0
189             : ($self->{'_ssdXY'} / $self->{'_ssdX'})
190             * ($self->{'_ssdXY'} / $self->{'_ssdY'}) ;
191             # $self->{'_rsq'} = $self->{'_b2'}**2 * $self->{'_ssdX'} / $self->{'_ssdY'};
192              
193             # error (residual) sum of squares
194 1         4 $self->{'_sse'} = $self->{'_ssdY'} - $self->{'_ssdX'} * $self->{'_b2'}**2;
195 1 50       5 $self->{'_sse'} = 0 if $self->{'_sse'} < 0; # potential precision problems
196              
197             # homoscedastic standard deviation of error term
198 1         7 $self->{'_sigma'} = sqrt ($self->{'_sse'}/($n-2));
199              
200             # standard error of coefficients and t-stats
201 1         4 $self->{'_seB1'} = $self->{'_seB2'} = undef;
202 1         2 $self->{'_t1'} = $self->{'_t2'} = undef;
203              
204 1 50       4 unless ($self->{'_ssdX'} == 0) {
205 1         3 $self->{'_seB1'} = $self->{'_sigma'} * sqrt ($sumXX / ($n*$self->{'_ssdX'}));
206 1         3 $self->{'_seB2'} = $self->{'_sigma'} / sqrt $self->{'_ssdX'};
207 1 50       15 $self->{'_t2'} = $self->{'_b2'} / $self->{'_seB2'} unless $self->{'_seB2'} == 0;
208 1 50       6 $self->{'_t1'} = $self->{'_b1'} / $self->{'_seB1'} unless $self->{'_seB1'} == 0;
209             }
210              
211             # durbin-watson
212 1         1 my $sum = 0;
213 1         2 my ($prevErr, $currentErr);
214              
215 1 50       3 if ($self->{'_sse'} == 0) {
216 0         0 $self->{'_dw'} = undef;
217             } else {
218 1 50       4 if ($self->{'_flatDataArray'}) {
219 1         1 $arrayref = $self->{'_xydata'};
220 1         2 $n = 1+$#{ $arrayref };
  1         16  
221 1         5 $prevErr = $self->{'_xydata'}[1]
222             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xydata'}[0];
223 1         4 for ($i=2; $i<$n; $i+=2) {
224 10         22 $currentErr = $self->{'_xydata'}[$i+1]
225             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xydata'}[$i];
226 10         12 $sum += ($currentErr - $prevErr)**2;
227 10         18 $prevErr = $currentErr;
228             }
229             } else {
230 0         0 $arrayref = $self->{'_xdata'};
231 0         0 $n = 1+$#{ $arrayref };
  0         0  
232 0         0 $prevErr = $self->{'_ydata'}[0]
233             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xdata'}[0];
234 0         0 for ( $i=1; $i<$n; $i++ ) {
235 0         0 $currentErr = $self->{'_ydata'}[$i]
236             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xdata'}[$i];
237 0         0 $sum += ($currentErr - $prevErr)**2;
238 0         0 $prevErr = $currentErr;
239             }
240             }
241 1         3 $self->{'_dw'} = $sum / $self->{'_sse'};
242             }
243              
244 1         2 $self->{'_gotMinMax'} = 0; # should recalculate min-max's if already calculated
245 1         3 return 1;
246             }
247              
248             sub minMax {
249 1     1 0 5 my $self = shift;
250 1 50       6 $self->_getMinMax() unless $self->{'_gotMinMax'};
251 1         5 return ($self->{'_xmin'}, $self->{'_xmax'},
252             $self->{'_ymin'}, $self->{'_ymax'});
253             }
254              
255 1     1 1 7 sub coefficients { my $self = shift; return ($self->{'_b1'}, $self->{'_b2'}); }
  1         4  
256              
257 1     1 1 4 sub rsq { my $self = shift; return $self->{'_rsq'}; }
  1         4  
258              
259 1     1 1 4 sub tstats { my $self = shift; return ($self->{'_t1'}, $self->{'_t2'}); }
  1         4  
260              
261 1     1 0 6 sub av { my $self = shift; return ($self->{'_avX'}, $self->{'_avY'}); }
  1         4  
262              
263 1     1 0 4 sub var { my $self = shift; return ($self->{'_varX'}, $self->{'_varY'},
  1         4  
264             $self->{'_covXY'}); }
265              
266 1     1 0 4 sub sigma { my $self = shift; return $self->{'_sigma'}; }
  1         3  
267              
268 1     1 0 10 sub size { my $self = shift; return $self->{'_n'}; }
  1         3  
269              
270 0     0 0 0 sub dw { my $self = shift; return $self->{'_dw'}; }
  0         0  
271              
272             sub residuals {
273 1     1 1 6 my $self = shift;
274 1         1 my ($n, $i, $arrayref);
275 1         2 my @result = ();
276              
277 1 50       3 if ($self->{'_flatDataArray'}) { # construct xy data array
278 1         2 $arrayref = $self->{'_xydata'};
279 1         2 $n = 1+$#{ $arrayref };
  1         1  
280 1         4 for ($i=0; $i<$n; $i+=2) {
281 11         20 $result[$i] = $self->{'_xydata'}[$i];
282 11         47 $result[$i+1] = $self->{'_xydata'}[$i+1]
283             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xydata'}[$i];
284             }
285             } else { # construct y data array
286 0         0 $arrayref = $self->{'_xdata'};
287 0         0 $n = 1+$#{ $arrayref };
  0         0  
288 0         0 for ( $i=0; $i<$n; $i++ ) {
289 0         0 $result[$i] = $self->{'_ydata'}[$i]
290             - $self->{'_b1'} - $self->{'_b2'} * $self->{'_xdata'}[$i];
291             }
292             }
293 1         8 return @result;
294             }
295              
296             sub predicted {
297 1     1 1 38 my $self = shift;
298 1         2 my ($n, $i, $arrayref);
299 1         3 my @result = ();
300              
301 1 50       3 if ($self->{'_flatDataArray'}) {
302 1         2 $arrayref = $self->{'_xydata'};
303 1         2 $n = 1+$#{ $arrayref };
  1         2  
304 1         4 for ($i=0; $i<$n; $i+=2) {
305 11         45 $result[$i] = $self->{'_xydata'}[$i];
306 11         34 $result[$i+1] = $self->{'_b1'} + $self->{'_b2'} * $self->{'_xydata'}[$i];
307             }
308             } else {
309 0         0 $arrayref = $self->{'_xdata'};
310 0         0 $n = 1+$#{ $arrayref };
  0         0  
311 0         0 for ( $i=0; $i<$n; $i++ ) {
312 0         0 $result[$i] = $self->{'_b1'} + $self->{'_b2'} * $self->{'_xdata'}[$i];
313             }
314             }
315 1         9 return @result;
316             }
317              
318              
319             #===================#
320             # private methods #
321             #===================#
322              
323             # initialization;
324             # this contains a record of all private data
325             # this is the place to start if you want to read the code.
326             sub _init {
327 1     1   1 my $self = shift;
328              
329             # $self->{'_flatDataArray'} = ''; # data passed as one flat or two data arrays?
330 1         6 $self->{'_dataIsSet'} = 0; # return error if asking to regress
331 1         2 $self->{'_errorMessage'} = '';
332            
333             # will hold references to caller's data array(s)
334             # $self->{'_xydata'} = $self->{'_xdata'} = $self->{'_ydata'} = '';
335              
336             # $self->{'_ssdX'} = $self->{'_ssdY'} = $self->{'_ssdXY'} = '';
337 1         3 $self->{'_n'} = 0; # num observations
338             # $self->{'_avX'} = $self->{'_avY'} = '';
339             # $self->{'_varX'} = $self->{'_vary'} = $self->{'_covXY'} = '';
340              
341 1         3 $self->{'_gotMinMax'} = 0; # do not calculate again
342             # $self->{'_xmin'} = $self->{'_xmax'} = 0;
343             # $self->{'_ymin'} = $self->{'_ymax'} = 0;
344              
345             # $self->{'_b1'} = $self->{'_b2'} = '';
346             # $self->{'_rsq'} = $self->{'_sse'} = $self->{'_sigma'} = '';
347              
348             # $self->{'_seB1'} = $self->{'_seB2'} = undef;
349             # $self->{'_t1'} = $self->{'_t2'} = undef;
350             # $self->{'_dw'} = undef;
351             }
352              
353              
354             # sets min and max values of all data (_xmin, _ymin, _xmax, _ymax);
355             # also sets _xslope, _yslope, _ax and _ay used in _data2pxl;
356             # usage: $self->_getMinMax
357             sub _getMinMax {
358 1     1   2 my $self = shift;
359 1         7 my ($i, $n, $arrayref);
360              
361 1 50       4 if ($self->{'_flatDataArray'}) {
362 1         4 $self->{'_xmin'} = $self->{'_xmax'} = $self->{'_xydata'}[0];
363 1         4 $self->{'_ymin'} = $self->{'_ymax'} = $self->{'_xydata'}[1];
364 1         1 $arrayref = $self->{'_xydata'};
365 1         2 $n = 1+$#{ $arrayref };
  1         2  
366 1         5 for ($i=2; $i<$n; $i+=2) {
367 10 100       24 $self->{'_xmin'} = $self->{'_xydata'}[$i]
368             if $self->{'_xydata'}[$i] < $self->{'_xmin'};
369 10 100       22 $self->{'_xmax'} = $self->{'_xydata'}[$i]
370             if $self->{'_xydata'}[$i] > $self->{'_xmax'};
371 10 100       27 $self->{'_ymin'} = $self->{'_xydata'}[$i+1]
372             if $self->{'_xydata'}[$i+1] < $self->{'_ymin'};
373 10 50       27 $self->{'_ymax'} = $self->{'_xydata'}[$i+1]
374             if $self->{'_xydata'}[$i+1] > $self->{'_ymax'};
375             }
376 1         3 $n /= 2; # number of observations
377             } else {
378 0         0 $self->{'_xmin'} = $self->{'_xmax'} = $self->{'_xdata'}[0];
379 0         0 $self->{'_ymin'} = $self->{'_ymax'} = $self->{'_ydata'}[0];
380 0         0 $arrayref = $self->{'_xdata'};
381 0         0 $n = 1+$#{ $arrayref };
  0         0  
382 0         0 for ( $i=1; $i<$n; $i++ ) {
383 0 0       0 $self->{'_xmin'} = $self->{'_xdata'}[$i]
384             if $self->{'_xdata'}[$i] < $self->{'_xmin'};
385 0 0       0 $self->{'_xmax'} = $self->{'_xdata'}[$i]
386             if $self->{'_xdata'}[$i] > $self->{'_xmax'};
387 0 0       0 $self->{'_ymin'} = $self->{'_ydata'}[$i]
388             if $self->{'_ydata'}[$i] < $self->{'_ymin'};
389 0 0       0 $self->{'_ymax'} = $self->{'_ydata'}[$i]
390             if $self->{'_ydata'}[$i] > $self->{'_ymax'};
391             }
392             }
393 1         3 $self->{'_gotMinMax'} = 1;
394             }
395              
396             1;
397              
398             __END__