File Coverage

blib/lib/Statistics/Autocorrelation.pm
Criterion Covered Total %
statement 44 81 54.3
branch 5 16 31.2
condition 4 12 33.3
subroutine 8 10 80.0
pod 2 2 100.0
total 63 121 52.0


line stmt bran cond sub pod time code
1             package Statistics::Autocorrelation;
2            
3 2     2   55339 use warnings;
  2         5  
  2         67  
4 2     2   11 use strict;
  2         5  
  2         74  
5 2     2   11 use Carp 'croak';
  2         10  
  2         145  
6 2     2   1770 use Statistics::Lite 'mean';
  2         3144  
  2         2192  
7            
8             =head1 NAME
9            
10             Statistics-Autocorrelation - Coefficients for any lag
11            
12             =head1 VERSION
13            
14             Version 0.02
15            
16             =cut
17            
18             our $VERSION = '0.02';
19            
20             =head1 SYNOPSIS
21            
22             use Statistics::Autocorrelation;
23             $acorr = Statistics::Autocorrelation->new();
24             $coeff = $acorr->coefficient(data => \@data, lag => integer (from 1 to N-1), exact => 0, simple => 1);
25            
26             =head1 DESCRIPTION
27            
28             Calculates autocorrelation coefficients for a single series of numerical data, for any valid length of I.
29            
30             =head1 SUBROUTINES/METHODS
31            
32             =head2 new
33            
34             $acorr = Statistics::Autocorrelation->new();
35            
36             Return a new class object for accessing its methods.
37            
38             =cut
39            
40             sub new {
41 1     1 1 12 my $class = shift;
42 1         3 my $self = {};
43 1         3 bless $self, $class;
44             #$self->init(@_);
45 1         3 return $self;
46             }
47            
48             =head2 coefficient
49            
50             $autocorr->coefficient(data => \@data, lag => integer (from 1 to N-1), exact => 0|1, simple => 1|0);
51            
52             I: C
53            
54             Corporate stats programs (e.g., SPSS/PASW), and examples of autocorrelation on the web (e.g., L), often yield/demonstrate a value of the autocorrelation coefficient that assumes that the underlying series is stationary (has no linear or curvilinear trend, no periodicity), and so can be calculated as the population autocorrelation coefficient, the ratio of the autocovariance to the overall variance. To be a valid estimate of the sample autocorrelation coefficient, it is assumed that the number of observations, I, in the sample is "reasonably large" - so that all the observations in each summation in the numerator are taken relative to the mean of the whole series, rather than the exact value at lag I, and also that the variance used in the denominator can be that of the whole series - instead of using completely pairwise products, i.e., making the coefficient dependent on the values of I, ..., I, and I, ..., I at each summation. Additionally, these sources also commonly drop the factor I/(I - 1), assuming that the value is close enough to 1 for large I.
55            
56             By default, then, this method returns an estimate of the population autocorrelation coefficient, and the divisor factors are dropped; i.e., the default values of the options C = 0, and C = 1. The default value returned from this method, then, is equivalent to those returned from such corporate software, and demonstrated via such URLs, as cited; and this is also the default form of calculating the coefficient as used in texts such as Chatfield (1975). If you want, however, to keep these divisors, you need to specify C = 0; and if you want, furthermore, the exact sample autocorrelation coefficient, then specify C = 1; then you get the coefficient as calculated by Kendall's (1973) Eq. 3.35.
57            
58             A croak will be heard if no value is given for C (expects an array reference), or if the array is empty. A value for C is the only one for which there is no default value and must be given.
59            
60             If a value is not given for C, or it equals zero, it is set to the value of 1 by default.
61            
62             =cut
63            
64             sub coefficient {
65 2     2 1 1339 my $self = shift;
66 2         8 my (%args) = @_;
67 2 50       9 my $dat = ref $args{'data'} ? $args{'data'} : croak 'No value for data for calculating coefficient';
68 2   33     3 my $n = scalar(@{$dat}) || croak 'No data are available for calculating coefficient';
69 2 50       6 croak 'Can\'t autocorrelate a dataset of only one element' if $n < 2;
70 2   50     16 my $lag = $args{'lag'} || 1;
71             #croak "Can\'t autocorrelate with a lag of < $lag > for only < $n > data" if $n - $lag == 1 && !$args{'circular'};
72 2   50     10 $args{'exact'} ||= 0;
73            
74 2         2 my $rk;
75 2 50       6 if ($args{'exact'} == 1) {
76 0 0       0 if ($args{'circular'}) {
77 0         0 $rk = _calc_exact_circ($dat, $n, $lag);
78             }
79             else {
80 0         0 $rk = _calc_exact($dat, $n, $lag);
81             }
82             }
83             else {
84 2 50       5 if ($args{'circular'}) {
85 0         0 $rk = _calc_approx_circ($dat, $n, $lag, $args{'simple'});
86             }
87             else {
88 2         12 $rk = _calc_approx($dat, $n, $lag, $args{'simple'});
89             }
90             }
91 2         106 return $rk;
92             }
93             *coeff = \&coefficient;
94             # Kendall's (1973) Eq. 3.35., p. 40
95            
96             sub _calc_exact {
97 0     0   0 my ($dat, $n, $k) = @_;
98            
99 0         0 my $c0 = 1 / ($n - $k);
100 0         0 my ($sum_ui, $sum_uik, $sum_prodsum, $sum_sq_ui, $sum_sq_uik, $i) = (0, 0, 0, 0);
101            
102 0         0 for ($i = 0; $i < $n - $k; $i++) {
103 0         0 $sum_ui += $dat->[$i];
104             }
105 0         0 for ($i = 0; $i < $n - $k; $i++) {
106 0         0 $sum_uik += $dat->[$i + $k];
107             }
108            
109 0         0 my $c1 = $c0 * $sum_ui;
110 0         0 my $c2 = $c0 * $sum_uik;
111            
112             # Numerator:
113 0         0 for ($i = 0; $i < $n - $k; $i++) {
114 0         0 $sum_prodsum += ($dat->[$i] - $c1) * ($dat->[$i + $k] - $c2);
115             }
116 0         0 my $numerator = $c0 * $sum_prodsum;
117            
118             # Denominator:
119 0         0 for ($i = 0; $i < $n - $k; $i++) {
120 0         0 $sum_sq_ui += ($dat->[$i] - $c1)**2;
121             }
122 0         0 for ($i = 0; $i < $n - $k; $i++) {
123 0         0 $sum_sq_uik += ($dat->[$i + $k] - $c2)**2;
124             }
125            
126 0         0 my $denominator = sqrt($c0 * $sum_sq_ui * $c0 * $sum_sq_uik);
127            
128 0         0 my $rk = $numerator / $denominator;
129            
130 0         0 return $rk;
131            
132             }
133            
134             # Kendall Eq. 3.36
135            
136             sub _calc_approx {
137 2     2   4 my ($dat, $n, $k) = @_;
138 2         3 my $simple = shift;
139 2         1 my $mean = mean(@{$dat});
  2         11  
140 2         32 my ($rk, $sum_resid, $sum_sq, $i) = ();
141            
142 2         27 for ($i = 0; $i < $n - $k; $i++) {
143 2         10 $sum_resid += ($dat->[$i] - $mean)*($dat->[$i + $k] - $mean);
144             }
145            
146 2         7 $sum_sq = _sum_sq($dat, $n, $mean);
147            
148 2 50 33     19 if (defined $simple && !$simple) {
149 0         0 $rk = ($sum_resid / ($n - $k)) / ($sum_sq/$n);
150             }
151             else {
152 2         4 $rk = $sum_resid / $sum_sq;
153             }
154            
155 2         6 return $rk;
156             }
157            
158             sub _calc_approx_circ {
159 0     0   0 my ($dat, $n, $k) = @_;
160 0   0     0 my $simple = shift || 0;
161 0         0 my $mean = mean(@{$dat});
  0         0  
162 0         0 my ($rk, $sum_resid, $sum_sq, $i) = ();
163            
164 0         0 for ($i = 0; $i < $n; $i++) { # go right up to the last value in the data
165 0 0       0 my $i2 = $i + $k >= $n ? $k : $i + $k;
166 0         0 $sum_resid += ($dat->[$i] - $mean)*($dat->[$i2] - $mean);
167             }
168            
169 0         0 $sum_sq = _sum_sq($dat, $n, $mean);
170            
171 0 0       0 if (!$simple) {
172 0         0 $rk = ($sum_resid / ($n - $k)) / ($sum_sq / $n);
173             }
174             else {
175 0         0 $rk = $sum_resid / $sum_sq;
176             }
177            
178 0         0 return $rk;
179             }
180            
181             sub _sum_sq {
182 2     2   3 my ($dat, $n, $mean) = @_;
183 2         3 my ($i, $sum) = ();
184 2         7 for ($i = 0; $i < $n; $i++) {
185 4         21 $sum += ($dat->[$i] - $mean)**2;
186             }
187 2         4 return $sum;
188             }
189            
190             =head1 REFERENCES
191            
192             Chatfield, C. (1975). I. London, UK: Chapman and Hall.
193            
194             Kendall, M. G. (1973). I. London, UK: Griffin.
195            
196             =head1 AUTHOR
197            
198             Roderick Garton, C<< >>
199            
200             =head1 BUGS
201            
202             Please report any bugs or feature requests to C, or through
203             the web interface at L. I will be notified, and then you'll
204             automatically be notified of progress on your bug as I make changes.
205            
206             =head1 SUPPORT
207            
208             You can find documentation for this module with the perldoc command.
209            
210             perldoc Statistics::Autocorrelation
211            
212            
213             You can also look for information at:
214            
215             =over 4
216            
217             =item * RT: CPAN's request tracker
218            
219             L
220            
221             =item * AnnoCPAN: Annotated CPAN documentation
222            
223             L
224            
225             =item * CPAN Ratings
226            
227             L
228            
229             =item * Search CPAN
230            
231             L
232            
233             =back
234            
235             =head1 LICENSE AND COPYRIGHT
236            
237             Copyright 2011 Roderick Garton.
238            
239             This program is free software; you can redistribute it and/or modify it
240             under the terms of either: the GNU General Public License as published
241             by the Free Software Foundation; or the Artistic License.
242            
243             See http://dev.perl.org/licenses/ for more information.
244            
245             =cut
246            
247             1; # End of Statistics::Autocorrelation