File Coverage

blib/lib/Statistics/TheilSenEstimator.pm
Criterion Covered Total %
statement 15 94 15.9
branch 0 36 0.0
condition 0 42 0.0
subroutine 5 11 45.4
pod 6 6 100.0
total 26 189 13.7


line stmt bran cond sub pod time code
1             package Statistics::TheilSenEstimator;
2              
3 1     1   31390 use 5.006;
  1         4  
  1         52  
4 1     1   5 use strict;
  1         2  
  1         32  
5 1     1   6 use Carp;
  1         12  
  1         81  
6 1     1   5 use warnings FATAL => 'all';
  1         1  
  1         63  
7             require Exporter;
8              
9             our @ISA = qw/Exporter/;
10 1     1   920 use Statistics::QuickMedian qw/qmedian/;
  1         455  
  1         1296  
11              
12             our @EXPORT_OK = qw/theilsen/;
13              
14             =head1 NAME
15              
16             Statistics::TheilSen - Perl implementation of Theil Sen Estimator
17              
18             =head1 VERSION
19              
20             Version 0.06
21              
22             =cut
23              
24             our $VERSION = '0.06';
25              
26              
27             =head1 SYNOPSIS
28              
29             This is a perl implementation of the Theil Sen Estimator, which is a method of
30             linear regression that uses medians. All of the gradients of the lines between
31             all points are calculated, and hte median is the one reported. Sounds trivial.
32             If you have 1000s of points, then you have millions of lines, and sort-based
33             median methods can take ages, so Statistics::TheilSen uses the partition-based
34             Statistics::QuickMedian.
35              
36             # OOP...
37            
38             use Statistics::TheilSenEstimator;
39              
40             my $tse = Statistics::TheilSenEstimator->new(\$y_values, \$x_values);
41             # which is really a shortcut for:
42             my $tse = Statistics::TheilSenEstimator->new();
43             $tse->addData(\@y_values, \@x_values); # listrefs of numeric scalars
44            
45             my $status_line = $tse->run(); # might tell if you had bad values, etc
46             print "y = ", $tse->m(), "x + ", $tse->c(); # y = mx + c
47            
48             # or procedural...
49            
50             use Statistics::TheilSenEstimator qw/theilsen/;
51              
52             my ($m,$c) = theilsen(\@y_values, \@x_values);
53              
54             =head1 EXPORT/SUBROUTINES
55              
56             =head2 theilsen
57              
58             Accepts two list refs, the lists should be the same length. They represent y and x series
59             which will be the subject of the regression. Returns a list of two
60              
61             use Statistics::TheilSenEstimator qw/theilsen/;
62              
63             my ($m,$b) = theilsen(\$y_values, \$x_values);
64              
65             =cut
66              
67             sub theilsen {
68 0     0 1   my ($y,$x) = @_;
69 0           my $n = @$y;
70 0 0         carp "y and x series are different lengths"
71             unless $n == @$x;
72             # all the gradients!
73 0           my @M = ();
74             # each item from start to penultimate
75 0           my ($x1,$x2,$y1,$y2);
76 0           foreach my $i(0 .. $n-2){
77 0           $y1 = $y->[$i];
78 0           $x1 = $x->[$i];
79 0 0 0       next unless defined $y1 && $y1 =~ /\d/ && defined $x1 && $x1 =~ /\d/;
      0        
      0        
80             # each item from next to last
81 0           foreach my $j($i+1 .. $n-1){
82 0           $y2 = $y->[$j];
83 0 0 0       next unless defined $y2 && $y2 =~ /\d/;
84             # short cut for zero (even if dx is zero ;-)
85 0 0         if($y2 == $y1){
86 0           push @M, 0;
87 0           next;
88             }
89 0           $x2 = $x->[$j];
90 0 0 0       next unless defined $x2 && $x2 =~ /\d/;
91             # skip any divisions by zero! (don't add to the list, if it's infinite then it's both pos and neg anyway!)
92 0 0         next if $x2 == $x1;
93             # otherwise, calculate the gradient and push it...
94 0           push @M, ($y2-$y1)/($x2-$x1);
95             }
96             }
97             # now we have @M, so what's the median?
98 0           my $m = qmedian(\@M); # warning... this modifies the order of M!
99            
100             # y-intercept b to be the median of the values yi - mxi
101 0           my @C = ();
102 0           foreach my $i(0 .. $n-1){
103 0           $y1 = $y->[$i];
104 0           $x1 = $x->[$i];
105 0 0 0       next unless defined $y1 && $y1 =~ /\d/ && defined $x1 && $x1 =~ /\d/;
      0        
      0        
106 0           push @C, $y1 - $m * $x1;
107             }
108             # now we have @C, so what's the median?
109 0           my $c = qmedian(\@C); # warning... this modifies the order of C!
110 0           return ($m,$c);
111             }
112              
113             =head1 METHODS
114              
115             =head2 new
116              
117             use Statistics::TheilSenEstimator;
118             my $tse = Statistics::TheilSenEstimator->new();
119             #or
120             my $tse = Statistics::TheilSenEstimator->new(\@y_values, \@x_values);
121            
122             returns a new Statistics::TheilSenEstimator estimator object with the optional data added.
123              
124             =cut
125              
126             sub new {
127 0     0 1   my $p = shift;
128 0   0       my $c = ref $p || $p;
129 0           my $o = {
130             Y=>[], # we store y series here
131             X=>[], # and x here
132             runSinceAddData=>0, # check whether a run is needed
133             m=>'',
134             c=>'',
135             };
136 0           bless $o, $c;
137 0 0         if(@_==2){
    0          
138 0           $o->addData(@_);
139             }
140             elsif(@_){
141 0           croak "wrong number of args to Statistics::TheilSen->new, should be 0 or 2.";
142             }
143 0           return $o;
144             }
145              
146             =head2 addData
147              
148             $tse->addData(\@y_values, \@x_values);
149            
150             Adds data to the y and x series. Data series should be the same length.
151              
152             =cut
153              
154             sub addData {
155 0     0 1   my $o = shift;
156 0 0         croak "wrong number of args to Statistics::TheilSen->new, should be 0 or 2."
157             unless @_ == 2;
158 0           my ($Y,$X) = @_;
159 0 0         croak "Y and X are not equal lengths"
160             unless @$Y == @$X;
161 0           push @{$o->{Y}}, @$Y;
  0            
162 0           push @{$o->{X}}, @$X;
  0            
163 0           $o->{runSinceAddData} = 0;
164             }
165              
166             =head2 run
167              
168             my $status_line = $tse->run();
169              
170             Runs the estimator on the data currently in the object. Returns any messages
171             about whether errors or weird things were found in the data. Sets m and c in
172             the object
173              
174             =cut
175              
176             sub run {
177 0     0 1   my $o = shift;
178             # fatal:
179 0           my $n = @{$o->{Y}};
  0            
180 0           return "Y and X are different lengths (fatal)"
181 0 0         if $n != @{$o->{X}};
182            
183             # "warnings" about data
184 0           my $message;
185             # count up how many of x2-x1 == 0...
186 0           my %X = ();
187 0           my $divZeroCounts = 0;
188 0           foreach (@{$o->{X}}){
  0            
189 0 0         if(exists $X{$_}){
190 0           $X{$_}++;
191 0           $divZeroCounts += $X{$_};
192             }
193             else {
194 0           $X{$_} = 0;
195             }
196             }
197 0           undef %X;
198 0 0         if($divZeroCounts){
199 0           $message .= "Denominator (x2-x1) is zero in $divZeroCounts cases. ";
200             }
201             # check missing values, etc.
202 0           my ($y,$x);
203 0           my $missing = 0;
204 0           foreach my $i(0..$n-1){
205 0           ($y,$x) = ($o->{Y}->[$i],$o->{X}->[$i]);
206 0 0 0       if(! defined $y || $y !~ /\d/ || $y != $y+0
      0        
      0        
      0        
      0        
207             || ! defined $x || $x !~ /\d/ || $x != $x+0){
208             # looks like x or y is NaN
209 0           $missing ++;
210             }
211             }
212 0 0         if($missing){
213 0           $message .= "Missing values on $missing rows. ";
214             }
215             # end of checks
216 0           ($o->{m}, $o->{c})
217             = theilsen(
218             $o->{Y},
219             $o->{X},
220             );
221 0           $o->{runSinceAddData} = 1;
222 0           return $message;
223             }
224              
225             =head2 m
226              
227             my $gradient = $tse->m();
228            
229             Returns "m", the gradient of the model generated by run(). If run() was not
230             called since addData(), then run() will be called here!
231              
232             =cut
233              
234             sub m {
235 0     0 1   my $o = shift;
236 0 0         $o->{runSinceAddData} || $o->run();
237 0           return $o->{m};
238             }
239              
240             =head2 c
241              
242             my $intersect = $tse->c();
243            
244             Returns "c", the intersect of the model generated by run(). If run() was not
245             called since addData(), then run() will be called here!
246              
247             =cut
248              
249             sub c {
250 0     0 1   my $o = shift;
251 0 0         $o->{runSinceAddData} || $o->run();
252 0           return $o->{c};
253             }
254              
255             =head1 AUTHOR
256              
257             Jimi Wills, C<< >>
258              
259             =head1 BUGS
260              
261             Please report any bugs or feature requests to C, or through
262             the web interface at L. I will be notified, and then you'll
263             automatically be notified of progress on your bug as I make changes.
264              
265              
266              
267              
268             =head1 SUPPORT
269              
270             You can find documentation for this module with the perldoc command.
271              
272             perldoc Statistics::TheilSenEstimator
273              
274              
275             You can also look for information at:
276              
277             =over 4
278              
279             =item * RT: CPAN's request tracker (report bugs here)
280              
281             L
282              
283             =item * AnnoCPAN: Annotated CPAN documentation
284              
285             L
286              
287             =item * CPAN Ratings
288              
289             L
290              
291             =item * Search CPAN
292              
293             L
294              
295             =back
296              
297              
298             =head1 ACKNOWLEDGEMENTS
299              
300             http://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
301              
302             =head1 LICENSE AND COPYRIGHT
303              
304             Copyright 2013 Jimi Wills.
305              
306             This program is free software; you can redistribute it and/or modify it
307             under the terms of the the Artistic License (2.0). You may obtain a
308             copy of the full license at:
309              
310             L
311              
312             Any use, modification, and distribution of the Standard or Modified
313             Versions is governed by this Artistic License. By using, modifying or
314             distributing the Package, you accept this license. Do not use, modify,
315             or distribute the Package, if you do not accept this license.
316              
317             If your Modified Version has been derived from a Modified Version made
318             by someone other than you, you are nevertheless required to ensure that
319             your Modified Version complies with the requirements of this license.
320              
321             This license does not grant you the right to use any trademark, service
322             mark, tradename, or logo of the Copyright Holder.
323              
324             This license includes the non-exclusive, worldwide, free-of-charge
325             patent license to make, have made, use, offer to sell, sell, import and
326             otherwise transfer the Package with respect to any patent claims
327             licensable by the Copyright Holder that are necessarily infringed by the
328             Package. If you institute patent litigation (including a cross-claim or
329             counterclaim) against any party alleging that the Package constitutes
330             direct or contributory patent infringement, then this Artistic License
331             to you shall terminate on the date that such litigation is filed.
332              
333             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
334             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
335             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
336             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
337             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
338             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
339             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
340             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
341              
342              
343             =cut
344              
345             1; # End of Statistics::TheilSen