File Coverage

blib/lib/Statistics/TheilSen.pm
Criterion Covered Total %
statement 15 93 16.1
branch 0 34 0.0
condition 0 33 0.0
subroutine 5 11 45.4
pod 6 6 100.0
total 26 177 14.6


line stmt bran cond sub pod time code
1             package Statistics::TheilSen;
2            
3 1     1   26379 use 5.006;
  1         4  
  1         82  
4 1     1   6 use strict;
  1         2  
  1         36  
5 1     1   5 use Carp;
  1         7  
  1         93  
6 1     1   5 use warnings FATAL => 'all';
  1         2  
  1         67  
7             require Exporter;
8            
9             our @ISA = qw/Exporter/;
10 1     1   907 use Statistics::QuickMedian qw/qmedian/;
  1         496  
  1         1258  
11            
12             our @EXPORT_OK = qw/theilsen/;
13            
14             =head1 NAME
15            
16             Statistics::TheilSen - Perl implementation of Theil Sen Estimator
17            
18             =head1 VERSION
19            
20             Version 0.03
21            
22             =cut
23            
24             our $VERSION = '0.03';
25            
26            
27             =head1 SYNOPSIS
28            
29             This is a perl implementation of the Theil Sen Estimator, which is a method of
30             linear regression that uses medians. All of the gradients of the lines between
31             all points are calculated, and hte median is the one reported. Sounds trivial.
32             If you have 1000s of points, then you have millions of lines, and sort-based
33             median methods can take ages, so Statistics::TheilSen uses the partition-based
34             Statistics::QuickMedian.
35            
36             # OOP...
37            
38             use Statistics::TheilSen;
39            
40             my $tse = Statistics::TheilSen->new(\$y_values, \$x_values);
41             # which is really a shortcut for:
42             my $tse = Statistics::TheilSen->new();
43             $tse->addData(\@y_values, \@x_values); # listrefs of numeric scalars
44            
45             my $status_line = $tse->run(); # might tell if you had bad values, etc
46             print "y = ", $tse->m(), "x + ", $tse->c(); # y = mx + c
47            
48             # or procedural...
49            
50             use Statistics::TheilSen qw/theilsen/;
51            
52             my ($m,$c) = theilsen(\@y_values, \@x_values);
53            
54             =head1 EXPORT/SUBROUTINES
55            
56             =head2 theilsen
57            
58             Accepts two list refs, the lists should be the same length. They represent y and x series
59             which will be the subject of the regression. Returns a list of two
60            
61             use Statistics::TheilSen qw/theilsen/;
62            
63             my ($m,$b) = theilsen(\$y_values, \$x_values);
64            
65             =cut
66            
67             sub theilsen {
68 0     0 1   my ($y,$x) = @_;
69 0           my $n = @$y;
70 0 0         carp "y and x series are different lengths"
71             unless $n == @$x;
72             # all the gradients!
73 0           my @M = ();
74             # each item from start to penultimate
75 0           my ($x1,$x2,$y1,$y2);
76 0           foreach my $i(0 .. $n-2){
77 0           $y1 = $y->[$i];
78 0           $x1 = $x->[$i];
79 0 0 0       next unless defined $y1 && $y1 =~ /\d/ && defined $x1 && $x1 =~ /\d/;
      0        
      0        
80             # each item from next to last
81 0           foreach my $j($i+1 .. $n-1){
82 0           $y2 = $y->[$j];
83 0 0 0       next unless defined $y2 && $y2 =~ /\d/;
84             # short cut for zero (even if dx is zero ;-)
85 0 0         if($y2 == $y1){
86 0           push @M, 0;
87 0           next;
88             }
89 0           $x2 = $x->[$j];
90 0 0 0       next unless defined $x2 && $x2 =~ /\d/;
91             # skip any divisions by zero! (don't add to the list, if it's infinite then it's both pos and neg anyway!)
92 0 0         next if $x2 == $x1;
93             # otherwise, calculate the gradient and push it...
94 0           push @M, ($y2-$y1)/($x2-$x1);
95             }
96             }
97             # now we have @M, so what's the median?
98 0           my $m = qmedian(\@M); # warning... this modifies the order of M!
99            
100             # y-intercept b to be the median of the values yi - mxi
101 0           my @C = ();
102 0           foreach my $i(0 .. $n-1){
103 0           $y1 = $y->[$i];
104 0           $x1 = $x->[$i];
105 0           push @C, $y1 - $m * $x1;
106             }
107             # now we have @C, so what's the median?
108 0           my $c = qmedian(\@C); # warning... this modifies the order of C!
109 0           return ($m,$c);
110             }
111            
112             =head1 METHODS
113            
114             =head2 new
115            
116             use Statistics::TheilSen;
117             my $tse = Statistics::TheilSen->new();
118             #or
119             my $tse = Statistics::TheilSen->new(\@y_values, \@x_values);
120            
121             returns a new Statistics::TheilSen estimator object with the optional data added.
122            
123             =cut
124            
125             sub new {
126 0     0 1   my $p = shift;
127 0   0       my $c = ref $p || $p;
128 0           my $o = {
129             Y=>[], # we store y series here
130             X=>[], # and x here
131             runSinceAddData=>0, # check whether a run is needed
132             m=>'',
133             c=>'',
134             };
135 0           bless $o, $c;
136 0 0         if(@_==2){
    0          
137 0           $o->addData(@_);
138             }
139             elsif(@_){
140 0           croak "wrong number of args to Statistics::TheilSen->new, should be 0 or 2.";
141             }
142 0           return $o;
143             }
144            
145             =head2 addData
146            
147             $tse->addData(\@y_values, \@x_values);
148            
149             Adds data to the y and x series. Data series should be the same length.
150            
151             =cut
152            
153             sub addData {
154 0     0 1   my $o = shift;
155 0 0         croak "wrong number of args to Statistics::TheilSen->new, should be 0 or 2."
156             unless @_ == 2;
157 0           my ($Y,$X) = @_;
158 0 0         croak "Y and X are not equal lengths"
159             unless @$Y == @$X;
160 0           push @{$o->{Y}}, @$Y;
  0            
161 0           push @{$o->{X}}, @$X;
  0            
162 0           $o->{runSinceAddData} = 0;
163             }
164            
165             =head2 run
166            
167             my $status_line = $tse->run();
168            
169             Runs the estimator on the data currently in the object. Returns any messages
170             about whether errors or weird things were found in the data. Sets m and c in
171             the object
172            
173             =cut
174            
175             sub run {
176 0     0 1   my $o = shift;
177             # fatal:
178 0           my $n = @{$o->{Y}};
  0            
179 0           return "Y and X are different lengths (fatal)"
180 0 0         if $n != @{$o->{X}};
181            
182             # "warnings" about data
183 0           my $message;
184             # count up how many of x2-x1 == 0...
185 0           my %X = ();
186 0           my $divZeroCounts = 0;
187 0           foreach (@{$o->{X}}){
  0            
188 0 0         if(exists $X{$_}){
189 0           $X{$_}++;
190 0           $divZeroCounts += $X{$_};
191             }
192             else {
193 0           $X{$_} = 0;
194             }
195             }
196 0           undef %X;
197 0 0         if($divZeroCounts){
198 0           $message .= "Denominator (x2-x1) is zero in $divZeroCounts cases. ";
199             }
200             # check missing values, etc.
201 0           my ($y,$x);
202 0           my $missing = 0;
203 0           foreach my $i(0..$n-1){
204 0           ($y,$x) = ($o->{Y}->[$i],$o->{X}->[$i]);
205 0 0 0       if(! defined $y || $y !~ /\d/ || $y != $y+0
      0        
      0        
      0        
      0        
206             || ! defined $x || $x !~ /\d/ || $x != $x+0){
207             # looks like x or y is NaN
208 0           $missing ++;
209             }
210             }
211 0 0         if($missing){
212 0           $message .= "Missing values on $missing rows. ";
213             }
214             # end of checks
215 0           ($o->{m}, $o->{c})
216             = theilsen(
217             $o->{Y},
218             $o->{X},
219             );
220 0           $o->{runSinceAddData} = 1;
221 0           return $message;
222             }
223            
224             =head2 m
225            
226             my $gradient = $tse->m();
227            
228             Returns "m", the gradient of the model generated by run(). If run() was not
229             called since addData(), then run() will be called here!
230            
231             =cut
232            
233             sub m {
234 0     0 1   my $o = shift;
235 0 0         $o->{runSinceAddData} || $o->run();
236 0           return $o->{m};
237             }
238            
239             =head2 c
240            
241             my $intersect = $tse->c();
242            
243             Returns "c", the intersect of the model generated by run(). If run() was not
244             called since addData(), then run() will be called here!
245            
246             =cut
247            
248             sub c {
249 0     0 1   my $o = shift;
250 0 0         $o->{runSinceAddData} || $o->run();
251 0           return $o->{c};
252             }
253            
254             =head1 AUTHOR
255            
256             Jimi Wills, C<< >>
257            
258             =head1 BUGS
259            
260             Please report any bugs or feature requests to C, or through
261             the web interface at L. I will be notified, and then you'll
262             automatically be notified of progress on your bug as I make changes.
263            
264            
265            
266            
267             =head1 SUPPORT
268            
269             You can find documentation for this module with the perldoc command.
270            
271             perldoc Statistics::TheilSen
272            
273            
274             You can also look for information at:
275            
276             =over 4
277            
278             =item * RT: CPAN's request tracker (report bugs here)
279            
280             L
281            
282             =item * AnnoCPAN: Annotated CPAN documentation
283            
284             L
285            
286             =item * CPAN Ratings
287            
288             L
289            
290             =item * Search CPAN
291            
292             L
293            
294             =back
295            
296            
297             =head1 ACKNOWLEDGEMENTS
298            
299             http://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
300            
301             =head1 LICENSE AND COPYRIGHT
302            
303             Copyright 2013 Jimi Wills.
304            
305             This program is free software; you can redistribute it and/or modify it
306             under the terms of the the Artistic License (2.0). You may obtain a
307             copy of the full license at:
308            
309             L
310            
311             Any use, modification, and distribution of the Standard or Modified
312             Versions is governed by this Artistic License. By using, modifying or
313             distributing the Package, you accept this license. Do not use, modify,
314             or distribute the Package, if you do not accept this license.
315            
316             If your Modified Version has been derived from a Modified Version made
317             by someone other than you, you are nevertheless required to ensure that
318             your Modified Version complies with the requirements of this license.
319            
320             This license does not grant you the right to use any trademark, service
321             mark, tradename, or logo of the Copyright Holder.
322            
323             This license includes the non-exclusive, worldwide, free-of-charge
324             patent license to make, have made, use, offer to sell, sell, import and
325             otherwise transfer the Package with respect to any patent claims
326             licensable by the Copyright Holder that are necessarily infringed by the
327             Package. If you institute patent litigation (including a cross-claim or
328             counterclaim) against any party alleging that the Package constitutes
329             direct or contributory patent infringement, then this Artistic License
330             to you shall terminate on the date that such litigation is filed.
331            
332             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
333             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
334             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
335             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
336             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
337             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
338             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
339             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
340            
341            
342             =cut
343            
344             1; # End of Statistics::TheilSen