File Coverage

blib/lib/Statistics/Reproducibility.pm
Criterion Covered Total %
statement 21 287 7.3
branch 0 74 0.0
condition 0 51 0.0
subroutine 7 27 25.9
pod 20 20 100.0
total 48 459 10.4


line stmt bran cond sub pod time code
1             package Statistics::Reproducibility;
2              
3 1     1   40041 use 5.006;
  1         4  
  1         245  
4 1     1   6 use strict;
  1         3  
  1         43  
5 1     1   6 use warnings FATAL => 'all';
  1         14  
  1         56  
6 1     1   5 use Carp;
  1         3  
  1         1697  
7              
8             #use Math::Geometry::Multidimensional qw/distanceToLineN diagonalComponentsN diagonalDistancesFromOriginN/;
9 1     1   1637 use Statistics::TheilSenEstimator qw/theilsen/;
  1         3869  
  1         67  
10 1     1   8 use Statistics::QuickMedian qw/qmedian/;
  1         3  
  1         36  
11 1     1   1122 use Statistics::Distributions;
  1         5815  
  1         7738  
12              
13             =head1 NAME
14              
15             Statistics::Reproducibility - Reproducibility measurement between multiple replicate experiments
16              
17             =head1 VERSION
18              
19             Version 0.09
20              
21             =cut
22              
23             our $VERSION = '0.09';
24              
25              
26             =head1 SYNOPSIS
27              
28             This module facilitates investigation of reproducbility between multiple replicates of
29             quantitative experiments e.g. SILAC or microarray. Scatter plots are great, but
30             only 2d. Some people use correlation as a proxy for reproducibility, but it's not right.
31             This module helps you through the following items...
32              
33             1) Summarize reproducibility across the replicates
34             2) Pick out replicates that agree more or less
35             3) Summarize reproducibility for individual proteins/genes/whatever
36             4) Set a cutoff for what you can call significant, based on precision
37             5) Deal with missing values (common in SILAC)
38              
39             This works by going through the following steps:
40              
41             (0) Choose a dataset to compare everything else to (the middlemost)
42             1) Put the middle of the data at (0,0,0,0...) by subtracting the median ... report the median
43             2) Rotate the data so the line x=y=z=... lies on a single axis. The data should be spread along this axis.
44             3) Do regression on the data and work out "wrongness" of each replicate (!)
45             4) Calculate and report ratio variance and imprecision variance
46             5) Report combined ratio and error for each protein/gene/whatever
47              
48             Perhaps a little code snippet.
49              
50             use Statistics::Reproducibility;
51              
52             my $r = Statistics::Reproducibility->new();
53            
54             =head1 SUBROUTINES/METHODS
55              
56             =head2 new
57              
58             =cut
59              
60             sub new {
61 0     0 1   my $p = shift;
62 0   0       my $c = ref $p || $p;
63 0           my $o = {
64             # scalars:
65             comparatorIndex => 0, # index of column used to compare
66             k => '', # number of columns
67             n => '', # number of rows
68             vE => '', # variance of "error" (imprecision)
69             vS => '', # variable of experimental spread
70             sdE => '', # s.d. error
71             sdS => '', # s.d. spread
72             derivedFrom => '', # the object derived from
73             derivedReason => '', # the reason the object was derived (e.g. deDiagonalize)
74            
75             # arrays (foreach column)
76             'm' => [], # regression denominator
77             'c' => [], # regression constant
78             # arrays (foreach row)
79             pee => '', # p-value of error
80             pss => '', # p-value of spread
81             pes => '', # p-value of error over spread (??)
82             pse => '', # p-value of spread over error
83            
84             # 2D array (LoL)
85             data => [],
86            
87             #meta info
88             copyOnDerive => [qw/comparatorIndex k n vE vS sdE sdS m c pee pss pes pse copyOnDerive obs/]
89             };
90 0           bless $o, $c;
91 0           return $o;
92             }
93              
94             =head2 derive
95              
96             derives a new object from an old one... some fields are conserved.
97             Warning: references are copied, so m and c point to the same arrays!
98             However, if you run regression() again, they will point to new arrays.
99             Data is set up with k empty columns.
100              
101             =cut
102              
103             sub derive {
104 0     0 1   my ($o,$reason) = @_;
105 0           my $r = $o->new;
106 0           foreach (@{$o->{copyOnDerive}}){
  0            
107 0           $r->{$_} = $o->{$_};
108             }
109 0           $r->{derivedFrom} = $o;
110 0           $r->{derivedReason} = $reason;
111 0           $r->{data} = [map {[]} (0..$o->{k}-1)];
  0            
112 0           return $r;
113             }
114              
115             =head2 data
116              
117             Set the data. Should be rectangular, i.e. all columns the same length, and
118             we'll check it is and croak if not...
119             but can contain "empty" cells (empty string), which represent missing values
120             in the data.
121              
122             returns the object for chaining.
123              
124             =cut
125              
126             sub data {
127 0     0 1   my ($o,@columns) = @_;
128 0           $o->{data} = [@columns];
129 0           $o->{k} = @columns;
130 0           $o->{n} = @{$columns[0]};
  0            
131 0           foreach (1 .. $o->{k}-1){
132 0           croak "columns different lengths!"
133 0 0         unless @{$columns[$_]} == $o->{n};
134             }
135 0           return $o;
136             }
137              
138             =head2 run
139              
140             runs a recommended workflow. it's a shortcut for:
141              
142              
143             my $m = $r->subtractMedian();
144             $m->middlemostColumn();
145             my $d = $m->deDiagonalize();
146             $d -> regression();
147             my $e = $d->rotateToRegressionLine();
148             $e->variances();
149              
150             It returns the last object. So you could do:
151              
152             my $results = Statistics::Reproducibility
153             ->new()
154             ->data($mydata)
155             ->run()
156             ->printableTable($depth);
157              
158             =cut
159              
160             sub run {
161 0     0 1   my $r = shift;
162             # $r->data([qw/1 2 3 4 5 6 7 8/],[qw/0 1 2 3 4 5 6 7/],[qw/2.1 3.2 4.3 5.4 6.5 7.6 8.7 9.8/]);
163 0           $r->countObservations();
164 0           $r->regression();
165 0           my $m = $r->subtractMedian();
166 0           $m->applyMinimumObservations(2);
167 0           $m->middlemostColumn();
168 0           my $d = $m->deDiagonalize();
169 0           $d->applyMinimumObservations(2);
170 0           $d->regression();
171 0           my $e = $d->rotateToRegressionLine();
172 0           $e->applyMinimumObservations(2);
173 0           $e->variances();
174 0           return $e;
175             }
176              
177             =head2 subtractMedian
178              
179             calculates the median for each column, substracts from each column and
180             returns the new object.
181              
182             =cut
183              
184             sub subtractMedian {
185 0     0 1   my $o = shift;
186 0           my $r = $o->derive('subtractMedian');
187 0 0         my @medians = map {qmedian([map {$_ eq '' ? () : $_} @$_])} @{$o->{data}};
  0            
  0            
  0            
188 0           foreach my $i(0..$#medians){
189 0 0         $r->{data}->[$i] = [map {$_ eq '' ? '' : $_ - $medians[$i]} @{$o->{data}->[$i]}];
  0            
  0            
190             }
191 0           $r->{medians} = \@medians;
192 0           return $r;
193             }
194              
195             =head2 middlemostColumn
196              
197             Calculates which of the columns is middlemost and remembers it so all
198             others are compared to it. This can be done instead of using a constructed
199             median dataset as the comparator so that the constructed one does not add to
200             the spread, and does not contribute to the observation count.
201              
202             Note: the method of scoring the columns involves counting which has
203             the most middlemost values. For two columns only, the result will always
204             be the one with the least missing values. I don't think there's anything
205             wrong with that, but just so you know!
206              
207             =cut
208              
209             sub middlemostColumn {
210 0     0 1   my $o = shift;
211             # which is the middle most column? i.e. who has the most medians?
212            
213 0           my @medianCounts = map {0} (1..$o->{k}); # stash counts
  0            
214            
215 0           foreach my $i(0..$o->{n}-1){ # each row
216 0           my @row = ();
217 0           foreach my $j(0..$o->{k}-1){
218 0 0 0       if(defined $o->{data}->[$j]->[$i]
219             && $o->{data}->[$j]->[$i] ne ''){
220 0           push @row, $o->{data}->[$j]->[$i];
221             }
222             }
223             # who's in the middle?
224 0           foreach(medianI(@row)){
225 0           $medianCounts[$_] ++;
226             }
227             }
228            
229             # which has the most middlemost values?
230 0           my $imax = 0;
231 0           my $max = 0;
232 0           foreach my $i(0..$o->{k}-1){
233 0 0         if($medianCounts[$i] > $max){
234 0           $imax = $i;
235 0           $max = $medianCounts[$i];
236             }
237             }
238            
239             # so now we want to put this column on the left? Or should we just
240             # store that we're going to use this one as the comparator?
241 0           $o->{comparatorIndex} = $imax;
242 0           return $imax;
243             }
244              
245             =head2 constructMedianLeft
246              
247             Make a median column and pop it on the left. Note that the
248             regular median is used here, not the Quick Median estimator. This means
249             that for an even number of observations, the mean of the two middlemost is
250             used, which is not the case for Quick Median.
251              
252             =cut
253              
254             sub constructMedianLeft {
255 0     0 1   my $o = shift;
256 0           my @newcol = ();
257 0           foreach my $i(0..$o->{n}-1){
258 0           my @row = ();
259 0           foreach my $j(0..$o->{k}-1){
260 0 0 0       if(defined $o->{data}->[$i]->[$j]
261             && $o->{data}->[$i]->[$j] ne ''){
262 0           push @row, $o->{data}->[$i]->[$j];
263             }
264             }
265 0           push @newcol, median(@row);
266             }
267 0           unshift @{$o->{data}}, \@newcol;
  0            
268 0           return \@newcol;
269             }
270              
271             =head2 deDiagonalize
272              
273             Replicated data with some spread will naturally lie along the diagonal line,
274             y=x (in 2 dimensions, or z=y=x... in more). This function aligns the data
275             along one axis by rotation. This is done so that (a) errors are measured
276             approximately perpendicular to the spread of data and (b) unspread data
277             (a ball of points) gives a gradient of zero in Theil Sen estimator, which is
278             correct because if there's no experimental spread then there can be no
279             evidence that the replicates disagree.
280              
281             Note: at this point, any missing values are REPLACED BY ZEROS! This means
282             that these data point will not disagree with any "unchanging" data, but they
283             will not support the reproducibility of "changed" data (data for proteins/genes)
284             that are regulated). The effect of this is that those points will not appear as
285             extreme in the output and will also have a larger error associated with them.
286              
287             A NEW object is returned! comparatorIndex is honoured and conserved,
288             meaning that if you ran middlemostColumn, the result is the column used
289             as the Y axis in all comparisons, and the column itself will contain the
290             experimental variance.
291              
292             =cut
293              
294             sub deDiagonalize {
295 0     0 1   my $o = shift;
296 0           my $r = $o->derive('deDiagonalize');
297            
298 0           my $ic = $o->{comparatorIndex};
299              
300 0           my $a = atan2(1,1);
301            
302 0           foreach my $i(0..$o->{k}-1){
303 0 0         next if $i == $ic;
304 0           foreach my $j(0..$o->{n}-1){
305 0   0       my $y = $o->{data}->[$i]->[$j] || 0;
306 0   0       my $x = $o->{data}->[$ic]->[$j] || 0;
307              
308 0 0 0       if($y || $x){
309 0           my $t = atan2($y,$x) - $a;
310 0           my $r = sqrt($x**2 + $y**2);
311 0           ($x,$y) = ($r*cos($t), $r*sin($t));
312             }
313              
314 0           $r->{data}->[$i]->[$j] = $y;
315 0           $r->{data}->[$ic]->[$j] = $x;
316             }
317              
318             # $r->{data}->[$i] = diagonalComponentsN(
319             # $o->{data}->[$i], $o->{data}->[$ic]
320             # )
321             }
322            
323             #$r->{data}->[$ic] = diagonalDistancesFromOriginN(
324             # $o->{k}, $o->{n}, @{$o->{data}}
325             #);
326 0           return $r;
327             }
328              
329             =head2 countObservations
330              
331             Counts the number of observations present in each point and stores in obs.
332             The result is used by applyMinimumObservations to check for unwanted data
333             before a processing event which turns empties into zeros (like deDiagonalize).
334              
335             =cut
336              
337             sub countObservations {
338 0     0 1   my ($o) = @_;
339 0           my @obs = ();
340 0           foreach my $j(0..$o->{n}-1){
341 0           my $c = 0;
342 0           foreach my $i(0..$o->{k}-1){
343 0 0 0       $c++ if defined $o->{data}->[$i]->[$j] && $o->{data}->[$i]->[$j] ne '';
344             }
345 0           push @obs, $c;
346             }
347 0           $o->{obs} = \@obs;
348             }
349              
350             =head2 applyMinimumObservations
351              
352             A method that blanks any data that does not have a minimum number of
353             values, e.g. if the minimum were 2, the point [2,3,undef] would be fine
354             but [2,undef,undef] would become [undef,undef,undef]
355              
356             =cut
357              
358             sub applyMinimumObservations {
359 0     0 1   my ($o,$min) = @_;
360 0           foreach my $j(0..$o->{n}-1){
361 0 0         if($o->{obs}->[$j] < $min){
362 0           foreach my $i(0..$o->{k}-1){
363 0           $o->{data}->[$i]->[$j] = '';
364             }
365 0 0         $o->{d}->[$j] = '' if exists $o->{d};
366             }
367             }
368             }
369              
370             =head2 regression
371              
372             Perform Theil Sen Estimator regression on the data. The regression is
373             done with the comparator on the x axis, but the symmetric parameters
374             are returned for the comparator on the y-axis.
375              
376             =cut
377              
378             sub regression {
379 0     0 1   my $o = shift;
380 0           my @m = map {1} (0..$o->{k}-1);
  0            
381 0           my @c = map {0} (0..$o->{k}-1);
  0            
382 0           foreach my $i(0..$o->{k}-1){
383 0 0         next if $i == $o->{comparatorIndex};
384 0           my ($m,$c) = theilsen(
385             $o->{data}->[$i],
386             $o->{data}->[$o->{comparatorIndex}]
387             );
388 0           $m[$i] = $m;
389 0           $c[$i] = -$c; # - because we're converting to the inverse symmetric
390             }
391 0           $o->{m} = \@m;
392 0           $o->{c} = \@c;
393 0           return ($o->{m}, $o->{c});
394             }
395              
396             =head2 rotateToRegressionLine
397              
398             do we need this?
399              
400             =cut
401              
402             sub rotateToRegressionLine {
403 0     0 1   my $o = shift;
404 0 0         croak "looks like regression() has not been called on this object"
405             unless defined $o->{c};
406            
407             # use distanceToLineN([0,0,0],...) to get middle point of line for distance :-)
408             #my $O = [map {0} (1..$o->{k})]; # the origin
409             #my @MC = ($o->{m},$o->{c}); # we'll be using this a lot, maybe
410            
411             #my ($dO,$X) = distanceToLineN($O,@MC); # $X is the "centre" of the line
412            
413             ####
414 0           my $r = $o->derive('rotateToRegressionLine');
415            
416 0           my $ic = $o->{comparatorIndex};
417            
418 0           $r->{d} = [];
419            
420 0           foreach my $j(0..$o->{n}-1){
421 0           foreach my $i(0..$o->{k}-1){
422 0 0         next if $i == $ic;
423              
424 0           my $m = $o->{m}->[$i];
425 0           my $c = $o->{c}->[$i];
426 0   0       my $y = $o->{data}->[$i]->[$j] || $c;
427 0   0       my $x = $o->{data}->[$ic]->[$j] || 0;
428 0           $y -= $c;
429 0 0 0       if($y || $x){
430 0           my $a = atan2($m,1);
431 0           my $t = atan2($y,$x) - $a;
432 0           my $r = sqrt($x**2 + $y**2);
433 0           ($x,$y) = ($r*cos($t), $r*sin($t));
434             }
435              
436 0           $r->{data}->[$i]->[$j] = $y;
437 0           $r->{data}->[$ic]->[$j] = $x;
438              
439             #my ($d,$x) = distanceToLineN(
440             # [$o->{data}->[$i]->[$j],$o->{data}->[$ic]->[$j]],
441             # [$o->{m}->[$i], $o->{m}->[$ic]],
442             # [$o->{c}->[$i], $o->{c}->[$ic]]
443             #);
444             #my $icv = $o->{data}->[$ic]->[$j] || 0;
445             #my $iv = $o->{data}->[$i]->[$j] || 0;
446             #if($icv < $iv){
447             # $d *= -1;
448             #}
449             #$r->{data}->[$i]->[$j] = $d;
450             }
451            
452 0           my $sumOfSquares = 0;
453 0           my $sumOfValues = 0;
454 0 0         $sumOfSquares += $_ foreach map {$_ == $ic ? () : $r->{data}->[$_]->[$j] ** 2} (0..$o->{k}-1);
  0            
455 0 0         $sumOfValues += $_ foreach map {$_ == $ic ? () : $r->{data}->[$_]->[$j]} (0..$o->{k}-1);
  0            
456 0           my $rootsumsquares = sqrt($sumOfSquares);
457             #my ($d,$x) = distanceToLineN(
458             # [@coords], @MC
459             #);
460             # give the distance a sign too!
461             #my $ss = 0;
462             #foreach my $i(0..$r->{k}-1){
463             # $ss += $r->{data}->[$i]->[$j] * $r->{m}->[$i];
464             #}
465              
466 0 0         $rootsumsquares *= -1 if $sumOfValues < 0;
467              
468 0           push @{$r->{d}}, $rootsumsquares; # distance to line
  0            
469            
470             #my $ss = 0; # sum of squares
471             #foreach my $i(0..$o->{k}-1){
472             # my $xi = $X->[$i] || 0;
473             # my $di = $o->{data}->[$i]->[$j] || 0;
474             # $ss += ($xi - $di)**2
475             #}
476             #
477             #$r->{data}->[$ic]->[$j] = sqrt($ss); # distance to center of line
478            
479             }
480            
481            
482 0           return $r;
483             }
484              
485             =head2 variances
486              
487             Calculate variances... i.e. distances from the origin along the line of
488             regression, and distances from the line of regression. This is just like
489             deDiagonalise, except that only two columns are returned.
490              
491             =cut
492              
493             sub variances {
494 0     0 1   my $o = shift;
495 0           my $S; # experimental spread
496             my $E; # imprecision
497 0           my $df = 0;
498 0           my $ic = $o->{comparatorIndex};
499             # we can give a value for how likely a point is to be there by imprecision alone
500 0           foreach my $j(0..$o->{n}-1){
501 0 0 0       if($o->{d}->[$j] ne '' && $o->{data}->[$ic]->[$j] ne ''){
502 0           $E += $o->{d}->[$j] ** 2;
503 0           $S += $o->{data}->[$ic]->[$j] ** 2;
504 0           $df ++;
505             }
506             }
507 0           $E /= $df;
508 0           $S /= $df;
509 0           my $sdE = sqrt($E);
510 0           my $sdS = sqrt($S);
511 0           $o->{vE} = $E;
512 0           $o->{vS} = $S;
513 0           $o->{sdE} = $sdE;
514 0           $o->{sdS} = $sdS;
515              
516 0           $o->{pee} = [];
517 0           $o->{pss} = [];
518 0           $o->{pes} = [];
519 0           $o->{pse} = [];
520 0           foreach my $j(0..$o->{n}-1){
521 0           my ($pee,$pes,$pss,$pse) = ('','','','');
522 0 0 0       if($o->{d}->[$j] ne '' && $o->{data}->[$ic]->[$j] ne ''){
523 0 0         $pee = $sdE ?
524             Statistics::Distributions::tprob ($df,$o->{d}->[$j] / $sdE)
525             : 1;
526 0 0         $pes = $sdS ?
527             Statistics::Distributions::tprob ($df,$o->{d}->[$j] / $sdS)
528             : 1;
529 0 0         $pss = $sdS ?
530             Statistics::Distributions::tprob ($df,$o->{data}->[$ic]->[$j] / $sdS)
531             : 1;
532 0 0         $pse = $sdE ?
533             Statistics::Distributions::tprob ($df,$o->{data}->[$ic]->[$j] / $sdE)
534             : 1;
535             }
536 0           push @{$o->{pee}}, $pee;
  0            
537 0           push @{$o->{pes}}, $pes;
  0            
538 0           push @{$o->{pss}}, $pss;
  0            
539 0           push @{$o->{pse}}, $pse;
  0            
540             }
541 0           return ($S,$E);
542             }
543              
544             =head2 printableTable, printTable
545              
546             printableTable returns all available relevant info in a table
547             printTable prints all available relevant info in a table
548              
549             the firts element returned is a list of columns. the rest are the columns.
550              
551             data stored are:
552              
553             # scalars:
554             comparatorIndex # index of column used to compare
555             k
556             n
557             vE # variance of "error" (imprecision)
558             vS # variable of experimental spread
559             sdE # s.d. error
560             sdS # s.d. spread
561            
562             # arrays (foreach column)
563             m # regression denominator
564             c # regression constant
565             # arrays (foreach row)
566             d # distance from regression line
567             pee # p-value of error
568             pss # p-value of spread
569             pes # p-value of error over spread (??)
570             pse # p-value of spread over error
571            
572             # 2D array (LoL)
573             data
574            
575             note that the distance from the center of the distribution
576             is given by the values in data[comparatorIndex]
577              
578             These methods take a single argumen: depth. Every time an object is
579             derived from another (subtractMedian, deDiagonalize and
580             rotateToRegressionLine all do this) the old object is referenced, and
581             you can include the last $depth objects in the output. Set depth to -1
582             to include all objects.
583              
584             =cut
585              
586             sub printableTable {
587 0     0 1   my ($o,$deep) = @_;
588 0           my @header = (qw/Statistic Value/, map {"Column $_"} (1..$o->{k}));
  0            
589            
590 0           my @statistics = ();
591 0           my @values = ();
592            
593 0           my @statkeys = qw/comparatorIndex k n vE vS sdE sdS/;
594 0           my @statnames = qw/CompareColumn Columns Rows ErrorVariance SpreadVariance ErrorSD SpreadSD/;
595 0           foreach (0..$#statkeys){
596 0           my $statkey = $statkeys[$_];
597 0           my $statname = $statnames[$_];
598 0 0 0       if(defined $o->{$statkey} && $o->{$statkey} ne ''){
599 0           push @statistics, $statname;
600 0           push @values, $o->{$statkey};
601 0 0         if($statkey eq 'comparatorIndex'){
602 0           $values[$#values] ++; # 1-based!
603             }
604             }
605             }
606            
607 0           my @printData = (\@statistics, \@values, @{$o->{data}});
  0            
608            
609 0 0 0       if(ref $o->{m} && @{$o->{m}} ){
  0            
610 0           push @header, 'Regression','M','C';
611 0           push @printData, [map {"Column $_"} (1..$o->{k})];
  0            
612 0           push @printData, $o->{m}, $o->{c};
613             }
614            
615 0 0         if(ref $o->{d}){
616 0           push @header, 'DistanceToRegressionLine';
617 0           push @printData, $o->{d};
618             }
619            
620 0 0         if(ref $o->{pee}){
621 0           push @header, 'ErrorPvalue';
622 0           push @printData, $o->{pee};
623            
624 0           push @header, 'SpreadPvalue';
625 0           push @printData, $o->{pss};
626            
627 0           push @header, 'ErrorOverSpreadPvalue';
628 0           push @printData, $o->{pes};
629            
630 0           push @header, 'SpreadOverErrorPvalue';
631 0           push @printData, $o->{pse};
632             }
633              
634 0 0 0       if($deep && ref($o->{derivedFrom})){
635 0           push @header, 'DerivedFrom';
636 0           push @printData, [$o->{derivedReason}];
637 0           my $d = $o->{derivedFrom}->printableTable($deep-1);
638 0           my ($dh,@dcols) = @$d;
639 0           push @header, @$dh;
640 0           push @printData, @dcols;
641             }
642            
643 0           return [\@header, @printData];
644             }
645              
646             sub printTable {
647 0     0 1   my ($o,$deep) = @_;
648 0           my $pt = $o->printableTable($deep);
649             # lets assume that n > number of statistics!
650 0           my $n = $o->{n};
651 0           my $w = @$pt;
652 0           print join("\t", @{$pt->[0]})."\n";
  0            
653 0           foreach my $j(0..$n-1){
654 0           my @row = ();
655 0           foreach my $i(1..$w-1){
656 0 0         my $val = defined $pt->[$i]->[$j] ? $pt->[$i]->[$j] : '';
657 0           push @row, $val;
658             }
659 0           print join("\t", @row)."\n";
660             }
661             }
662              
663             =head1 just some wee helper functions...
664              
665             =head2 median
666              
667             yes this probably exists in other modules, but I didn't want to pull in a whole
668             module for just one funciton. Anyway, this is an inefficient version for small
669             numbers of data. It sorts the list and then uses middle() to find the middle of
670             the sorted list.
671              
672             =cut
673              
674             sub median {
675 0 0 0 0 1   my @r = sort {$a<=>$b} map {defined && /\d/ ? $_ : ()} @_;
  0            
  0            
676 0           return middle(@r);
677             }
678              
679             =head2 medianN
680              
681             Like median, but for an even list is returns the two middlemost values.
682             This version is used in medianI.
683              
684             =cut
685              
686             sub medianN {
687 0 0 0 0 1   my @r = sort {$a<=>$b} map {defined && /\d/ ? $_ : ()} @_;
  0            
  0            
688 0           return middleN(@r);
689             }
690              
691             =head2 medianI
692              
693             This uses medianN to get the middlemost value(s) and then returns a list
694             of column indices indicating which columns had a middlemost value.
695             This is used in the medianLeft method when deciding which
696             column is middlemost.
697              
698             =cut
699              
700             sub medianI {
701 0     0 1   my @N = medianN(@_);
702 0           my @I = ();
703 0           foreach my $i(0..$#_){
704 0 0 0       if(defined $_[$i] && $_[$i] ne ''){
705 0           foreach my $n(@N){
706 0 0         if($n == $_[$i]){
707 0           push @I, $i;
708             }
709             }
710             }
711             }
712 0           return @I;
713             }
714              
715             =head2 middle
716              
717             middle returns the middlemost item in a list, or the mean average of the two
718             middlemost items. It doesn't sort the list first.
719              
720             =cut
721              
722             sub middle {
723 0 0   0 1   if(@_ % 2){
724 0           return $_[$#_/2];
725             }
726             else {
727 0           return $_[($#_+1)/2]/2 + $_[($#_-1)/2]/2;
728             }
729             }
730              
731             =head2 middleN
732              
733             middleN does like middle, but for even lists, it returns the two middlemost
734             items as a list. This is used by medianN.
735              
736             =cut
737              
738             sub middleN {
739 0 0   0 1   if(@_ % 2){
740 0           return $_[$#_/2];
741             }
742             else {
743 0           return ($_[($#_+1)/2], $_[($#_-1)/2]);
744             }
745             }
746              
747              
748             =head1 AUTHOR
749              
750             Jimi Wills, C<< >>
751              
752             =head1 BUGS
753              
754             Please report any bugs or feature requests to C, or through
755             the web interface at L. I will be notified, and then you'll
756             automatically be notified of progress on your bug as I make changes.
757              
758              
759              
760              
761             =head1 SUPPORT
762              
763             You can find documentation for this module with the perldoc command.
764              
765             perldoc Statistics::Reproducibility
766              
767              
768             You can also look for information at:
769              
770             =over 4
771              
772             =item * RT: CPAN's request tracker (report bugs here)
773              
774             L
775              
776             =item * AnnoCPAN: Annotated CPAN documentation
777              
778             L
779              
780             =item * CPAN Ratings
781              
782             L
783              
784             =item * Search CPAN
785              
786             L
787              
788             =back
789              
790              
791             =head1 ACKNOWLEDGEMENTS
792              
793              
794             =head1 LICENSE AND COPYRIGHT
795              
796             Copyright 2013 Jimi Wills.
797              
798             This program is free software; you can redistribute it and/or modify it
799             under the terms of the the Artistic License (2.0). You may obtain a
800             copy of the full license at:
801              
802             L
803              
804             Any use, modification, and distribution of the Standard or Modified
805             Versions is governed by this Artistic License. By using, modifying or
806             distributing the Package, you accept this license. Do not use, modify,
807             or distribute the Package, if you do not accept this license.
808              
809             If your Modified Version has been derived from a Modified Version made
810             by someone other than you, you are nevertheless required to ensure that
811             your Modified Version complies with the requirements of this license.
812              
813             This license does not grant you the right to use any trademark, service
814             mark, tradename, or logo of the Copyright Holder.
815              
816             This license includes the non-exclusive, worldwide, free-of-charge
817             patent license to make, have made, use, offer to sell, sell, import and
818             otherwise transfer the Package with respect to any patent claims
819             licensable by the Copyright Holder that are necessarily infringed by the
820             Package. If you institute patent litigation (including a cross-claim or
821             counterclaim) against any party alleging that the Package constitutes
822             direct or contributory patent infringement, then this Artistic License
823             to you shall terminate on the date that such litigation is filed.
824              
825             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
826             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
827             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
828             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
829             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
830             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
831             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
832             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
833              
834              
835             =cut
836              
837             1; # End of Statistics::Reproducibility