File Coverage

blib/lib/Fsdb/Filter/dbrvstatdiff.pm

Criterion	Covered	Total	%
statement	21	153	13.7
branch	0	56	0.0
condition	0	12	0.0
subroutine	7	16	43.7
pod	5	5	100.0
total	33	242	13.6

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2
3							#
4							# dbrvstatdiff
5							#
6							# Copyright (C) 1991-2015 by John Heidemann
7							# $Id: ae7f061d6c1ab7aa2bd1243c98877ac4cdde0c3d $
8							#
9							# This program is distributed under terms of the GNU general
10							# public license, version 2. See the file COPYING
11							# in $dblibdir for details.
12							#
13
14							package Fsdb::Filter::dbrvstatdiff;
15
16							=head1 NAME
17
18							dbrvstatdiff - evaluate statistical differences between two random variables
19
20							=head1 SYNOPSIS
21
22							dbrvstatdiff [-f format] [-c ConfRating]
23							[-h HypothesizedDifference] m1c sd1c n1c m2c sd2c n2c
24
25							OR
26
27							dbrvstatdiff [-f format] [-c ConfRating] m1c n1c m2c n2c
28
29							=head1 DESCRIPTION
30
31							Produce statistics on the difference of sets of random variables.
32							If a hypothesized difference is given
33							(with C<-h>), to does a Student's t-test.
34
35							Random variables are specified by:
36
37							=over 4
38
39							=item C, C
40
41							The column names of means of random variables.
42
43							=item C, C
44
45							The column names of standard deviations of random variables.
46
47							=item C, C
48
49							Counts of number of samples for each random variable
50
51							=back
52
53							These values can be computed with L.
54
55							Creates up to ten new columns:
56
57							=over 4
58
59							=item C
60
61							The difference of RV 2 - RV 1.
62
63							=item C
64
65							The percentage difference (RV2-RV1)/1
66
67							=item C and C
68
69							The half half confidence intervals and low and high values
70							for absolute and relative confidence.
71
72							=item C
73
74							The T-test value for the given hypothesized difference.
75
76							=item C
77
78							Given the confidence rating, does the test pass? Will be
79							either "rejected" or "not-rejected".
80
81							=item C
82
83							The hypothesized value that is break-even point
84							for the T-test.
85
86							=item C
87
88							Break-even point as a percent of m1c.
89
90							=back
91
92							Confidence intervals are not printed if standard deviations are not provided.
93							Confidence intervals assume normal distributions with common variances.
94
95							T-tests are only computed if a hypothesized difference is provided.
96							Hypothesized differences should be proceeded by <=, >=, =.
97							T-tests assume normal distributions with common variances.
98
99							=head1 OPTIONS
100
101							=over 4
102
103							=item B<-c FRACTION> or B<--confidence FRACTION>
104
105							Specify FRACTION for the confidence interval.
106							Defaults to 0.95 for a 95% confidence factor
107							(alpha = 0.05).
108
109							=item B<-f FORMAT> or B<--format FORMAT>
110
111							Specify a L-style format for output statistics.
112							Defaults to C<%.5g>.
113
114							=item B<-h DIFF> or B<--hypothesis DIFF>
115
116							Specify the hypothesized difference as C,
117							where C is something like C=0> or C=0>, etc.
118
119							=back
120
121							=for comment
122							begin_standard_fsdb_options
123
124							This module also supports the standard fsdb options:
125
126							=over 4
127
128							=item B<-d>
129
130							Enable debugging output.
131
132							=item B<-i> or B<--input> InputSource
133
134							Read from InputSource, typically a file name, or C<-> for standard input,
135							or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
136
137							=item B<-o> or B<--output> OutputDestination
138
139							Write to OutputDestination, typically a file name, or C<-> for standard output,
140							or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
141
142							=item B<--autorun> or B<--noautorun>
143
144							By default, programs process automatically,
145							but Fsdb::Filter objects in Perl do not run until you invoke
146							the run() method.
147							The C<--(no)autorun> option controls that behavior within Perl.
148
149							=item B<--help>
150
151							Show help.
152
153							=item B<--man>
154
155							Show full manual.
156
157							=back
158
159							=for comment
160							end_standard_fsdb_options
161
162
163							=head1 SAMPLE USAGE
164
165							=head2 Input:
166
167							#fsdb title mean2 stddev2 n2 mean1 stddev1 n1
168							example6.12 0.17 0.0020 5 0.22 0.0010 4
169
170							=head2 Command:
171
172							cat data.fsdb \| dbrvstatdiff mean2 stddev2 n2 mean1 stddev1 n1
173
174							=head2 Output:
175
176							#fsdb title mean2 stddev2 n2 mean1 stddev1 n1 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high
177							example6.12 0.17 0.0020 5 0.22 0.0010 4 0.05 29.412 0.0026138 0.047386 0.052614 1.5375 27.874 30.949
178							# \| dbrvstatdiff mean2 stddev2 n2 mean1 stddev1 n1
179
180							=head2 Input 2:
181
182							(example 7.10 from Scheaffer and McClave):
183
184							#fsdb title x2 sd2 n2 x1 sd1 n1
185							example7.10 9 35.22 24.44 9 31.56 20.03
186
187							=head2 Command 2:
188
189							dbrvstatdiff -h '<=0' x2 sd2 n2 x1 sd1 n1
190
191							=head2 Output 2:
192
193							#fsdb title n1 x1 sd1 n2 x2 sd2 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high t_test t_test_result
194							example7.10 9 35.22 24.44 9 31.56 20.03 3.66 0.11597 4.7125 -1.0525 8.3725 0.14932 -0.033348 0.26529 1.6465 not-rejected
195							# \| /global/us/edu/ucla/cs/ficus/users/johnh/BIN/DB/dbrvstatdiff -h <=0 x2 sd2 n2 x1 sd1 n1
196
197
198							=head2 Case 3:
199
200							A common use case is to have one file with a set of trials
201							from two experiments, and to use dbrvstatdiff to see if they are different.
202
203							=head3 Input 3:
204
205							#fsdb case trial value
206							a 1 1
207							a 2 1.1
208							a 3 0.9
209							a 4 1
210							a 5 1.1
211							b 1 2
212							b 2 2.1
213							b 3 1.9
214							b 4 2
215							b 5 1.9
216
217							=head2 Command 3:
218
219							cat two_trial.fsdb \|
220							dbmultistats -k case value \|
221							dbcolcopylast mean stddev n \|
222							dbrow '_case eq "b"' \|
223							dbrvstatdiff -h '=0' mean stddev n copylast_mean copylast_stddev copylast_n \|
224							dblistize
225
226							=head3 Output 3:
227
228							#fsdb -R C case mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n copylast_mean copylast_stddev copylast_n diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high t_test t_test_result t_test_break t_test_break_pct
229							case: b
230							mean: 1.98
231							stddev: 0.083666
232							pct_rsd: 4.2256
233							conf_range: 0.10387
234							conf_low: 1.8761
235							conf_high: 2.0839
236							conf_pct: 0.95
237							sum: 9.9
238							sum_squared: 19.63
239							min: 1.9
240							max: 2.1
241							n: 5
242							copylast_mean: 1.02
243							copylast_stddev: 0.083666
244							copylast_n: 5
245							diff: -0.96
246							diff_pct: -48.485
247							diff_conf_half: 0.12202
248							diff_conf_low: -1.082
249							diff_conf_high: -0.83798
250							diff_conf_pct_half: 6.1627
251							diff_conf_pct_low: -54.648
252							diff_conf_pct_high: -42.322
253							t_test: -18.142
254							t_test_result: rejected
255							t_test_break: -1.082
256							t_test_break_pct: -54.648
257
258							# \| dbmultistats -k case value
259							# \| dbcolcopylast mean stddev n
260							# \| dbrow _case eq "b"
261							# \| dbrvstatdiff -h =0 mean stddev n copylast_mean copylast_stddev copylast_n
262							# \| dbfilealter -R C
263
264							(So one cannot say that they are statistically equal.)
265
266
267							=head1 SEE ALSO
268
269							L.
270							L.
271							L.
272
273
274							=head1 CLASS FUNCTIONS
275
276							=cut
277
278							@ISA = qw(Fsdb::Filter);
279							$VERSION = 2.0;
280
281	1			1		8758	use strict;
	1					3
	1					39
282	1			1		5	use Pod::Usage;
	1					2
	1					152
283	1			1		6	use Carp;
	1					2
	1					60
284
285	1			1		5	use Fsdb::Filter;
	1					5
	1					21
286	1			1		4	use Fsdb::IO::Reader;
	1					2
	1					31
287	1			1		4	use Fsdb::IO::Writer;
	1					1
	1					25
288	1			1		494	use Fsdb::Support::TDistribution qw(t_distribution);
	1					3
	1					1489
289
290
291							=head2 new
292
293							$filter = new Fsdb::Filter::dbrvstatdiff(@arguments);
294
295							Create a new dbrvstatdiff object, taking command-line arguments.
296
297							=cut
298
299							sub new ($@) {
300	0			0	1		my $class = shift @_;
301	0						my $self = $class->SUPER::new(@_);
302	0						bless $self, $class;
303	0						$self->set_defaults;
304	0						$self->parse_options(@_);
305	0						$self->SUPER::post_new();
306	0						return $self;
307							}
308
309
310							=head2 set_defaults
311
312							$filter->set_defaults();
313
314							Internal: set up defaults.
315
316							=cut
317
318							sub set_defaults ($) {
319	0			0	1		my($self) = @_;
320	0						$self->SUPER::set_defaults();
321	0						$self->{_confidence_fraction} = 0.95;
322	0						$self->{_format} = "%.5g";
323	0						$self->{_hypothesis} = undef;
324	0						$self->{_hyp_diff} = undef;
325	0						$self->{_hyp_class} = undef;
326	0						$self->{_arg_fields} = [];
327
328	0						$self->{_do_confidence} = undef;
329							# $self->{_do_t_test} = undef; just check for defined(hypothesis)
330							}
331
332							=head2 parse_options
333
334							$filter->parse_options(@ARGV);
335
336							Internal: parse command-line arguments.
337
338							=cut
339
340							sub parse_options ($@) {
341	0			0	1		my $self = shift @_;
342
343	0						my(@argv) = @_;
344							$self->get_options(
345							\@argv,
346	0			0			'help\|?' => sub { pod2usage(1); },
347	0			0			'man' => sub { pod2usage(-verbose => 2); },
348							'autorun!' => \$self->{_autorun},
349							'c\|confidence=f' => \$self->{_confidence_fraction},
350							'close!' => \$self->{_close},
351							'd\|debug+' => \$self->{_debug},
352							'f\|format=s' => \$self->{_format},
353							'h\|hypothesis=s' => \$self->{_hypothesis},
354	0			0			'i\|input=s' => sub { $self->parse_io_option('input', @_); },
355							'log!' => \$self->{_logprog},
356	0			0			'o\|output=s' => sub { $self->parse_io_option('output', @_); },
357	0	0					) or pod2usage(2);
358	0						push (@{$self->{_arg_fields}}, @argv);
	0
359							}
360
361							=head2 setup
362
363							$filter->setup();
364
365							Internal: setup, parse headers.
366
367							=cut
368
369							sub setup ($) {
370	0			0	1		my($self) = @_;
371
372							#
373							# valid hypothesis, if any?
374							#
375	0	0					if (defined($self->{_hypothesis})) {
376	0						my($hyp_class, $hyp_diff) = ($self->{_hypothesis} =~ /^\s([<>=!]+)\s([0-9\.]+)/);
377	0	0					$hyp_class = '' if (!defined($hyp_class));
378	0	0					$self->{_hyp_class} = -1 if ($hyp_class eq '<=');
379	0	0					$self->{_hyp_class} = 0 if ($hyp_class =~ /^==?$/);
380	0	0					$self->{_hyp_class} = 1 if ($hyp_class eq '>=');
381							croak $self->{_prog} . ": bad hypothesis specification ``" .
382							$self->{_hypothesis} . "''; not of format <=N, =N, >=N, where N is some number.\n"
383	0	0	0				if (!defined($self->{_hyp_class}) \|\| !defined($hyp_diff));
384	0						$self->{_hyp_diff} = $hyp_diff;
385							};
386
387							#
388							# what exactly are we doing?
389							#
390	0	0					if ($#{$self->{_arg_fields}} == 5) {
	0	0
391	0						($self->{_m1_column}, $self->{_ss1_column}, $self->{_n1_column}, $self->{_m2_column}, $self->{_ss2_column}, $self->{_n2_column}) = @{$self->{_arg_fields}};
	0
392	0						$self->{_do_confidence} = 1;
393	0						} elsif ($#{$self->{_arg_fields}} == 3) {
394	0						($self->{_m1_column}, $self->{_n1_column}, $self->{_m2_column}, $self->{_n2_column}) = @{$self->{_arg_fields}};
	0
395	0						$self->{_ss1_column} = $self->{_ss2_column} = undef;
396							croak $self->{_prog} . ": T-tests require standard deviations, but none were given as arguments.\n"
397	0	0					if (defined($self->{_hyp_diff}));
398							} else {
399	0						carp $self->{_prog} . ": confusing number of fields given; cannot identify desired type of stats.\n";
400	0						pod2usage(2);
401							};
402
403							#
404							# finish up IO
405							#
406	0						$self->finish_io_option('input', -comment_handler => $self->create_pass_comments_sub);
407
408							# check selected columns
409	0						foreach (qw(_m1_column _ss1_column _n1_column _m2_column _ss2_column _n2_column)) {
410	0						my $coli = $_;
411	0	0					next if (!defined($self->{$_})); # maybe unspecified sum-of-squares
412	0						$coli =~ s/_column/_coli/;
413	0						$self->{$coli} = $self->{_in}->col_to_i($self->{$_});
414							croak $self->{_prog} . ": unknown selected input column ``$_''.\n"
415	0	0					if (!defined($self->{$coli}));
416							};
417
418	0						$self->finish_io_option('output', -clone => $self->{_in}, -outputheader => 'delay');
419	0						my(@new_columns) = qw(diff diff_pct);
420							push(@new_columns, qw(diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high))
421	0	0					if (defined($self->{_do_confidence}));
422							push(@new_columns, qw(t_test t_test_result t_test_break t_test_break_pct))
423	0	0					if (defined($self->{_hypothesis}));
424	0						foreach (@new_columns) {
425							$self->{_out}->col_create($_)
426	0	0					or croak $self->{_prog} . ": cannot create column ``$_'' (maybe it already existed?)\n";
427	0						$self->{"_${_}_coli"} = $self->{_out}->col_to_i($_);
428	0	0					defined($self->{"_${_}_coli"}) or croak "internal error\n";
429							};
430
431							}
432
433							=head2 run
434
435							$filter->run();
436
437							Internal: run over each rows.
438
439							=cut
440							sub run ($) {
441	0			0	1		my($self) = @_;
442
443	0						my $conf_alpha = (1.0 - $self->{_confidence_fraction}) / 2.0;
444
445	0						my $read_fastpath_sub = $self->{_in}->fastpath_sub();
446	0						my $write_fastpath_sub = $self->{_out}->fastpath_sub();
447
448	0						my $m1c = $self->{_m1_coli};
449	0						my $m2c = $self->{_m2_coli};
450	0						my $n1c = $self->{_n1_coli};
451	0						my $n2c = $self->{_n2_coli};
452	0						my $ss1c = $self->{_ss1_coli};
453	0						my $ss2c = $self->{_ss2_coli};
454
455	0						my $empty = $self->{_empty};
456	0						my $hyp_diff = $self->{_hyp_diff};
457	0						my $hyp_class = $self->{_hyp_class};
458	0						my $format = $self->{_format};
459
460	0						my $diff_f = $self->{_diff_coli};
461	0						my $diff_pct_f = $self->{_diff_pct_coli};
462	0						my $diff_conf_half_f = $self->{_diff_conf_half_coli};
463	0						my $diff_conf_low_f = $self->{_diff_conf_low_coli};
464	0						my $diff_conf_high_f = $self->{_diff_conf_high_coli};
465	0						my $diff_conf_pct_half_f = $self->{_diff_conf_pct_half_coli};
466	0						my $diff_conf_pct_low_f = $self->{_diff_conf_pct_low_coli};
467	0						my $diff_conf_pct_high_f = $self->{_diff_conf_pct_high_coli};
468	0						my $t_test_f = $self->{_t_test_coli};
469	0						my $t_test_result_f = $self->{_t_test_result_coli};
470	0						my $t_test_break_f = $self->{_t_test_break_coli};
471	0						my $t_test_break_pct_f = $self->{_t_test_break_pct_coli};
472
473	0						my $fref;
474	0						my($diff_conf_half, $diff_conf_low, $diff_conf_high,
475							$diff_conf_pct_half, $diff_conf_pct_low, $diff_conf_pct_high,
476							$t_test, $t_test_result,
477							$t_test_break, $t_test_break_pct);
478	0						while ($fref = &$read_fastpath_sub()) {
479
480	0						my $diff = $fref->[$m2c] - $fref->[$m1c];
481	0						my $diff_pct;
482	0	0					if ($fref->[$m1c] == 0.0) {
483	0						$diff_pct = $empty;
484							} else {
485	0						$diff_pct = $diff / $fref->[$m1c] * 100.0;
486							};
487	0						$diff_conf_half = $diff_conf_low = $diff_conf_high =
488							$diff_conf_pct_half = $diff_conf_pct_low = $diff_conf_pct_high =
489							$t_test = $t_test_result =
490							$t_test_break = $t_test_break_pct = $empty;
491	0	0	0				if ($self->{_do_confidence} && $fref->[$n1c] > 1 && $fref->[$n2c] > 1) {
			0
492							# basic stuff
493	0						my $degrees_of_freedom = ($fref->[$n1c] + $fref->[$n2c] - 2);
494	0						my $ssp = (($fref->[$n1c] - 1) * $fref->[$ss1c] * $fref->[$ss1c] + ($fref->[$n2c] - 1) * $fref->[$ss2c] * $fref->[$ss2c]) / $degrees_of_freedom;
495	0						my $sqrt_ssp_inverses = sqrt($ssp * (1/$fref->[$n1c] + 1/$fref->[$n2c]));
496	0						my $t_value = t_distribution($degrees_of_freedom, $conf_alpha);
497
498							# confidence intervals
499	0						$diff_conf_half = $t_value * $sqrt_ssp_inverses;
500	0						$diff_conf_low = $diff - $diff_conf_half;
501	0						$diff_conf_high = $diff + $diff_conf_half;
502
503							# t-test
504	0	0	0				if (defined($self->{_hypothesis}) && $sqrt_ssp_inverses != 0.0) {
505	0						$t_test = ($diff - $hyp_diff) / $sqrt_ssp_inverses;
506	0						$t_test_result = "not-rejected";
507	0	0					if ($hyp_class < 0) {
		0
508	0	0					$t_test_result = "rejected"
509							if ($t_test > $t_value);
510							} elsif ($hyp_class > 0) {
511	0	0					$t_test_result = "rejected"
512							if ($t_test < $t_value);
513							} else {
514	0	0					$t_test_result = "rejected"
515							if (abs($t_test) > $t_value);
516							};
517							# also compute the break-even point
518	0						$t_test_break = $diff - $t_value * $sqrt_ssp_inverses;
519	0	0					$t_test_break_pct = $t_test_break / $fref->[$m1c] * 100.0
520							if ($fref->[$m1c] != 0.0);
521							};
522
523							# percentages
524	0	0					if ($fref->[$m1c] != 0.0) {
525	0						$diff_conf_pct_half = $diff_conf_half / $fref->[$m1c] * 100.0;
526	0						$diff_conf_pct_low = $diff_conf_low / $fref->[$m1c] * 100.0;
527	0						$diff_conf_pct_high = $diff_conf_high / $fref->[$m1c] * 100.0;
528							};
529							};
530
531
532	0						$fref->[$diff_f] = sprintf("$format", $diff);
533	0						$fref->[$diff_pct_f] = sprintf("$format", $diff_pct);
534	0	0					if ($self->{_do_confidence}) {
535	0						$fref->[$diff_conf_half_f] = sprintf("$format", $diff_conf_half);
536	0						$fref->[$diff_conf_low_f] = sprintf("$format", $diff_conf_low);
537	0						$fref->[$diff_conf_high_f] = sprintf("$format", $diff_conf_high);
538	0						$fref->[$diff_conf_pct_half_f] = sprintf("$format", $diff_conf_pct_half);
539	0						$fref->[$diff_conf_pct_low_f] = sprintf("$format", $diff_conf_pct_low);
540	0						$fref->[$diff_conf_pct_high_f] = sprintf("$format", $diff_conf_pct_high);
541							};
542	0	0					if (defined($self->{_hypothesis})) {
543	0						$fref->[$t_test_f] = sprintf("$format", $t_test);
544	0						$fref->[$t_test_result_f] = $t_test_result;
545	0						$fref->[$t_test_break_f] = sprintf("$format", $t_test_break);
546	0						$fref->[$t_test_break_pct_f] = sprintf("$format", $t_test_break_pct);
547							};
548
549	0						&$write_fastpath_sub($fref);
550							};
551							}
552
553
554							=head1 AUTHOR and COPYRIGHT
555
556							Copyright (C) 1991-2015 by John Heidemann
557
558							This program is distributed under terms of the GNU general
559							public license, version 2. See the file COPYING
560							with the distribution for details.
561
562							=cut
563
564							1;