File Coverage

blib/lib/Fsdb/Filter/dbcolscorrelate.pm

Criterion	Covered	Total	%
statement	27	128	21.0
branch	0	30	0.0
condition	0	6	0.0
subroutine	9	18	50.0
pod	5	5	100.0
total	41	187	21.9

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2
3							#
4							# dbcolscorrelate.pm
5							# Copyright (C) 1998-2015 by John Heidemann
6							# $Id: bdcf1da03251b46bded7f59984e64e7f5060ae46 $
7							#
8							# This program is distributed under terms of the GNU general
9							# public license, version 2. See the file COPYING
10							# in $dblibdir for details.
11							#
12
13							package Fsdb::Filter::dbcolscorrelate;
14
15							=head1 NAME
16
17							dbcolscorrelate - find the coefficient of correlation over columns
18
19							=head1 SYNOPSIS
20
21							dbcolscorrelate column1 column2 [column3...]
22
23							=head1 DESCRIPTION
24
25							Compute the coefficient of correlation over two (or more) columns.
26
27							The output is one line of correlations.
28
29							With exactly two columns, a new column I is created.
30
31							With more than two columns, correlations are computed for each
32							pairwise combination of rows, and each output column
33							is given a name which is the concatenation of the two source rows,
34							joined with an underscore.
35
36							By default, we compute the I
37							(usually designed rho, E<0x03c1>)
38							and assume we see all members of the population.
39							With the B<--sample> option we instead compute the
40							I, usually designated I.
41							(Be careful in that the default here to full-population
42							is the I of the default in L.)
43
44							This program requires a complete copy of the input data on disk.
45
46							=head1 OPTIONS
47
48							=over 4
49
50							=item B<--sample>
51
52							Select a the Pearson product-moment correlation coefficient
53							(the "sample correlation coefficient", usually designated I).
54
55							=item B<--nosample>
56
57							Select a the Pearson product-moment correlation coefficient
58							(the "sample correlation coefficient", usually designated I).
59
60
61
62							=item B<-f FORMAT> or B<--format FORMAT>
63
64							Specify a L-style format for output statistics.
65							Defaults to C<%.5g>.
66
67							=item B<-T TmpDir>
68
69							where to put tmp files.
70							Also uses environment variable TMPDIR, if -T is
71							not specified.
72							Default is /tmp.
73
74							=back
75
76							=for comment
77							begin_standard_fsdb_options
78
79							This module also supports the standard fsdb options:
80
81							=over 4
82
83							=item B<-d>
84
85							Enable debugging output.
86
87							=item B<-i> or B<--input> InputSource
88
89							Read from InputSource, typically a file name, or C<-> for standard input,
90							or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
91
92							=item B<-o> or B<--output> OutputDestination
93
94							Write to OutputDestination, typically a file name, or C<-> for standard output,
95							or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
96
97							=item B<--autorun> or B<--noautorun>
98
99							By default, programs process automatically,
100							but Fsdb::Filter objects in Perl do not run until you invoke
101							the run() method.
102							The C<--(no)autorun> option controls that behavior within Perl.
103
104							=item B<--help>
105
106							Show help.
107
108							=item B<--man>
109
110							Show full manual.
111
112							=back
113
114							=for comment
115							end_standard_fsdb_options
116
117
118							=head1 SAMPLE USAGE
119
120							=head2 Input:
121
122							#fsdb name id test1 test2
123							a 1 80 81
124							b 2 70 71
125							c 3 65 66
126							d 4 90 91
127							e 5 70 71
128							f 6 90 91
129
130							=head2 Command:
131
132							cat DATA/more_grades.fsdb \| dbcolscorrelate test1 test2
133
134							=head2 Output:
135
136							#fsdb correlation
137							0.83329
138							# \| dbcolscorrelate test1 test2
139
140
141							=head1 SEE ALSO
142
143							L,
144							L,
145							L.
146
147
148							=head1 CLASS FUNCTIONS
149
150							=cut
151
152							@ISA = qw(Fsdb::Filter);
153							$VERSION = 2.0;
154
155	1			1		4346	use strict;
	1					1
	1					23
156	1			1		3	use Pod::Usage;
	1					1
	1					62
157	1			1		3	use Carp;
	1					1
	1					36
158
159	1			1		3	use Fsdb::Filter;
	1					2
	1					12
160	1			1		3	use Fsdb::IO::Reader;
	1					1
	1					17
161	1			1		2	use Fsdb::IO::Writer;
	1					1
	1					15
162	1			1		3	use Fsdb::Support qw($is_numeric_regexp);
	1					1
	1					69
163	1			1		4	use Fsdb::Support::NamedTmpfile;
	1					1
	1					16
164	1			1		6	use Fsdb::Filter::dbpipeline qw(dbpipeline_open2 dbpipeline_close2_hash dbcolstats);
	1					1
	1					1028
165
166
167							=head2 new
168
169							$filter = new Fsdb::Filter::dbcolscorrelate(@arguments);
170
171							Create a new dbcolscorrelate object, taking command-line arguments.
172
173							=cut
174
175							sub new ($@) {
176	0			0	1		my $class = shift @_;
177	0						my $self = $class->SUPER::new(@_);
178	0						bless $self, $class;
179	0						$self->set_defaults;
180	0						$self->parse_options(@_);
181	0						$self->SUPER::post_new();
182	0						return $self;
183							}
184
185
186							=head2 set_defaults
187
188							$filter->set_defaults();
189
190							Internal: set up defaults.
191
192							=cut
193
194							sub set_defaults ($) {
195	0			0	1		my($self) = @_;
196	0						$self->SUPER::set_defaults();
197	0						$self->{_format} = "%.5g";
198	0						$self->{_columns} = [];
199	0						$self->{_include_non_numeric} = undef;
200	0						$self->{_sample} = undef;
201	0						$self->{_fscode} = undef;
202	0						$self->set_default_tmpdir;
203							}
204
205							=head2 parse_options
206
207							$filter->parse_options(@ARGV);
208
209							Internal: parse command-line arguments.
210
211							=cut
212
213							sub parse_options ($@) {
214	0			0	1		my $self = shift @_;
215
216	0						my(@argv) = @_;
217							$self->get_options(
218							\@argv,
219	0			0			'help\|?' => sub { pod2usage(1); },
220	0			0			'man' => sub { pod2usage(-verbose => 2); },
221							'a\|include-non-numeric!' => \$self->{_include_non_numeric},
222							'autorun!' => \$self->{_autorun},
223							'd\|debug+' => \$self->{_debug},
224							'f\|format=s' => \$self->{_format},
225							'F\|fs\|cs\|fieldseparator\|columnseparator=s' => \$self->{_fscode},
226	0			0			'i\|input=s' => sub { $self->parse_io_option('input', @_); },
227							'log!' => \$self->{_logprog},
228							's\|sample!' => \$self->{_sample},
229							'T\|tmpdir\|tempdir=s' => \$self->{_tmpdir},
230	0			0			'o\|output=s' => sub { $self->parse_io_option('output', @_); },
231	0	0					) or pod2usage(2);
232	0						push (@{$self->{_columns}}, @argv);
	0
233							}
234
235							=head2 setup
236
237							$filter->setup();
238
239							Internal: setup, parse headers.
240
241							=cut
242
243							sub setup ($) {
244	0			0	1		my($self) = @_;
245
246	0						$self->finish_io_option('input', -comment_handler => $self->create_pass_comments_sub);
247
248							croak $self->{_prog} . ": at least two columns must be specified to compute a correlation.\n"
249	0	0					if ($#{$self->{_columns}} < 1);
	0
250	0						my @output_columns;
251							my %columns_processed;
252	0						foreach my $i (0..$#{$self->{_columns}}) {
	0
253	0						my $column = $self->{_columns}[$i];
254							croak $self->{_prog} . ": column $column is double-listed as an input column (not allowed).\n"
255	0	0					if (defined($columns_processed{$column}));
256	0						$columns_processed{$column} = 1;
257	0						$self->{_colis}[$i] = $self->{_in}->col_to_i($column);
258							croak $self->{_prog} . ": column $column does not exist in the input stream.\n"
259	0	0					if (!defined($self->{_colis}[$i]));
260	0						foreach my $j (0..$#{$self->{_columns}}) {
	0
261	0	0					next if ($i >= $j);
262	0						push(@output_columns, $self->{_columns}[$i] . "_" . $self->{_columns}[$j]);
263							};
264							};
265							# if only one column, it has a special name
266	0	0					$output_columns[0] = 'correlation'
267							if ($#output_columns == 0);
268	0						my @output_options = (-cols => \@output_columns);
269							unshift (@output_options, -fscode => $self->{_fscode})
270	0	0					if (defined($self->{_fscode}));
271	0						$self->finish_io_option('output', @output_options);
272							};
273
274
275							=head2 run
276
277							$filter->run();
278
279							Internal: run over each rows.
280
281							=cut
282							sub run ($) {
283	0			0	1		my($self) = @_;
284
285							#
286							# First, read data and save it to a file,
287							# and send each relevant column off to get stats.
288							#
289	0						$self->{_copy_filename} = Fsdb::Support::NamedTmpfile::alloc($self->{_tmpdir});
290							my $copy_writer = new Fsdb::IO::Writer(-file => $self->{_copy_filename},
291	0						-clone => $self->{_in});
292
293	0						my $read_fastpath_sub = $self->{_in}->fastpath_sub();
294	0						my $copy_fastpath_sub = $copy_writer->fastpath_sub();
295
296							# and take stats
297	0						my(@stats_source_queues);
298							my(@stats_sinks);
299	0						my(@stats_threads);
300	0						my $columns_aref = $self->{_columns};
301	0						my $colis_aref = $self->{_colis};
302	0						foreach (0..$#$columns_aref) {
303							($stats_source_queues[$_], $stats_sinks[$_], $stats_threads[$_]) =
304	0	0					dbpipeline_open2([-cols => [qw(data)]], dbcolstats(($self->{_sample} ? '--sample' : '--nosample'), 'data'));
305							}
306	0						my $fref;
307	0						while ($fref = &$read_fastpath_sub()) {
308							# copy and send to stats
309	0						$copy_writer->write_rowobj($fref);
310							# with forking we have to close output explicitly
311							# otherwise we block on the pipe(2) to the subprocesses.
312	0						foreach (0..$#$columns_aref) {
313	0						$stats_sinks[$_]->write_row($fref->[$colis_aref->[$_]]);
314							};
315							};
316							# close up both
317	0						$copy_writer->close;
318	0						foreach (0..$#$columns_aref) {
319	0						$stats_sinks[$_]->close;
320	0						$stats_sinks[$_] = undef;
321							};
322	0						my @means;
323							my @stddevs;
324	0						foreach (0..$#$columns_aref) {
325	0						my $stats_href = dbpipeline_close2_hash($stats_source_queues[$_], $stats_sinks[$_], $stats_threads[$_]);
326	0						$means[$_] = $stats_href->{'mean'};
327	0	0					croak $self->{_prog} . ": column " . $columns_aref->[$_] . " does not have valid mean.\n"
328							if (!defined($means[$_]));
329	0						$stddevs[$_] = $stats_href->{'stddev'};
330	0	0					croak $self->{_prog} . ": column " . $columns_aref->[$_] . " does not have valid standard deviation.\n"
331							if (!defined($stddevs[$_]));
332							};
333
334							#
335							# Now read back the data,
336							# compute the z-scores on the fly,
337							# and use that to compute the correlation.
338							#
339	0						$self->{_in}->close;
340							$self->{_in} = new Fsdb::IO::Reader(-file => $self->{_copy_filename},
341	0						-comment_handler => $self->create_pass_comments_sub);
342	0						my $sum_zs;
343							my $ns;
344	0						foreach my $i (0..$#$columns_aref) {
345	0						foreach my $j (0..$#$columns_aref) {
346	0	0					next if ($i >= $j);
347	0						$sum_zs->[$i][$j] = 0;
348	0						$ns->[$i][$j] = 0;
349							};
350							};
351
352	0						$read_fastpath_sub = $self->{_in}->fastpath_sub(); # regenerate with copy stream
353	0						while ($fref = &$read_fastpath_sub()) {
354	0						my @zs;
355
356							# compute z-score
357	0						foreach (0..$#$columns_aref) {
358	0						my $x = $fref->[$colis_aref->[$_]];
359	0	0					if ($x !~ /$is_numeric_regexp/) {
360	0						$zs[$_] = undef;
361							} else {
362	0						$zs[$_] = ($x - $means[$_]) / $stddevs[$_];
363							};
364							};
365
366							# and compute running stats for correlations
367	0						foreach my $i (0..$#$columns_aref) {
368	0						foreach my $j (0..$#$columns_aref) {
369	0	0	0				next if ($i >= $j \|\|!defined($zs[$i]) \|\| !defined($zs[$j]));
			0
370	0						$sum_zs->[$i][$j] += $zs[$i] * $zs[$j];
371	0						($ns->[$i][$j])++;
372							};
373							};
374							};
375
376							#
377							# Finally output the results.
378							#
379	0						my(@correlations);
380	0						foreach my $i (0..$#$columns_aref) {
381	0						foreach my $j (0..$#$columns_aref) {
382	0	0					next if ($i >= $j);
383	0	0					if ($ns->[$i][$j] == 0) {
384	0						push(@correlations, $self->{_empty});
385							} else {
386	0						my $c = $sum_zs->[$i][$j] / $ns->[$i][$j];
387	0						push(@correlations, $self->numeric_formatting($c));
388							};
389							};
390							};
391	0						$self->{_out}->write_row_from_aref(\@correlations);
392							};
393
394							=head1 AUTHOR and COPYRIGHT
395
396							Copyright (C) 1991-2015 by John Heidemann
397
398							This program is distributed under terms of the GNU general
399							public license, version 2. See the file COPYING
400							with the distribution for details.
401
402							=cut
403
404							1;