File Coverage

blib/lib/Fsdb/Filter/dbmultistats.pm
Criterion Covered Total %
statement 24 98 24.4
branch 0 48 0.0
condition 0 3 0.0
subroutine 8 18 44.4
pod 6 6 100.0
total 38 173 21.9


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             #
4             # dbmultistats.pm
5             # Copyright (C) 1991-2015 by John Heidemann
6             # $Id: 85a9faaa887a82737100dceee7013e2894b800e1 $
7             #
8             # This program is distributed under terms of the GNU general
9             # public license, version 2. See the file COPYING
10             # in $dblibdir for details.
11             #
12              
13             package Fsdb::Filter::dbmultistats;
14              
15             =head1 NAME
16              
17             dbmultistats - run dbcolstats over each group of inputs identified by some key
18              
19             =head1 SYNOPSIS
20              
21             $0 [-dm] [-c ConfidencePercent] [-f FormatForm] [-q NumberOfQuartiles] -k KeyField ValueField
22              
23             =head1 DESCRIPTION
24              
25             The input table is grouped by KeyField,
26             then we compute a separate set of column statistics on ValueField
27             for each group with a unique key.
28              
29             Assumptions and requirements
30             are the same as L<dbmapreduce>
31             (this program is just a wrapper around that program):
32              
33             By default, data can be provided in arbitrary order
34             and the program consumes O(number of unique tags) memory,
35             and O(size of data) disk space.
36              
37             With the -S option, data must arrive grouped by tags (not necessarily sorted),
38             and the program consumes O(number of tags) memory and no disk space.
39             The program will check and abort if this precondition is not met.
40              
41             With two -S's, program consumes O(1) memory, but doesn't verify
42             that the data-arrival precondition is met.
43              
44             (Note that these semantics are exactly like
45             dbmapreduce -k KeyField -- dbcolstats ValueField;
46             L<dbmultistats> provides a simpler API that passes
47             through statistics-specific arguments
48             and is optimized when data is pre-sorted and there
49             are no quartiles or medians.)
50              
51             =head1 OPTIONS
52              
53             Options are the same as L<dbcolstats>.
54              
55             =over 4
56              
57             =item B<-k> or B<--key> KeyField
58              
59             specify which column is the key for grouping (default: the first column)
60              
61             =item B<-a> or B<--include-non-numeric>
62              
63             Compute stats over all records (treat non-numeric records
64             as zero rather than just ignoring them).
65              
66             =item B<-c FRACTION> or B<--confidence FRACTION>
67              
68             Specify FRACTION for the confidence interval.
69             Defaults to 0.95 for a 95% confidence factor.
70              
71             =item B<-f FORMAT> or B<--format FORMAT>
72              
73             Specify a L<printf(3)>-style format for output statistics.
74             Defaults to C<%.5g>.
75              
76             =item B<-m> or B<--median>
77              
78             Compute median value. (Will sort data if necessary.)
79             (Median is the quantile for N=2.)
80              
81             =item B<-q N> or B<--quantile N>
82              
83             Compute quantile (quartile when N is 4),
84             or an arbitrary quantile for other values of N,
85             reporting the scores that are each 1/Nth of the way across the population.
86              
87             =item B<-S> or B<--pre-sorted>
88              
89             Assume data is already sorted.
90             With one -S, we check and confirm this precondition.
91             When repeated, we skip the check.
92              
93             =item B<-T TmpDir>
94              
95             where to put temporary data.
96             Only used if median or quantiles are requested.
97             Also uses environment variable TMPDIR, if -T is
98             not specified.
99             Default is /tmp.
100              
101             =item B<--parallelism=N>
102              
103             Allow up to N reducers to run in parallel.
104             Default is the number of CPUs in the machine.
105              
106             =back
107              
108              
109             =for comment
110             begin_standard_fsdb_options
111              
112             This module also supports the standard fsdb options:
113              
114             =over 4
115              
116             =item B<-d>
117              
118             Enable debugging output.
119              
120             =item B<-i> or B<--input> InputSource
121              
122             Read from InputSource, typically a file name, or C<-> for standard input,
123             or (if in Perl) an IO::Handle, Fsdb::IO or Fsdb::BoundedQueue object.
124              
125             =item B<-o> or B<--output> OutputDestination
126              
127             Write to OutputDestination, typically a file name, or C<-> for standard output,
128             or (if in Perl) an IO::Handle, Fsdb::IO or Fsdb::BoundedQueue object.
129              
130             =item B<--autorun> or B<--noautorun>
131              
132             By default, programs process automatically,
133             but Fsdb::Filter objects in Perl do not run until you invoke
134             the run() method.
135             The C<--(no)autorun> option controls that behavior within Perl.
136              
137             =item B<--help>
138              
139             Show help.
140              
141             =item B<--man>
142              
143             Show full manual.
144              
145             =back
146              
147             =for comment
148             end_standard_fsdb_options
149              
150              
151             =head1 SAMPLE USAGE
152              
153             =head2 Input:
154              
155             #fsdb experiment duration
156             ufs_mab_sys 37.2
157             ufs_mab_sys 37.3
158             ufs_rcp_real 264.5
159             ufs_rcp_real 277.9
160              
161             =head2 Command:
162              
163             cat DATA/stats.fsdb | dbmultistats -k experiment duration
164              
165             =head2 Output:
166              
167             #fsdb experiment mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n
168             ufs_mab_sys 37.25 0.070711 0.18983 0.6353 36.615 37.885 0.95 74.5 2775.1 37.2 37.3 2
169             ufs_rcp_real 271.2 9.4752 3.4938 85.13 186.07 356.33 0.95 542.4 1.4719e+05 264.5 277.9 2
170             # | /home/johnh/BIN/DB/dbmultistats experiment duration
171              
172              
173             =head1 SEE ALSO
174              
175             L<Fsdb>.
176             L<dbmapreduce>.
177             L<dbcolstats>.
178              
179              
180             =head1 CLASS FUNCTIONS
181              
182             =cut
183              
# Package-level setup: inherit from the generic Fsdb::Filter base class
# and record this module's version number.
our @ISA = ('Fsdb::Filter');
our $VERSION = 2.0;
186              
187 1     1   7305 use strict;
  1         3  
  1         36  
188 1     1   6 use Pod::Usage;
  1         2  
  1         139  
189 1     1   5 use Carp;
  1         2  
  1         58  
190              
191 1     1   4 use Fsdb::Filter;
  1         2  
  1         19  
192 1     1   5 use Fsdb::Filter::dbmapreduce;
  1         1  
  1         39  
193 1     1   4 use Fsdb::Filter::dbcolstats;
  1         2  
  1         23  
194 1     1   4 use Fsdb::IO::Reader;
  1         2  
  1         23  
195 1     1   4 use Fsdb::IO::Writer;
  1         2  
  1         979  
196              
197              
198             =head2 new
199              
200             $filter = new Fsdb::Filter::dbmultistats(@arguments);
201              
202             Create a new dbmultistats object, taking command-line arguments.
203              
204             =cut
205              
# Constructor.
#
# Takes the class name followed by command-line style arguments.
# The arguments are handed first to the superclass constructor and then
# to parse_options; in between, set_defaults resets this filter's own
# option fields.  The superclass post_new hook runs last.
# Returns the new object, blessed into $class.
sub new ($@) {
    my($class, @arguments) = @_;

    my $self = $class->SUPER::new(@arguments);
    bless $self, $class;

    $self->set_defaults();
    $self->parse_options(@arguments);
    $self->SUPER::post_new();

    return $self;
}
215              
216              
217             =head2 set_defaults
218              
219             $filter->set_defaults();
220              
221             Internal: set up defaults.
222              
223             =cut
224              
# Reset every option field owned by this filter to its initial state,
# after letting the superclass set up its own defaults.
sub set_defaults ($) {
    my $self = shift @_;

    $self->SUPER::set_defaults();

    # Options that start out "not specified".
    # (_median is a special case: it renames the output field.)
    @{$self}{qw(
        _key_column
        _confidence_fraction
        _format
        _quantile
        _median
        _max_parallelism
        _include_non_numeric
    )} = ();

    # -S is a counter (it may be repeated), so it starts at zero.
    $self->{_pre_sorted} = 0;

    $self->{_header} = undef;
}
238              
239             =head2 parse_options
240              
241             $filter->parse_options(@ARGV);
242              
243             Internal: parse command-line arguments.
244              
245             =cut
246              
# Parse command-line style arguments into the object's option fields.
#
# Takes the filter object followed by the argument list.  A private copy
# of the arguments is passed by reference to get_options together with
# the option table below (the specs look like Getopt::Long syntax --
# get_options itself is defined outside this file, so confirm there).
# On failure we exit through pod2usage(2).  The same argument array is
# then handed to parse_target_column.
sub parse_options ($@) {
    my($self, @argv) = @_;

    my @option_spec = (
        # documentation requests
        'help|?' => sub { pod2usage(1); },
        'man' => sub { pod2usage(-verbose => 2); },
        # everything else, alphabetically by option letter/name
        'a|include-non-numeric!' => \$self->{_include_non_numeric},
        'autorun!' => \$self->{_autorun},
        'close!' => \$self->{_close},
        'c|confidence=f' => \$self->{_confidence_fraction},
        'd|debug+' => \$self->{_debug},
        'f|format=s' => \$self->{_format},
        'header=s' => \$self->{_header},
        'i|input=s' => sub { $self->parse_io_option('input', @_); },
        'k|key=s' => \$self->{_key_column},
        'log!' => \$self->{_logprog},
        'm|median!' => \$self->{_median},
        'o|output=s' => sub { $self->parse_io_option('output', @_); },
        'parallelism=i' => \$self->{_max_parallelism},
        'q|quantile=i' => \$self->{_quantile},
        'S|pre-sorted+' => \$self->{_pre_sorted},
        'T|tmpdir|tempdir=s' => \$self->{_tmpdir},
    );

    $self->get_options(\@argv, @option_spec)
        or pod2usage(2);

    $self->parse_target_column(\@argv);
}
274              
275             =head2 setup
276              
277             $filter->setup();
278              
279             Internal: setup, parse headers.
280              
281             Pass the right options to dbmapreduce and dbcolstats.
282              
283             =cut
284              
# Validate arguments and build the nested dbmapreduce/dbcolstats pipeline
# that does the real work.
#
# The option fields on $self are translated into two argument lists:
# one for dbcolstats (the per-group reducer) and one for dbmapreduce
# (the grouping driver).  The dbcolstats call is rendered as a Perl code
# string and passed to dbmapreduce via --code.  The resulting
# dbmapreduce filter is stored in $self->{_mapreducer} and its setup
# method is invoked; that call's value is this sub's return value.
#
# Exits through pod2usage(2) when no target column was given.
sub setup ($) {
    my($self) = @_;

    pod2usage(2) if (!defined($self->{_target_column}));

    #
    # First, dbcolstats:
    #
    my @dbcolstats_argv = (qw(--nolog));
    # --include-non-numeric and --median are negatable options, so an
    # explicit --nomedian leaves a defined-but-false value behind:
    # test truth, not definedness.
    push(@dbcolstats_argv, '--include-non-numeric')
        if ($self->{_include_non_numeric});
    push(@dbcolstats_argv, '--confidence', $self->{_confidence_fraction})
        if (defined($self->{_confidence_fraction}));
    push(@dbcolstats_argv, '--format', $self->{_format})
        if (defined($self->{_format}));
    push(@dbcolstats_argv, '--median')
        if ($self->{_median});
    push(@dbcolstats_argv, '--quantile', $self->{_quantile})
        if (defined($self->{_quantile}));
    push(@dbcolstats_argv, '--tmpdir', $self->{_tmpdir})
        if (defined($self->{_tmpdir}));
    push(@dbcolstats_argv, '--parallelism', $self->{_max_parallelism})
        if (defined($self->{_max_parallelism}));
    # The target column must come last; it is appended when the
    # code string is assembled below.

    #
    # Next, dbmapreduce:
    #
    # sigh, noclose/saveoutput didn't work in the general case,
    # so finish() writes the trailer itself unless the optimized
    # path below turns on _child_saves_output.
    my @dbmapreduce_argv = (qw(--nolog --noclose --copy-fs));
    # pass input and output
    push(@dbmapreduce_argv, '--header', $self->{_header})
        if (defined($self->{_header}));
    push(@dbmapreduce_argv, '--input', $self->{_input});
    push(@dbmapreduce_argv, '--output', $self->{_output});
    # the rest
    push(@dbmapreduce_argv, ('-S') x $self->{_pre_sorted})
        if ($self->{_pre_sorted});
    # The limit on parallel reducers belongs to dbmapreduce.
    # (This used to be pushed onto the dbcolstats list a second time,
    # so dbmapreduce never saw it.)
    push(@dbmapreduce_argv, '--parallelism', $self->{_max_parallelism})
        if (defined($self->{_max_parallelism}));
    push(@dbmapreduce_argv, '--key', $self->{_key_column})
        if (defined($self->{_key_column}));

    #
    # Optimize: use dbcolstats -k if we can.
    # That requires pre-sorted input, no median/quantile,
    # and an explicit key column to hand to dbcolstats (without one we
    # fall back to the general path and let dbmapreduce pick its default).
    #
    my $needs_ranking = ($self->{_median} || defined($self->{_quantile}));
    $self->{_multi_aware_reducer} =
        ($self->{_pre_sorted} && !$needs_ranking && defined($self->{_key_column}))
        ? 1 : undef;
    if ($self->{_multi_aware_reducer}) {
        push(@dbcolstats_argv, '--key', $self->{_key_column});
        push(@dbmapreduce_argv, '--multiple-ok');
        $self->{_child_saves_output} = 1;
        push(@dbmapreduce_argv, '--noclose', '--saveoutput', \$self->{_out});
    }

    #
    # Render the reducer invocation as Perl source.
    # Each argument becomes a single-quoted literal, so backslashes and
    # single quotes inside it must be escaped to survive intact.
    #
    my $quote = sub {
        my($word) = @_;
        $word =~ s/([\\'])/\\$1/g;
        return "'" . $word . "'";
    };
    my $dbcolstats_code = 'dbcolstats('
        . join(', ', map($quote->($_), @dbcolstats_argv, $self->{_target_column}))
        . ')';
    push(@dbmapreduce_argv, '--code', $dbcolstats_code);

    # Debugging goes to stderr so it cannot end up in the data stream.
    print STDERR join(" ", @dbmapreduce_argv), "\n" if ($self->{_debug});

    $self->{_mapreducer} = Fsdb::Filter::dbmapreduce->new(@dbmapreduce_argv);
    $self->{_mapreducer}->setup;
}
354              
355             =head2 run
356              
357             $filter->run();
358              
359             Internal: run over each row.
360              
361             =cut
# Do the work: simply delegate to the dbmapreduce filter that setup()
# stored in $self->{_mapreducer}, returning whatever its run method returns.
sub run ($) {
    my $self = shift @_;
    my $mapreducer = $self->{_mapreducer};
    return $mapreducer->run;
}
366              
367              
368             =head2 finish
369              
370             $filter->finish();
371              
372             Internal: write trailer.
373              
374             =cut
# Finish the nested dbmapreduce filter, then make sure our own trailer
# gets written.
#
# The trailer is "#" . compute_program_log() . "\n".  When the child was
# asked to save its output (_child_saves_output), the superclass finish
# is called.  Otherwise the trailer is written by hand, in a way that
# depends on what kind of thing $self->{_output} is.
sub finish ($) {
    my $self = shift @_;

    $self->{_mapreducer}->finish;

    # xxx: hack hack hack
    # --saveoutput didn't work, so fake it up here
    my $trailer = "#" . $self->compute_program_log() . "\n";

    if ($self->{_child_saves_output}) {
        $self->SUPER::finish();
    } else {
        my $sink = $self->{_output};
        my $sink_class = ref($sink);

        if ($sink_class =~ /^Fsdb::IO/) {
            # an Fsdb writer: the trailer is a comment line
            $sink->write_comment($trailer);
            $sink->close if ($self->{_close});
        } elsif ($sink_class =~ /^Fsdb::BoundedQueue/) {
            # a queue: enqueue the text; an undef entry follows when closing
            $sink->enqueue($trailer);
            $sink->enqueue(undef) if ($self->{_close});
        } elsif ($sink eq '-') {
            # stdout
            print $trailer;
        } else {
            # assume file handle
            $sink->print($trailer);
            $sink->close if ($self->{_close});
        }
    }
}
402              
403             =head1 AUTHOR and COPYRIGHT
404              
405             Copyright (C) 1991-2015 by John Heidemann
406              
407             This program is distributed under terms of the GNU general
408             public license, version 2. See the file COPYING
409             with the distribution for details.
410              
411             =cut
412              
413             1;