File Coverage

blib/lib/Fsdb/Filter/dbmultistats.pm
Criterion Covered Total %
statement 24 99 24.2
branch 0 48 0.0
condition 0 3 0.0
subroutine 8 18 44.4
pod 6 6 100.0
total 38 174 21.8


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             #
4             # dbmultistats.pm
5             # Copyright (C) 1991-2015 by John Heidemann
6             # $Id: 85a9faaa887a82737100dceee7013e2894b800e1 $
7             #
8             # This program is distributed under terms of the GNU general
9             # public license, version 2. See the file COPYING
10             # in $dblibdir for details.
11             #
12              
13             package Fsdb::Filter::dbmultistats;
14              
15             =head1 NAME
16              
17             dbmultistats - run dbcolstats over each group of inputs identified by some key
18              
19             =head1 SYNOPSIS
20              
21             $0 [-dm] [-c ConfidencePercent] [-f FormatForm] [-q NumberOfQuartiles] -k KeyField ValueField
22              
23             =head1 DESCRIPTION
24              
25             The input table is grouped by KeyField,
26             then we compute a separate set of column statistics on ValueField
27             for each group with a unique key.
28              
29             Assumptions and requirements
30             are the same as L<dbcolstats>
31             (this program is just a wrapper around that program):
32              
33             By default, data can be provided in arbitrary order
34             and the program consumes O(number of unique tags) memory,
35             and O(size of data) disk space.
36              
37             With the -S option, data must arrive grouped by tags (not necessarily sorted),
38             and the program consumes O(number of tags) memory and no disk space.
39             The program will check and abort if this precondition is not met.
40              
41             With two -S's, program consumes O(1) memory, but doesn't verify
42             that the data-arrival precondition is met.
43              
44             (Note that these semantics are exactly like
45             dbmapreduce -k KeyField -- dbcolstats ValueField
46             L<dbmultistats> provides a simpler API that passes
47             through statistics-specific arguments
48             and is optimized when data is pre-sorted and there
49             are no quartiles or medians.)
50              
51             =head1 OPTIONS
52              
53             Options are the same as L<dbcolstats>.
54              
55             =over 4
56              
57             =item B<-k> or B<--key> KeyField
58              
59             specify which column is the key for grouping (default: the first column)
60              
61             =item B<--output-on-no-input>
62              
63             Enables null output (all fields are "-", n is 0)
64             if we get input with a schema but no records.
65             Without this option, just output the schema but no rows.
66             Default: no output if no input.
67              
68             =item B<-a> or B<--include-non-numeric>
69              
70             Compute stats over all records (treat non-numeric records
71             as zero rather than just ignoring them).
72              
73             =item B<-c FRACTION> or B<--confidence FRACTION>
74              
75             Specify FRACTION for the confidence interval.
76             Defaults to 0.95 for a 95% confidence factor.
77              
78             =item B<-f FORMAT> or B<--format FORMAT>
79              
80             Specify a L<printf(3)>-style format for output statistics.
81             Defaults to C<%.5g>.
82              
83             =item B<-m> or B<--median>
84              
85             Compute median value. (Will sort data if necessary.)
86             (Median is the quantile for N=2.)
87              
88             =item B<-q N> or B<--quantile N>
89              
90             Compute quantiles (quartiles when N is 4),
91             or arbitrary quantiles for other values of N,
92             that is, the scores at each 1/Nth of the way across the population.
93              
94             =item B<-S> or B<--pre-sorted>
95              
96             Assume data is already sorted.
97             With one -S, we check and confirm this precondition.
98             When repeated, we skip the check.
99              
100             =item B<-T TmpDir>
101              
102             where to put temporary data.
103             Only used if median or quantiles are requested.
104             Also uses environment variable TMPDIR, if -T is
105             not specified.
106             Default is /tmp.
107              
108             =item B<--parallelism=N> or B<-j N>
109              
110             Allow up to N reducers to run in parallel.
111             Default is the number of CPUs in the machine.
112              
113             =back
114              
115              
116             =for comment
117             begin_standard_fsdb_options
118              
119             This module also supports the standard fsdb options:
120              
121             =over 4
122              
123             =item B<-d>
124              
125             Enable debugging output.
126              
127             =item B<-i> or B<--input> InputSource
128              
129             Read from InputSource, typically a file name, or C<-> for standard input,
130             or (if in Perl) an IO::Handle, Fsdb::IO, or Fsdb::BoundedQueue object.
131              
132             =item B<-o> or B<--output> OutputDestination
133              
134             Write to OutputDestination, typically a file name, or C<-> for standard output,
135             or (if in Perl) an IO::Handle, Fsdb::IO, or Fsdb::BoundedQueue object.
136              
137             =item B<--autorun> or B<--noautorun>
138              
139             By default, programs process automatically,
140             but Fsdb::Filter objects in Perl do not run until you invoke
141             the run() method.
142             The C<--(no)autorun> option controls that behavior within Perl.
143              
144             =item B<--help>
145              
146             Show help.
147              
148             =item B<--man>
149              
150             Show full manual.
151              
152             =back
153              
154             =for comment
155             end_standard_fsdb_options
156              
157              
158             =head1 SAMPLE USAGE
159              
160             =head2 Input:
161              
162             #fsdb experiment duration
163             ufs_mab_sys 37.2
164             ufs_mab_sys 37.3
165             ufs_rcp_real 264.5
166             ufs_rcp_real 277.9
167              
168             =head2 Command:
169              
170             cat DATA/stats.fsdb | dbmultistats -k experiment duration
171              
172             =head2 Output:
173              
174             #fsdb experiment mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n
175             ufs_mab_sys 37.25 0.070711 0.18983 0.6353 36.615 37.885 0.95 74.5 2775.1 37.2 37.3 2
176             ufs_rcp_real 271.2 9.4752 3.4938 85.13 186.07 356.33 0.95 542.4 1.4719e+05 264.5 277.9 2
177             # | /home/johnh/BIN/DB/dbmultistats experiment duration
178              
179              
180             =head1 SEE ALSO
181              
182             L<Fsdb>.
183             L<dbmapreduce>.
184             L<dbcolstats>.
185              
186              
187             =head1 CLASS FUNCTIONS
188              
189             =cut
190              
191             @ISA = qw(Fsdb::Filter);
192             $VERSION = 2.0;
193              
194 1     1   5173 use strict;
  1         2  
  1         27  
195 1     1   6 use Pod::Usage;
  1         2  
  1         77  
196 1     1   7 use Carp;
  1         2  
  1         42  
197              
198 1     1   6 use Fsdb::Filter;
  1         1  
  1         16  
199 1     1   349 use Fsdb::Filter::dbmapreduce;
  1         3  
  1         32  
200 1     1   338 use Fsdb::Filter::dbcolstats;
  1         2  
  1         27  
201 1     1   7 use Fsdb::IO::Reader;
  1         2  
  1         15  
202 1     1   4 use Fsdb::IO::Writer;
  1         2  
  1         747  
203              
204              
205             =head2 new
206              
207             $filter = new Fsdb::Filter::dbmultistats(@arguments);
208              
209             Create a new dbmultistats object, taking command-line arguments.
210              
211             =cut
212              
213             sub new ($@) {
214 0     0 1   my $class = shift @_;
215 0           my $self = $class->SUPER::new(@_);
216 0           bless $self, $class;
217 0           $self->set_defaults;
218 0           $self->parse_options(@_);
219 0           $self->SUPER::post_new();
220 0           return $self;
221             }
222              
223              
224             =head2 set_defaults
225              
226             $filter->set_defaults();
227              
228             Internal: set up defaults.
229              
230             =cut
231              
232             sub set_defaults ($) {
233 0     0 1   my($self) = @_;
234 0           $self->SUPER::set_defaults();
235 0           $self->{_key_column} = undef;
236 0           $self->{_pre_sorted} = 0;
237 0           $self->{_confidence_fraction} = undef;
238 0           $self->{_format} = undef;
239 0           $self->{_quantile} = undef;
240 0           $self->{_median} = undef; # special case: renames the output field
241 0           $self->{_max_parallelism} = undef;
242 0           $self->{_include_non_numeric} = undef;
243 0           $self->{_header} = undef;
244 0           $self->{_output_on_no_input} = undef;
245             }
246              
247             =head2 parse_options
248              
249             $filter->parse_options(@ARGV);
250              
251             Internal: parse command-line arguments.
252              
253             =cut
254              
255             sub parse_options ($@) {
256 0     0 1   my $self = shift @_;
257              
258 0           my(@argv) = @_;
259             $self->get_options(
260             \@argv,
261 0     0     'help|?' => sub { pod2usage(1); },
262 0     0     'man' => sub { pod2usage(-verbose => 2); },
263             'a|include-non-numeric!' => \$self->{_include_non_numeric},
264             'autorun!' => \$self->{_autorun},
265             'close!' => \$self->{_close},
266             'c|confidence=f' => \$self->{_confidence_fraction},
267             'd|debug+' => \$self->{_debug},
268             'f|format=s' => \$self->{_format},
269             'header=s' => \$self->{_header},
270 0     0     'i|input=s' => sub { $self->parse_io_option('input', @_); },
271             'j|parallelism=i' => \$self->{_max_parallelism},
272             'k|key=s' => \$self->{_key_column},
273             'log!' => \$self->{_logprog},
274             'm|median!' => \$self->{_median},
275 0     0     'o|output=s' => sub { $self->parse_io_option('output', @_); },
276             'output-on-no-input!' => \$self->{_output_on_no_input},
277             'q|quantile=i' => \$self->{_quantile},
278             'S|pre-sorted+' => \$self->{_pre_sorted},
279             'T|tmpdir|tempdir=s' => \$self->{_tmpdir},
280 0 0         ) or pod2usage(2);
281 0           $self->parse_target_column(\@argv);
282             }
283              
284             =head2 setup
285              
286             $filter->setup();
287              
288             Internal: setup, parse headers.
289              
290             Pass the right options to dbmapreduce and dbcolstats.
291              
292             =cut
293              
294             sub setup ($) {
295 0     0 1   my($self) = @_;
296              
297 0 0         pod2usage(2) if (!defined($self->{_target_column}));
298              
299             #
300             # First, dbcolstats:
301             #
302 0           my @dbcolstats_argv = (qw(--no-output-on-no-input --nolog));
303             push(@dbcolstats_argv, '--include-non-numeric')
304 0 0         if (defined($self->{_include_non_numeric}));
305             push(@dbcolstats_argv, '--confidence', $self->{_confidence_fraction})
306 0 0         if (defined($self->{_confidence_fraction}));
307             push(@dbcolstats_argv, '--format', $self->{_format})
308 0 0         if (defined($self->{_format}));
309             push(@dbcolstats_argv, '--median')
310 0 0         if (defined($self->{_median}));
311             push(@dbcolstats_argv, '--quantile', $self->{_quantile})
312 0 0         if (defined($self->{_quantile}));
313             push(@dbcolstats_argv, '--tmpdir', $self->{_tmpdir})
314 0 0         if (defined($self->{_tmpdir}));
315             push(@dbcolstats_argv, '--parallelism', $self->{_max_parallelism})
316 0 0         if (defined($self->{_max_parallelism}));
317             # last one!
318             # push (@dbcolstats_argv, $self->{_target_column});
319             # Added by hand below.
320              
321             # sigh, noclose/saveoutput didn't work
322 0           my @dbmapreduce_argv = (qw(--nolog --noclose --copy-fs)); # --noclose --saveoutput), \$self->{_out});
323             # push(@dbmapreduce_argv, qw(--noclose --saveoutput), \$self->{_out});
324             # $self->{_child_saves_output} = 1;
325             # pass input and output
326             push (@dbmapreduce_argv, "--header", $self->{_header})
327 0 0         if (defined($self->{_header}));
328 0           push (@dbmapreduce_argv, "--input", $self->{_input});
329 0           push (@dbmapreduce_argv, "--output", $self->{_output});
330             # the rest
331             push (@dbmapreduce_argv, ("-S") x $self->{_pre_sorted})
332 0 0         if ($self->{_pre_sorted});
333             push (@dbcolstats_argv, '--parallelism', $self->{_max_parallelism})
334 0 0         if (defined($self->{_max_parallelism}));
335             push (@dbmapreduce_argv, "--key", $self->{_key_column})
336 0 0         if (defined($self->{_key_column}));
337              
338             #
339             # Optimize: use dbcolstats -k if we can
340 0           $self->{_multi_aware_reducer} = 1;
341 0 0 0       $self->{_multi_aware_reducer} = undef if (defined($self->{_median}) || defined($self->{_quantile}));
342 0 0         $self->{_multi_aware_reducer} = undef if (!$self->{_pre_sorted});
343 0 0         if ($self->{_multi_aware_reducer}) {
344 0           push(@dbcolstats_argv, '--key', $self->{_key_column});
345 0           push(@dbmapreduce_argv, '--multiple-ok');
346 0           $self->{_child_saves_output} = 1;
347 0           push(@dbmapreduce_argv, '--noclose', '--saveoutput', \$self->{_out});
348             };
349              
350              
351 0           my $dbcolstats_code = 'dbcolstats(';
352 0           foreach (@dbcolstats_argv) {
353 0           $dbcolstats_code .= "'$_', ";
354             };
355 0           $dbcolstats_code .= "'" . $self->{_target_column} . "')";
356 0           push (@dbmapreduce_argv, '--code', $dbcolstats_code);
357              
358 0 0         print join(" ", @dbmapreduce_argv) if ($self->{_debug});
359              
360 0           $self->{_mapreducer} = new Fsdb::Filter::dbmapreduce(@dbmapreduce_argv);
361 0           $self->{_mapreducer}->setup;
362             }
363              
364             =head2 run
365              
366             $filter->run();
367              
368             Internal: run over each row.
369              
370             =cut
371             sub run ($) {
372 0     0 1   my($self) = @_;
373 0           $self->{_mapreducer}->run;
374             }
375              
376              
377             =head2 finish
378              
379             $filter->finish();
380              
381             Internal: write trailer.
382              
383             =cut
384             sub finish ($) {
385 0     0 1   my($self) = @_;
386 0           $self->{_mapreducer}->finish;
387             # we need to add our trailer
388             # $self->SUPER::finish();
389             # xxx: hack hack hack
390             # --saveoutput didn't work, so fake it up here
391 0           my $post = "# " . $self->compute_program_log() . "\n";
392 0 0         if ($self->{_child_saves_output}) {
393 0           $self->SUPER::finish();
394             } else {
395 0 0         if (ref($self->{_output}) =~ /^Fsdb::IO/) {
    0          
    0          
396 0           $self->{_output}->write_comment($post);
397 0 0         $self->{_output}->close if ($self->{_close});
398             } elsif (ref($self->{_output}) =~ /^Fsdb::BoundedQueue/) {
399 0           $self->{_output}->enqueue($post);
400 0 0         $self->{_output}->enqueue(undef) if ($self->{_close});
401             } elsif ($self->{_output} eq '-') {
402             # stdout
403 0           print $post;
404             } else {
405             # assume file handle
406 0           $self->{_output}->print($post);
407 0 0         $self->{_output}->close if ($self->{_close});
408             };
409             };
410             }
411              
412             =head1 AUTHOR and COPYRIGHT
413              
414             Copyright (C) 1991-2015 by John Heidemann
415              
416             This program is distributed under terms of the GNU general
417             public license, version 2. See the file COPYING
418             with the distribution for details.
419              
420             =cut
421              
422             1;