File Coverage

blib/lib/Fsdb/Filter/dbfilediff.pm
Criterion Covered Total %
statement 32 123 26.0
branch 2 36 5.5
condition 0 12 0.0
subroutine 7 17 41.1
pod 5 5 100.0
total 46 193 23.8


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2              
3             #
4             # dbfilediff.pm
5             # Copyright (C) 2012-2015 by John Heidemann
6             # $Id: 3221524c041f6e1037daba3af5e80a4df19feb6d $
7             #
8             # This program is distributed under terms of the GNU general
9             # public license, version 2. See the file COPYING
10             # in $dblibdir for details.
11             #
12              
13             package Fsdb::Filter::dbfilediff;
14              
15             =head1 NAME
16              
17             dbfilediff - compare two fsdb tables
18              
19             =head1 SYNOPSIS
20              
21             dbfilediff [-Eq] [-N diff_column_name] --input table1.fsdb --input table2.fsdb
22              
23             OR
24              
25             cat table1.fsdb | dbfilediff [-sq] --input table2.fsdb
26              
27             =head1 DESCRIPTION
28              
29             Dbfilediff compares two Fsdb tables, row by row.
30             Unlike Unix L, this program assumes the files are identical
31             line-by-line and we compare fields.
32             Thus, insertion of one extra row will result in all
33             subsequent lines being marked different.
34              
35             By default, I columns must be unique.
36             (At some point, support to specific specific columns may be added.)
37              
38             Output is a new table with a new column C
39             (or something else if the C<-N> option is given),
40             "-" and "+" for the first and second non-equal rows,
41             "=" for matching lines,
42             or "~" if they are equal with epsilon numerics
43             (in which case only the second row is included).
44             Unlike Unix L, we output I rows (the "=" lines),
45             not just diffs (the C<--quiet> option suppresses this output).
46              
47             Optionally, with C<-E> it will do a "epsilon numeric" comparision,
48             to account for things like variations in different computer's
49             floating point precision and differences in printf output.
50              
51             Epsilon comparision is asymmetric, in that it assumes the first
52             input is correct an allows the second input to vary,
53             but not the reverse.
54              
55             Because two tables are required,
56             input is typically in files.
57             Standard input is accessible by the file "-".
58              
59             =head1 OPTIONS
60              
61             =over 4
62              
63             =item B<-E> or B<--epsilon>
64              
65             Do epsilon-numeric comparison. (Described above.)
66              
67             Epsilon-comparision is only done on columns that look like floating
68             point numbers, not on strings or integers.
69             Epsilon comparision allows the last digit to vary by 1,
70             or for there to be one extra digit of precision,
71             but only for floating point numbers.
72              
73             Rows that are within epsilon are not considered different
74             for purposes of the exit code.
75              
76             =item B<--exit>
77              
78             Exit with a status of 1 if some differences were found.
79             (By default, the exit status is 0 with or without differences
80             if the file is processed successfully.)
81              
82             =item B<-N> on B<--new-name>
83              
84             Specify the name of the C column, if any.
85             (Default is C.)
86              
87             =item B<-q> or B<--quiet>
88              
89             Be quiet, suppressing output for identical rows.
90             (This behavior is different from Unix L
91             where C<-q> suppresses I output.)
92             If repeated, omits epsilon-equivalent rows.
93              
94             =back
95              
96              
97             =for comment
98             begin_standard_fsdb_options
99              
100             This module also supports the standard fsdb options:
101              
102             =over 4
103              
104             =item B<-d>
105              
106             Enable debugging output.
107              
108             =item B<-i> or B<--input> InputSource
109              
110             Read from InputSource, typically a file name, or C<-> for standard input,
111             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
112              
113             =item B<-o> or B<--output> OutputDestination
114              
115             Write to OutputDestination, typically a file name, or C<-> for standard output,
116             or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.
117              
118             =item B<--autorun> or B<--noautorun>
119              
120             By default, programs process automatically,
121             but Fsdb::Filter objects in Perl do not run until you invoke
122             the run() method.
123             The C<--(no)autorun> option controls that behavior within Perl.
124              
125             =item B<--help>
126              
127             Show help.
128              
129             =item B<--man>
130              
131             Show full manual.
132              
133             =back
134              
135             =for comment
136             end_standard_fsdb_options
137              
138              
139             =head1 SAMPLE USAGE
140              
141             =head2 Input:
142              
143             #fsdb event clock absdiff pctdiff
144             _null_getpage+128 815812813.281756 0 0
145             _null_getpage+128 815812813.328709 0.046953 5.7554e-09
146             _null_getpage+128 815812813.353830 0.025121 3.0793e-09
147             _null_getpage+128 815812813.357169 0.0033391 4.0929e-10
148              
149             And in the file F:
150              
151             #fsdb event clock absdiff pctdiff
152             _null_getpage+128 815812813.281756 0 0
153             _null_getpage+128 815812813.328709 0.046953 5.7554e-09
154             _null_getpage+128 815812813.353830 0.025121 3.0793e-09
155             _null_getpage+128 815812813.357169 0.003339 4.0929e-10
156              
157              
158             =head2 Command:
159              
160             cat TEST/dbfilediff_ex.in | dbfilediff -i - -i TEST/dbfilediff_ex.in-2
161              
162             =head2 Output:
163              
164             #fsdb event clock absdiff pctdiff diff
165             _null_getpage+128 815812813.281756 0 0 =
166             _null_getpage+128 815812813.328709 0.046953 5.7554e-09 =
167             _null_getpage+128 815812813.353830 0.025121 3.0793e-09 =
168             _null_getpage+128 815812813.357169 0.0033391 4.0929e-10 -
169             _null_getpage+128 815812813.357169 0.003339 4.0929e-10 +
170             # | dbfilediff --input TEST/dbfilediff_ex.in-2
171              
172             By comparision, if one adds the C<-s> option, then all rows will pass as equal.
173              
174             =head1 SEE ALSO
175              
176             L.
177             L.
178             L.
179              
180             L, L, and L are similar but different.
181             L computes row-by-row differences for a column,
182             L eliminates rows that have no differences,
183             and L compares fields of two files.
184              
185              
186              
187             =head1 CLASS FUNCTIONS
188              
189             =cut
190              
191             @ISA = qw(Fsdb::Filter);
192             ($VERSION) = 2.0;
193              
194 2     2   17387 use strict;
  2         2  
  2         53  
195 2     2   5 use Carp qw(croak);
  2         2  
  2         82  
196 2     2   423 use Pod::Usage;
  2         34072  
  2         163  
197             # use Regexp::Common;
198              
199 2     2   428 use Fsdb::Filter;
  2         4  
  2         41  
200 2     2   9 use Fsdb::IO::Reader;
  2         2  
  2         30  
201 2     2   6 use Fsdb::IO::Writer;
  2         3  
  2         1700  
202              
203              
204             =head2 new
205              
206             $filter = new Fsdb::Filter::dbfilediff(@arguments);
207              
208             Create a new dbfilediff object, taking command-line arguments.
209              
210             =cut
211              
212             sub new ($@) {
213 0     0 1 0 my $class = shift @_;
214 0         0 my $self = $class->SUPER::new(@_);
215 0         0 bless $self, $class;
216 0         0 $self->set_defaults;
217 0         0 $self->parse_options(@_);
218 0         0 $self->SUPER::post_new();
219 0         0 return $self;
220             }
221              
222              
223             =head2 set_defaults
224              
225             $filter->set_defaults();
226              
227             Internal: set up defaults.
228              
229             =cut
230              
231             sub set_defaults ($) {
232 0     0 1 0 my($self) = @_;
233 0         0 $self->SUPER::set_defaults();
234 0         0 $self->{_info}{input_count} = 2;
235 0         0 $self->{_epsilon_numerics} = undef;
236 0         0 $self->{_exit_one_if_diff} = undef;
237 0         0 $self->{_destination_column} = 'diff';
238 0         0 $self->{_quiet} = 0;
239             }
240              
241             =head2 parse_options
242              
243             $filter->parse_options(@ARGV);
244              
245             Internal: parse command-line arguments.
246              
247             =cut
248              
249             sub parse_options ($@) {
250 0     0 1 0 my $self = shift @_;
251              
252 0         0 my(@argv) = @_;
253             $self->get_options(
254             \@argv,
255 0     0   0 'help|?' => sub { pod2usage(1); },
256 0     0   0 'man' => sub { pod2usage(-verbose => 2); },
257 0     0   0 'a|all!' => sub { $self->{_join_type} = 'outer'; },
258             'autorun!' => \$self->{_autorun},
259             'close!' => \$self->{_close},
260             'd|debug+' => \$self->{_debug},
261             'E|epsilon!' => \$self->{_epsilon_numerics},
262             'exit!' => \$self->{_exit_one_if_diff},
263 0     0   0 'i|input=s@' => sub { $self->parse_io_option('inputs', @_); },
264             'log!' => \$self->{_logprog},
265             'N|new-name=s' => \$self->{_destination_column},
266 0     0   0 'o|output=s' => sub { $self->parse_io_option('output', @_); },
267             'q|quiet+' => \$self->{_quiet},
268 0 0       0 ) or pod2usage(2);
269 0 0       0 croak $self->{_prog} . ": internal error, extra arguments.\n"
270             if ($#argv != -1);
271             }
272              
273             =head2 setup
274              
275             $filter->setup();
276              
277             Internal: setup, parse headers.
278              
279             =cut
280              
281             sub setup ($) {
282 0     0 1 0 my($self) = @_;
283              
284 0         0 $self->setup_exactly_two_inputs;
285 0         0 $self->finish_io_option('inputs', -comment_handler => undef);
286 0         0 $self->finish_io_option('output', -clone => $self->{_ins}[0], -outputheader => 'delay');
287             $self->{_out}->col_create($self->{_destination_column})
288 0 0       0 or croak $self->{_prog} . ": cannot create column " . $self->{_destination_column} . " (maybe it already existed?)\n";
289              
290              
291             croak $self->{_prog} . ": input streams have different schemas; cannot merge\n"
292 0 0       0 if ($self->{_ins}[0]->compare($self->{_ins}[1]) ne 'identical');
293             }
294              
295             =head2 _find_epsilon
296              
297             ($value, $epsilon, $sig_figs) = _find_epsilon($fp)
298              
299             Return a numeric VALUE and an EPSILON that reflects its significant figures
300             with possible rounding error.
301              
302             =cut
303             sub _find_epsilon {
304 8     8   5434 my($v) = @_;
305              
306             # use Regexp::Common;
307             # # (now in-lined since it's unlikely floating point numbers will change)
308             # my $real_regexp = $RE{num}{real}{-keep};
309             # $v =~ /$real_regexp/;
310 8         29 $v =~ /((?i)([+-]?)((?=[.]?[0123456789])([0123456789]*)(?:([.])([0123456789]{0,}))?)(?:([E])(([+-]?)([0123456789]+))|))/;
311 8         15 my($match, $significand) = ($1, $3); # these are STRINGS
312 8 50       16 return undef if (!defined($match)); # non-numeric
313             #
314             # Need to convert significand to epsilon.
315             #
316             # first, find significant digits in the string (non-trivial)
317             # then, adjust that by the exponent.
318             #
319             # For test cases, see TEST/find_epsilon.t.
320             #
321 8         8 my($figs) = $significand;
322 8         15 $figs =~ s/\.//;
323 8         17 $figs =~ s/^0*//g; # longest match
324 8 50       15 $figs = '0' if ($figs eq '');
325 8         9 my($sig_figs) = length($figs);
326              
327 8         58 my($reformat) = sprintf("%e", $v);
328 8         23 my($log10) = ($reformat =~ /e([-+]\d+)$/);
329 8         10 $log10 += 1; # force numeric and account for the digit before the decimal
330              
331 8         23 my($epsilon) = 10**($log10 - $sig_figs);
332              
333 8         25 return($v, $epsilon, $sig_figs, $log10);
334             }
335              
336             =head2 run
337              
338             $filter->run();
339              
340             Internal: run over each row.
341              
342             =cut
343             sub run ($) {
344 0     0 1   my($self) = @_;
345              
346 0           my @fastpath_subs;
347 0           foreach (0..1) {
348 0           $fastpath_subs[$_] = $self->{_ins}[$_]->fastpath_sub();
349             };
350 0           my $out_fastpath_sub = $self->{_out}->fastpath_sub();
351              
352 0           my $epsilon_numerics = $self->{_epsilon_numerics};
353              
354 0           my $difference_count = 0;
355 0           my($f0, $f1);
356             # prime the pump
357 0           $f0 = &{$fastpath_subs[0]}();
  0            
358 0           $f1 = &{$fastpath_subs[1]}();
  0            
359 0           for (;;) {
360 0 0 0       last if (!defined($f0) && !defined($f1));
361 0 0         if (!defined($f0)) {
362             # 0 is done, but 1 lives on: drain it
363 0           $difference_count++;
364 0           push(@$f1, "+");
365 0           &$out_fastpath_sub($f1);
366 0           $f1 = &{$fastpath_subs[1]}();
  0            
367 0           next;
368             };
369 0 0         if (!defined($f1)) {
370             # vice versa
371 0           $difference_count++;
372 0           push(@$f0, "-");
373 0           &$out_fastpath_sub($f0);
374 0           $f0 = &{$fastpath_subs[0]}();
  0            
375 0           next;
376             };
377             # diff them
378 0           my $eq = 1;
379 0           foreach (0..$#{$f0}) {
  0            
380 0 0         if ($f0->[$_] ne $f1->[$_]) {
381 0 0 0       if ($epsilon_numerics && $f0->[$_] =~ /\./) {
382 0           my($g0, $epsilon) = _find_epsilon($f0->[$_]);
383 0           my($g1) = _find_epsilon($f1->[$_]);
384 0 0 0       if (!defined($g0) || !defined($g1)) {
385 0           $eq = undef;
386 0           last;
387             # non-numeric compare
388             };
389             #
390             # epsilon handles the print-level differences,
391             # BUT we still get floating point representation
392             # problems. For example, 7.22 vs 7.23
393             # has a difference of 0.0100000000000007,
394             # but an epsilon of 0.01
395             # (because computers work in binary).
396             #
397             # Fix: increase epsilon by
398             # its own "epsilon" of 1e-6 :-)
399             # (because single precision IEEE floating
400             # point has about 7 digits of precision).
401             #
402 0           $epsilon *= 1.000001;
403 0 0         if (abs($g0 - $g1) > $epsilon) {
404 0           $eq = undef;
405 0           last;
406             # sloppy numeric compare fails;
407             # non-numeric compare
408             };
409 0           $eq = 'epsilon';
410             # sloppy numeric compare succeeds; keep going
411             } else {
412 0           $eq = undef;
413 0           last;
414             };
415             };
416             };
417 0 0         if (defined($eq)) {
418 0 0         if ($eq eq 'epsilon') {
419 0           push(@$f1, "~");
420 0 0         &$out_fastpath_sub($f1) if ($self->{_quiet} <= 1);
421             } else {
422 0           push(@$f1, "=");
423 0 0         &$out_fastpath_sub($f1) if ($self->{_quiet} == 0);
424             };
425             } else {
426 0           $difference_count++;
427 0           push(@$f0, "-");
428 0           &$out_fastpath_sub($f0);
429 0           push(@$f1, "+");
430 0           &$out_fastpath_sub($f1);
431             };
432             # continue
433 0           $f0 = &{$fastpath_subs[0]}();
  0            
434 0           $f1 = &{$fastpath_subs[1]}();
  0            
435             };
436 0 0 0       if ($self->{_exit_one_if_diff} && $difference_count > 0) {
437             # unusual for Fsdb.
438 0           exit(1);
439             };
440             }
441              
442              
443             =head1 AUTHOR and COPYRIGHT
444              
445             Copyright (C) 2012-2015 by John Heidemann
446              
447             This program is distributed under terms of the GNU general
448             public license, version 2. See the file COPYING
449             with the distribution for details.
450              
451             =cut
452              
453             1;