File Coverage

blib/lib/DBIx/DataAudit.pm
Criterion Covered Total %
statement 30 149 20.1
branch 4 42 9.5
condition 2 40 5.0
subroutine 8 17 47.0
pod 10 10 100.0
total 54 258 20.9


line stmt bran cond sub pod time code
1             package DBIx::DataAudit;
2 3     3   4215 use strict;
  3         6  
  3         334  
3 3     3   48 use Carp qw(croak carp);
  3         5  
  3         598  
4 3     3   7472 use DBI;
  3         77995  
  3         397  
5 3     3   4381 use parent 'Class::Accessor';
  3         890  
  3         15  
6 3     3   9842 use vars '$VERSION';
  3         6  
  3         222  
7             $VERSION = '0.13';
8              
9             =head1 NAME
10              
11             DBIx::DataAudit - summarize column data for a table
12              
13             =head1 SYNOPSIS
14              
15             use DBIx::DataAudit;
16              
17             warn "Running audit for table $table";
18             my $audit = DBIx::DataAudit->audit( dsn => 'dbi:SQLite:dbname=test.sqlite', table => 'test' );
19             print $audit->as_text;
20              
21             # or
22             print $audit->as_html;
23              
24             This module provides a summary about the data contained in a table. It provides
25             the descriptive statistics for every column. It's surprising
26             how much bad data you find by looking at the minimum and maximum
27             values of a column alone.
28              
29             It tries to get the information in one table scan.
30              
31             =head1 HOW IT WORKS
32              
33             The module works by constructing an SQL statement that collects the information
34             about the columns in a single full table scan.
35              
36             =head1 COLUMN TRAITS
37              
38             You can specify which information is collected about every column by specifying the traits.
39             The hierarchy of traits is as follows:
40              
41             any < ordered < numeric
42             < string
43              
44             The following traits are collected for every column by default:
45              
46             =over 4
47              
48             =item * C
49              
50             Number of rows in the column
51              
52             =item * C
53              
54             Number of distinct values in the column
55              
56             =item * C
57              
58             Number of C values for the column
59              
60             =back
61              
62             For columns that are recognized as ordered, the following additional traits are collected:
63              
64             =over 4
65              
66             =item * C
67              
68             Minimum value for the column
69              
70             =item * C
71              
72             Maximum value for the column
73              
74             =back
75              
76             For columns that are recognized as numeric, the following additional traits are collected:
77              
78             =over 4
79              
80             =item * C
81              
82             Average value for the column
83              
84             =back
85              
86             For columns that are recognized as string, the following additional traits are collected:
87              
88             =over 4
89              
90             =item * C
91              
92             Number of values that consist only of blanks (C)
93              
94             =item * C
95              
96             Number of values that consist only of the empty string (C<''>)
97              
98             =item * C
99              
100             Number of values that consist only of the empty string (C<''>),
101             are blank (C) or are C
102              
103             =back
104              
105             =cut
106              
107             =head1 GLOBAL VARIABLES
108              
109             To customize some default behaviour, the some global variables
110             are defined. Read the source to find their names.
111              
112             =cut
113              
114 3     3   13 use vars qw'@default_traits %trait_type %trait_hierarchy $trait_inapplicable %sql_type_map';
  3         6  
  3         6181  
115              
116             @default_traits = qw[min max count values null avg blank empty missing ];
117              
118             %trait_type = (
119             count => ['any','count(%s)'],
120             values => ['any','count(distinct %s)'],
121             null => ['any','sum(case when %s is null then 1 else 0 end)'],
122             min => ['ordered','min(%s)'],
123             max => ['ordered','max(%s)'],
124             avg => ['numeric','avg(%s)'],
125             #modus => ['any','sum(1)group by %s'], # find the element that occurs the most
126             # Possibly with only a single table scan
127             blank => ['string',"sum(case when trim(%s)='' then 1 else 0 end)"],
128             empty => ['string',"sum(case when %s='' then 1 else 0 end)"],
129             missing => ['string',"sum(case when trim(%s)='' then 1 when %s is null then 1 else 0 end)"],
130             );
131              
132             %trait_hierarchy = (
133             any => [],
134             ordered => ['any'],
135             numeric => ['ordered','any'],
136             string => ['ordered','any'],
137             );
138              
139             $trait_inapplicable = 'NULL';
140              
141             %sql_type_map = (
142             BIGINT => 'numeric',
143             BOOLEAN => 'any',
144             CHAR => 'string',
145             'CHARACTER VARYING' => 'string',
146             DATETIME => 'ordered',
147             DATE => 'ordered',
148             DECIMAL => 'numeric',
149             ENUM => 'ordered',
150             INET => 'any',
151             INTEGER => 'numeric',
152             INT => 'numeric',
153             NUMERIC => 'numeric',
154             SMALLINT => 'numeric',
155             TEXT => 'string',
156             TIME => 'ordered',
157             'TIMESTAMP WITHOUT TIME ZONE' => 'ordered',
158             TIMESTAMP => 'ordered',
159             TINYINT => 'numeric',
160             'UNSIGNED BIGINT' => 'numeric',
161             VARCHAR => 'string',
162             );
163              
164             =head1 METHODS
165              
166             The class implements the following methods:
167              
168             =cut
169              
170             __PACKAGE__->mk_accessors(qw(table dbh dsn columns traits results where));
171              
172             =head2 C<< __PACKAGE__->audit ARGS >>
173              
174             Performs the data audit. Valid arguments are:
175              
176             =over 4
177              
178             =item * C
179              
180             Name of the table to audit. No default.
181              
182             =item * C
183              
184             Array reference to the traits. Default traits are
185              
186             min max count null avg blank empty missing
187              
188             =item * C
189              
190             Names of the columns to audit. Default are all columns of the table.
191              
192             =item * C
193              
194             Database handle. If missing, hopefully you have specified the C.
195              
196             =item * C
197              
198             DSN to use. Can be omitted if you pass in a valid C instead.
199              
200             =item * C
201              
202             Column information, in the same format as the DBI returns it.
203             By default, this will be read in via DBI.
204              
205             =back
206              
207             =cut
208              
209             sub audit {
210 0     0 1 0 my ($class, %args) = @_;
211              
212 0   0     0 $args{traits} ||= [ @default_traits ];
213 0 0       0 if (! @{$args{traits}}) {
  0         0  
214 0         0 $args{traits} = [ @default_traits ];
215             };
216 0   0     0 $args{dbh} ||= DBI->connect( $args{dsn}, undef, undef, {RaiseError => 1});
217              
218 0         0 my $self = \%args;
219 0         0 bless $self => $class;
220 0   0     0 $self->{columns} ||= [$self->get_columns];
221 0 0       0 if (! @{ $self->{columns}}) {
  0         0  
222 0         0 croak "Couldn't retrieve column information for table '$args{table}'. Does your DBD implement ->column_info?";
223             };
224 0   0     0 $self->{column_info} ||= $self->collect_column_info;
225              
226 0         0 $self
227             };
228              
229             =head2 C<< $audit->as_text RESULTS >>
230              
231             Returns a table drawn as text with the results.
232              
233             =cut
234              
235             sub as_text {
236 0     0 1 0 my ($self,$results) = @_;
237              
238 0         0 require Text::Table;
239 0         0 my $data = $self->template_data($results);
240 0         0 my $table = Text::Table->new( @{$data->{headings}} );
  0         0  
241 0         0 $table->load( @{$data->{rows}} );
  0         0  
242              
243 0         0 "Data analysis for $data->{table}:\n\n" . $table->table;
244             };
245              
246             =head2 C<< $audit->as_html RESULTS, TEMPLATE >>
247              
248             Returns a HTML page with the results.
249              
250             You can pass in a custom resultset or C if you want
251             the module to determine the results.
252              
253             You can pass in a custom (L) template
254             if you want fancier rendering.
255              
256             =cut
257              
258             sub as_html {
259 0     0 1 0 my ($self,$results,$template) = @_;
260 0         0 require Template;
261 0   0     0 $template ||= <
262             Data audit of table '[% table %]'
263            

Data audit of table '[% table %]'

264             [% FOR h IN headings %][%END%][% FOR v IN r %][%END%]
265            
266            
[%h%]
267            
268            
269             [% FOR r IN rows %]
270            
[%v FILTER html_entity%]
271             [% END %]
272            
273            
274            
275             TEMPLATE
276              
277 0         0 my $t = Template->new();
278 0         0 my $data = $self->template_data($results);
279              
280 0 0       0 $t->process(\$template,$data,\my $result)
281             || croak $t->error;
282 0         0 $result
283             };
284              
285             =head2 C<< $audit->template_data >>
286              
287             Returns a hash with the following three keys, suitable
288             for using with whatever templating system you have:
289              
290             =over 4
291              
292             =item *
293              
294             C - the name of the table
295              
296             =item *
297              
298             C - the headings of the columns
299              
300             =item *
301              
302             C - the values of the traits of every column
303              
304             =back
305              
306             =cut
307              
308             sub template_data {
309 0     0 1 0 my ($self,$results) = @_;
310 0   0     0 $results ||= $self->{results} || $self->run_audit;
      0        
311 0         0 my @results = @{ $results->[0] };
  0         0  
312              
313 0         0 my @headings = (@{ $self->traits });
  0         0  
314 0         0 my @rows;
315 0         0 for my $column (@{ $self->columns }) {
  0         0  
316 0         0 my @row = $column;
317 0         0 for my $trait (@headings) {
318 0         0 my $val = shift @results;
319 0 0       0 if (defined $val) {
320 0 0       0 if (length($val) > 20) {
321 0         0 $val = substr($val,0,20);
322             };
323 0         0 $val =~ s/[\x00-\x1f]/./g;
324             };
325 0 0       0 push @row, defined $val ? $val : 'n/a';
326             };
327 0         0 push @rows, \@row;
328             };
329              
330 0         0 my $res = {
331             table => $self->table,
332             headings => ['column',@headings],
333             rows => \@rows,
334             };
335             };
336              
337             =head2 C<< $audit->run_audit >>
338              
339             Actually runs the SQL in the database.
340              
341             =cut
342              
343             sub run_audit {
344 0     0 1 0 my ($self) = @_;
345              
346 0         0 my $sql = $self->get_sql;
347 0         0 $self->{results} = $self->dbh->selectall_arrayref($sql,{});
348             };
349              
350             =head2 C<< $audit->column_type COLUMN >>
351              
352             Returns the type for the column. The four valid types are C, C, C and C.
353              
354             =cut
355              
356             sub column_type {
357 0     0 1 0 my ($self,$column) = @_;
358 0 0       0 if (! $self->{column_info}) {
359 0         0 $self->{column_info} = $self->collect_column_info;
360             };
361 0         0 my $info = $self->{column_info};
362 0         0 map {
363 0         0 $_->{trait_type};
364 0         0 } grep { $_->{COLUMN_NAME} eq $column } @$info;
365             };
366              
367             =head2 C<< $audit->get_columns TABLE >>
368              
369             Returns the names of the columns for the table C.
370             By default, the value of C will be taken from the value
371             passed to the constructor C.
372              
373             =cut
374              
375             sub get_columns {
376 0     0 1 0 my ($self,$table) = @_;
377 0   0     0 $table ||= $self->table;
378 0 0       0 if (! $self->{column_info}) {
379 0         0 $self->{column_info} = $self->collect_column_info;
380             };
381 0         0 my $info = $self->{column_info};
382 0         0 my @sorted = @$info;
383              
384             # Order the columns in the "right" order, if possible
385 0 0 0     0 if (exists $sorted[0]->{ORDINAL_POSITION} && defined $sorted[0]->{ORDINAL_POSITION}) {
386 0         0 @sorted = sort { $a->{ORDINAL_POSITION} <=> $b->{ORDINAL_POSITION} } @sorted;
  0         0  
387             };
388 0         0 map {
389 0         0 $_->{COLUMN_NAME};
390             } @sorted;
391             };
392              
393             =head2 C<< $audit->collect_column_info TABLE >>
394              
395             Collects the information about the columns for the table C
396             from the DBI. By default, C will be taken from the
397             value passed to the constructor C.
398              
399             If your database driver does not implement the C<< ->column_info >>
400             method you are out of luck. A fatal error is raised by this method
401             if C<< ->column_info >> does not return anything.
402              
403             This method will raise warnings if it encounters a data type that
404             it doesn't know yet. You can either patch the
405             global variable C<%sql_type_map> to add the type or submit a patch
406             to me to add the type and its interpretation.
407              
408             =cut
409              
410             sub collect_column_info {
411 0     0 1 0 my ($self,$table) = @_;
412 0   0     0 $table ||= $self->table;
413 0         0 my $schema;
414 0 0       0 if ($table =~ s/^(.*)\.//) {
415 0         0 $schema = $1;
416             };
417 0         0 my $sth = $self->dbh->column_info(undef,$schema,$table,undef);
418 0 0       0 if (! $sth) {
419 0 0       0 if( $schema ) {
420 0         0 $schema= "$schema.";
421             } else {
422 0         0 $schema= '';
423             };
424 0         0 croak "Couldn't collect column information for table '$schema$table'. Does your DBD implement ->column_info?";
425             };
426 0         0 my $info = $sth->fetchall_arrayref({});
427              
428 0 0       0 if( !@$info ) {
429 0         0 croak "'$schema$table' seems to have no columns. Does your DBD implement ->column_info?";
430             };
431              
432 0         0 for my $i (@$info) {
433 0         0 my $sqltype = uc $i->{TYPE_NAME};
434              
435             # Fix for Pg - convert enum types to "ENUM":
436 0 0 0     0 if (exists $i->{pg_enum_values} && defined $i->{pg_enum_values}) {
437 0         0 $sqltype = 'ENUM';
438             };
439              
440 0 0       0 if (not exists $sql_type_map{ $sqltype }) {
441 0         0 warn sprintf q{Unknown SQL data type '%s' for column "%s.%s"; some traits will be unavailable\n},
442             $sqltype, $table, $i->{COLUMN_NAME};
443             };
444 0   0     0 $i->{trait_type} = $sql_type_map{ $sqltype } || 'any';
445             };
446              
447 0         0 $info
448             };
449              
450             =head2 C<< $audit->get_sql TABLE >>
451              
452             Creates the SQL statement to collect the information.
453             The default value for C will be the table passed
454             to the constructor C.
455              
456             If you encounter errors from your SQL engine, you may want
457             to print the result of this method out.
458              
459             =cut
460              
461             sub get_sql {
462 0     0 1 0 my ($self,$table) = @_;
463 0   0     0 $table ||= $self->table;
464 0         0 my @columns = @{ $self->columns };
  0         0  
465 0         0 my @traits = @{$self->traits};
  0         0  
466              
467 0         0 my @resultset;
468 0         0 for my $column (@columns) {
469 0         0 for my $trait (@traits) {
470 0         0 my $name = "${column}_${trait}";
471 0         0 $name =~ s/"//g; # unquote quoted columns
472 0 0       0 if ($self->trait_applies( $trait, $column )) {
473 0         0 my $tmpl = $trait_type{$trait}->[1];
474 0         0 $tmpl =~ s/%s/$column/g;
475 0         0 push @resultset, "$tmpl as $name";
476             } else {
477 0         0 push @resultset, "NULL as $name";
478             };
479             };
480             };
481 0 0       0 my $where = $self->where ? "WHERE " . $self->where : '';
482 0         0 my $statement = sprintf "SELECT %s FROM %s\n%s", join("\n ,", @resultset), $table, $where;
483 0         0 return $statement
484             };
485              
486             =head2 C<< $audit->trait_applies TRAIT, COLUMN >>
487              
488             Checks whether a trait applies to a column.
489              
490             A trait applies to a column if the trait type is C
491             or if it is the same type as the column type as returned
492             by C.
493              
494             The method will raise an error if it is passed an unknown
495             trait name. See the source code for how to add custom
496             traits.
497              
498             =cut
499              
500             sub trait_applies {
501 36     36 1 9370 my ($self, $trait, $column) = @_;
502 36 50       94 if (not exists $trait_type{$trait}) {
503 0         0 carp "Unknown trait '$trait'";
504             };
505 36   50     114 my $trait_type = $trait_type{$trait}->[0] || '';
506              
507 36 100       81 return 1 if ($trait_type eq 'any');
508              
509 24         68 (my $type) = $self->column_type($column);
510 24         69 my @subtypes = @{ $trait_hierarchy{ $type } };
  24         56  
511              
512 24         34 return scalar grep { $trait_type eq $_ } ($type,@subtypes);
  54         132  
513             };
514              
515             =head1 COMMAND LINE USAGE
516              
517             You can use this mail from the command line if you need a quick check of data:
518              
519             perl -MDBIx::DataAudit=dbi:SQLite:dbname=some/db.sqlite my_table [traits]
520              
521             This could also incredibly useful if you want a breakdown of a csv-file:
522              
523             perl -MDBIx::DataAudit=dbi:AnyData:dbname=some/db.sqlite my_table [traits]
524              
525             Unfortunately, that does not work yet, as I haven't found a convenient
526             oneliner way to make a CSV file appear as database.
527              
528             =cut
529              
530             sub import {
531 2     2   20 my ($class, $dsn) = @_;
532 2         5 (my $target) = caller;
533 2 50 33     53 if ($target eq 'main' and $dsn) {
534 0           my ($table,@traits) = @ARGV;
535 0           my @tables = split /,/,$table;
536 0 0         if (! @traits) {
537 0           @traits = @default_traits;
538             };
539 0           for my $table (@tables) {
540 0           my $self = $class->audit(dsn => $dsn, table => $table, traits => \@traits);
541 0           print "Data audit for table '$table'\n\n";
542 0           print $self->as_text;
543             };
544             };
545             };
546              
547             1;
548              
549             __END__