File Coverage

blib/lib/Data/Prepare.pm
Criterion Covered Total %
statement 118 123 95.9
branch 38 46 82.6
condition 26 29 89.6
subroutine 15 15 100.0
pod 10 10 100.0
total 207 223 92.8


line stmt bran cond sub pod time code
1             package Data::Prepare;
2              
3 1     1   112632 use strict;
  1         12  
  1         30  
4 1     1   5 use warnings;
  1         2  
  1         27  
5 1     1   5 use Exporter 'import';
  1         2  
  1         2045  
6              
7             our $VERSION = '0.004';
8             our @EXPORT_OK = qw(
9             cols_non_empty
10             non_unique_cols
11             key_to_index
12             make_pk_map
13             pk_col_counts
14             pk_match
15             chop_lines
16             chop_cols
17             header_merge
18             pk_insert
19             );
20              
21             sub chop_lines {
22 4     4 1 26473 my ($choplines, $data) = @_;
23 4         48 splice @$data, $_, 1 for @$choplines;
24             }
25              
26             sub chop_cols {
27 1     1 1 4 my ($chopcols, $data) = @_;
28 1         8 for my $c (sort {$b <=> $a} @$chopcols) {
  28         37  
29 15         922 splice @$_, $c, 1 for @$data;
30             }
31             }
32              
33             my %where2offset = (up => -1, self => 0, down => 1);
34             sub header_merge {
35 3     3 1 12 my ($merge_spec, $data) = @_;
36 3         11 for my $spec (@$merge_spec) {
37 10         37 my ($do, $l, $matchfrom, $matchto) = @$spec{qw(do line matchfrom matchto)};
38 10         52 my ($from_row, $to_row) = map $data->[$l + $where2offset{$spec->{$_}}], qw(from to);
39 10   100     42 my ($kept, $fromspec, $justone, $which_index) = ('', ($spec->{fromspec} || ''));
40 10 100 100     58 if (($spec->{tospec} || '') =~ /^index:(\d+)/) {
41 4         8 $justone = 1;
42 4         9 $which_index = $1;
43             }
44 10         29 for my $i (0..$#$to_row) {
45 76   100     223 $kept = $from_row->[$i] || $kept;
46 76 100 100     326 next if defined $matchto and $to_row->[$i] !~ /$matchto/;
47 50 100 100     156 next if defined $matchfrom and $from_row->[$i] !~ /$matchfrom/;
48 48 100       133 my $basic_from =
    100          
    100          
    100          
49             $fromspec eq 'lastnonblank' ? $kept :
50             $fromspec eq 'left' ? $from_row->[$i - 1] :
51             $fromspec =~ /^literal:(.*)/ ? $1 :
52             $from_row->[$justone ? $which_index : $i];
53 48 100       84 my $basic_to = $to_row->[$justone ? $which_index : $i];
54 48 0       124 my $what =
    50          
    100          
55             $do->[0] eq 'overwrite' ? $basic_from :
56             $do->[0] eq 'prepend' ? $basic_from . $do->[1] . $basic_to :
57             $do->[0] eq 'append' ? $basic_to . $do->[1] . $basic_from :
58             die "Unknown action '$do->[0]'";
59 48 100       78 if ($justone) {
60 4         7 $to_row->[$which_index] = $what;
61 4         16 last;
62             } else {
63 44         79 $to_row->[$i] = $what;
64             }
65             }
66             }
67             }
68              
69             sub pk_insert {
70 1     1 1 6 my ($spec, $data, $pk_map, $stopwords) = @_;
71 1         5 my ($ch, $lc, $pkc, $fb) = (@$spec{qw(column_heading local_column pk_column use_fallback)});
72 1         7 my $key_index = key_to_index($data->[0])->{$lc};
73 1 50       8 die "undef index for key '$lc'" if !defined $key_index;
74 1         3 unshift @{ $data->[0] }, $ch;
  1         4  
75 1         4 my $exact_map = $pk_map->{$pkc};
76 1         12 for my $row (@$data[ 1..$#$data ]) {
77 196         367 my $key_val = $row->[ $key_index ];
78 196         337 my $pkv = $exact_map->{ $key_val };
79 196 100 66     473 unshift(@$row, $pkv), next if defined $pkv or !$fb;
80 20         47 ($pkv) = pk_match($key_val, $pk_map, $stopwords);
81 20         64 unshift(@$row, $pkv);
82             }
83             }
84              
85             sub cols_non_empty {
86 1     1 1 17777 my ($data) = @_;
87 1         3 my @col_non_empty;
88 1         3 for my $line (@$data) {
89 199   100     4538 $col_non_empty[$_] ||= 0 for 0..$#$line;
90 199         1566 $col_non_empty[$_]++ for grep length $line->[$_], 0..$#$line;
91             }
92 1         11 @col_non_empty;
93             }
94              
95             sub non_unique_cols {
96 1     1 1 3 my ($data) = @_;
97 1         3 my ($line, %col2count) = $data->[0];
98 1         6 $col2count{$_}++ for @$line;
99 1         7 delete @col2count{ grep $col2count{$_} == 1, keys %col2count };
100 1         4 \%col2count;
101             }
102              
103             sub key_to_index {
104 4     4 1 13 my ($row) = @_;
105 4         143 +{ map +($row->[$_] => $_), 0..$#$row };
106             }
107              
108             sub make_pk_map {
109 1     1 1 16685 my ($data, $pk_colkey, $other_colkeys) = @_;
110 1         6 my $k2i = key_to_index($data->[0]);
111 1         8 my @invalid = grep !defined $k2i->{$_}, $pk_colkey, @$other_colkeys;
112 1 50       5 die "Invalid keys (@invalid)" if @invalid;
113 1         2 my $pk_colnum = $k2i->{$pk_colkey};
114 1         2 my %altcol2value2pk;
115 1         5 for my $i (1..$#$data) {
116 250         324 my $row = $data->[$i];
117 250 100       434 next if !length(my $pk_val = $row->[$pk_colnum]);
118 249         349 for my $alt_k ($pk_colkey, @$other_colkeys) {
119 1494 100       2867 next if !length(my $alt_v = $row->[$k2i->{$alt_k}]);
120 1387         2872 $altcol2value2pk{$alt_k}{$alt_v} = $pk_val;
121             }
122             }
123 1         12 \%altcol2value2pk;
124             }
125              
126             sub pk_col_counts {
127 1     1 1 5388 my ($data, $pk_map) = @_;
128 1         4 my $k2i = key_to_index($data->[0]);
129 1         3 my (%col2code2exact, @no_exact_match);
130 1         5 for my $i (1..$#$data) {
131 196         322 my ($row, $exact_match) = $data->[$i];
132 196         527 for my $possible_col (keys %$k2i) {
133 3724         5457 my $val = $row->[ $k2i->{$possible_col} ];
134 3724         10590 my @match_codes_yes = grep exists $pk_map->{$_}{$val}, keys %$pk_map;
135 3724         6157 $col2code2exact{$possible_col}{$_}++ for @match_codes_yes;
136 3724   100     7267 $exact_match ||= @match_codes_yes;
137             }
138 196 100       498 push @no_exact_match, $row if !$exact_match;
139             }
140 1         15 (\%col2code2exact, \@no_exact_match);
141             }
142              
143             sub _match_register {
144 1440     1440   2687 my ($matches, $code, $this_map, $pk_val2count, $pk_col2pk_value2count, $pk_val2from) = @_;
145             $pk_val2count->{$_}++,
146             $pk_col2pk_value2count->{$code}{$_}++
147 1440         4795 for map $this_map->{$_}, @$matches;
148 1440         3058 for (@$matches) {
149             # track longest matched-value per PK, to tie-break on shortest one
150 858         1275 my $this_pk_val = $this_map->{$_};
151             $pk_val2from->{$this_pk_val} = $_ if
152 858 100 100     3654 length($pk_val2from->{$this_pk_val}||'') < length;
153             }
154             }
155              
156             sub pk_match {
157 216     216 1 841 my ($value, $pk_map, $stopwords) = @_;
158 216         336 my %stopword; @stopword{@$stopwords} = ();
  216         602  
159 216         871 my @val_words = grep length, split /[^A-Za-z]/, $value;
160 216 50       1201 my $val_pat = join '.*', map +(/[A-Z]{2,}/ ? split //, $_ : $_), @val_words;
161 216         964 @val_words = grep !exists $stopword{$_}, map lc, grep length > 2, @val_words;
162 216         419 my (%pk_col2pk_value2count, %pk_val2count, %pk_val2from);
163 216         554 for my $code (keys %$pk_map) {
164 1296         2147 my $this_map = $pk_map->{$code};
165 1     1   817 my @matches = grep /$val_pat/i, keys %$this_map;
  1         32  
  1         16  
  1296         93713  
166 1296         39154 _match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from);
167             }
168 216 100       1247 if ((my @abbrev_parts = grep length, split /\s*[\(,]\s*/, $value) > 1) {
169 12         176 s/(.*?)[^A-Za-z]+(.*?)/$1.*$2/g for @abbrev_parts;
170 12         39 my $suff_pref_pat = join '.*', reverse @abbrev_parts;
171 12         38 for my $code (keys %$pk_map) {
172 72         125 my $this_map = $pk_map->{$code};
173 72         5873 my @matches = grep /$suff_pref_pat/i, keys %$this_map;
174 72         787 _match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count);
175 72         9507 @matches = grep /^$suff_pref_pat/i, keys %$this_map;
176 72         736 _match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count);
177             }
178             }
179 216 50       608 if (!keys %pk_col2pk_value2count) {
180 0         0 for my $code (keys %$pk_map) {
181 0         0 my $this_map = $pk_map->{$code};
182 0         0 for my $word (@val_words) {
183 0         0 my @matches = grep /\b\Q$word\E\b/i, keys %$this_map;
184 0         0 _match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from);
185             }
186             }
187             }
188             my ($best) = sort {
189 216         588 $pk_val2count{$b} <=> $pk_val2count{$a}
190             ||
191 29 50 66     137 length($pk_val2from{$a}) <=> length($pk_val2from{$b})
192             ||
193             $a cmp $b
194             } keys %pk_val2count;
195 216   66     505 my @pk_cols_unique_best = sort grep keys %{ $pk_col2pk_value2count{$_} } == 1 && $pk_col2pk_value2count{$_}{$best}, keys %pk_col2pk_value2count;
196 216         1890 ($best, \@pk_cols_unique_best);
197             }
198              
199             1;
200              
201             =encoding utf8
202              
203             =head1 NAME
204              
205             Data::Prepare - prepare CSV (etc) data for automatic processing
206              
207             =head1 SYNOPSIS
208              
209             use Text::CSV qw(csv);
210             use Data::Prepare qw(
211             cols_non_empty non_unique_cols
212             chop_lines chop_cols header_merge
213             );
214             my $data = csv(in => 'unclean.csv', encoding => "UTF-8");
215             chop_cols([0, 2], $data);
216             header_merge($spec, $data);
217             chop_lines(\@lines, $data); # mutates the data
218              
219             # or:
220             my @non_empty_counts = cols_non_empty($data);
221             print Dumper(non_unique_cols($data));
222              
223             =head1 DESCRIPTION
224              
225             A module with utility functions for turning spreadsheets published for
226             human consumption into ones suitable for automatic processing. Intended
227             to be used by the supplied L script. See that script's
228             documentation for a suggested workflow.
229              
230             All the functions are exportable, none are exported by default.
231             All the C<$data> inputs are an array-ref-of-array-refs.
232              
233             =head1 FUNCTIONS
234              
235             =head2 chop_cols
236              
237             chop_cols([0, 2], $data);
238              
239             Uses C to delete each zero-based column index. The example above
240             deletes the first and third columns.
241              
242             =head2 chop_lines
243              
244             chop_lines([ 0, (-1) x $n ], $data);
245              
246             Uses C to delete each zero-based line index, in the order
247             given. The example above deletes the first, and last C<$n>, lines.
248              
249             =head2 header_merge
250              
251             header_merge([
252             { line => 1, from => 'up', fromspec => 'lastnonblank', to => 'self', matchto => 'HH', do => [ 'overwrite' ] },
253             { line => 1, from => 'self', matchfrom => '.', to => 'down', do => [ 'prepend', ' ' ] },
254             { line => 2, from => 'self', fromspec => 'left', to => 'self', matchto => 'Year', do => [ 'prepend', '/' ] },
255             { line => 2, from => 'self', fromspec => 'literal:Country', to => 'self', tospec => 'index:0', do => [ 'overwrite' ] },
256             ], $data);
257             # Turns:
258             # [
259             # [ '', 'Proportion of households with', '', '', '' ],
260             # [ '', '(HH1)', 'Year', '(HH2)', 'Year' ],
261             # [ '', 'Radio', 'of data', 'TV', 'of data' ],
262             # ]
263             # into (after a further chop_lines to remove the first two):
264             # [
265             # [
266             # 'Country',
267             # 'Proportion of households with Radio', 'Proportion of households with Radio/Year of data',
268             # 'Proportion of households with TV', 'Proportion of households with TV/Year of data'
269             # ]
270             # ]
271              
272             Applies the given transformations to the given data, so you can make the
273             given data have the first row be your desired headers for the columns.
274             As shown in the above example, this does not delete lines so further
275             operations may be needed.
276              
277             Broadly, each hash-ref specifies one operation, which acts on a single
278             (specified) line-number. It scans along that line from left to right,
279             unless C matches C in which case only one operation
280             is done.
281              
282             The above merge operations in YAML format:
283              
284             spec:
285             - do:
286             - overwrite
287             from: up
288             fromspec: lastnonblank
289             line: 2
290             matchto: HH
291             to: self
292             - do:
293             - prepend
294             - ' '
295             from: self
296             line: 2
297             matchfrom: .
298             to: down
299             - do:
300             - prepend
301             - /
302             from: self
303             fromspec: left
304             line: 3
305             matchto: Year
306             to: self
307             - do:
308             - overwrite
309             from: self
310             fromspec: literal:Country
311             line: 3
312             to: self
313             tospec: index:0
314              
315             This turns the first three lines of data excerpted from the supplied example
316             data (shown in CSV with spaces inserted for alignment reasons only):
317              
318             ,Proportion of households with, , ,
319             ,(HH1) ,Year ,(HH2),Year
320             ,Radio ,of data,TV ,of data
321             Belize,58.7 ,2019 ,78.7 ,2019
322              
323             into the following. Note that the first two lines will still be present
324             (not shown), possibly modified, so you will need your chop_lines to
325             remove them. The columns of the third line are shown, one per line,
326             for readability:
327              
328             Country,
329             Proportion of households with Radio,
330             Proportion of households with Radio/Year of data,
331             Proportion of households with TV,
332             Proportion of households with TV/Year of data
333              
334             This achieves a single row of column-headings, with each column-heading
335             being unique, and sufficiently meaningful.
336              
337             =head2 pk_insert
338              
339             pk_insert({
340             column_heading => 'ISO3CODE',
341             local_column => 'Country',
342             pk_column => 'official_name_en',
343             }, $data, $pk_map, $stopwords);
344              
345             In YAML format, this is the same configuration:
346              
347             pk_insert:
348             - files:
349             - examples/CoreHouseholdIndicators.csv
350             spec:
351             column_heading: ISO3CODE
352             local_column: Country
353             pk_column: official_name_en
354             use_fallback: true
355              
356             And the C<$pk_map> made with L, inserts the
357             C in front of the current zero-th column, mapping the
358             value of the C column as looked up from the specified column
359             of the C file, and if C is true, also tries
360             L if no exact match is found. In that case, C
361             must be specified in the configuration
362              
363             =head2 cols_non_empty
364              
365             my @col_non_empty = cols_non_empty($data);
366              
367             In the given data, iterates through all rows and returns a list of
368             quantities of non-blank entries in each column. This can be useful to spot
369             columns with only a couple of entries, which are more usefully chopped.
370              
371             =head2 non_unique_cols
372              
373             my $col2count = non_unique_cols($data);
374              
375             Takes the first row of the given data, and returns a hash-ref mapping
376             any non-unique column-names to the number of times they appear.
377              
378             =head2 key_to_index
379              
380             Given an array-ref (probably the first row of a CSV file, i.e. column
381             headings), returns a hash-ref mapping the cell values to their zero-based
382             index.
383              
384             =head2 make_pk_map
385              
386             my $altcol2value2pk = make_pk_map($data, $pk_colkey, \@other_colkeys);
387              
388             Given C<$data>, the heading of the primary-key column, and an array-ref
389             of headings of alternative key columns, returns a hash-ref mapping each
390             of those alternative key columns (plus the C<$pk_colkey>) to a map from
391             that column's value to the relevant row's primary-key value.
392              
393             This is most conveniently represented in YAML format:
394              
395             pk_spec:
396             file: examples/country-codes.csv
397             primary_key: ISO3166-1-Alpha-3
398             alt_keys:
399             - ISO3166-1-Alpha-2
400             - UNTERM English Short
401             - UNTERM English Formal
402             - official_name_en
403             - CLDR display name
404             stopwords:
405             - islands
406             - china
407             - northern
408              
409             =head2 pk_col_counts
410              
411             my ($colname2potential_key2count, $no_exact_match) = pk_col_counts($data, $pk_map);
412              
413             Given C<$data> and a primary-key (etc) map created by the above, returns
414             a tuple of a hash-ref mapping each column that gave any matches to a
415             further hash-ref mapping each of the potential key columns given above
416             to how many matches it gave, and an array-ref of rows that had no exact
417             matches.
418              
419             =head2 pk_match
420              
421             my ($best, $pk_cols_unique_best) = pk_match($value, $pk_map, $stopwords);
422              
423             Given a value, C<$pk_map>, and an array-ref of case-insensitive stopwords,
424             returns its best match for the right primary-key value, and an array-ref
425             of which primary-key columns in the C<$pk_map> matched the given value
426             exactly once.
427              
428             The latter is useful for analysis purposes to select which primary-key
429             column to use for this data-set.
430              
431             The algorithm used for this best-match:
432              
433             =over
434              
435             =item *
436              
437             Splits the value into words (or where a word is two or more capital
438             letters, letters). The search allows any, or no, text, to occur between
439             these entities. Each configured primary-key column's keys are searched
440             for matches.
441              
442             =item *
443              
444             If there is a separating C<,> or C<(> (as commonly used for
445             abbreviations), splits the value into chunks, reverses them, and then
446             reassembles the chunks as above for a similar search.
447              
448             =item *
449              
450             Only if there were no matches from the previous steps, splits the value
451             into words. Words that are shorter than three characters, or that occur in
452             the stopword list, are omitted. Then each word is searched for as above.
453              
454             =item *
455              
456             "Votes" on which primary-key value got the most matches. Tie-breaks on
457             which primary-key value matched on the shortest key in the relevant
458             C<$pk_map> column, and then on the lexically lowest-valued primary-key
459             value, to ensure stable return values.
460              
461             =back
462              
463             =head1 SEE ALSO
464              
465             L
466              
467             =head1 LICENSE AND COPYRIGHT
468              
469             Copyright (C) Ed J
470              
471             This library is free software; you can redistribute it and/or modify
472             it under the same terms as Perl itself.
473              
474             =cut