File Coverage

blib/lib/Data/Prepare.pm

Criterion	Covered	Total	%
statement	118	123	95.9
branch	38	46	82.6
condition	26	29	89.6
subroutine	15	15	100.0
pod	10	10	100.0
total	207	223	92.8

line	stmt	bran	cond	sub	pod	time	code
1							package Data::Prepare;
2
3	1			1		112632	use strict;
	1					12
	1					30
4	1			1		5	use warnings;
	1					2
	1					27
5	1			1		5	use Exporter 'import';
	1					2
	1					2045
6
7							our $VERSION = '0.004';
8							our @EXPORT_OK = qw(
9							cols_non_empty
10							non_unique_cols
11							key_to_index
12							make_pk_map
13							pk_col_counts
14							pk_match
15							chop_lines
16							chop_cols
17							header_merge
18							pk_insert
19							);
20
21							sub chop_lines {
22	4			4	1	26473	my ($choplines, $data) = @_;
23	4					48	splice @$data, $_, 1 for @$choplines;
24							}
25
26							sub chop_cols {
27	1			1	1	4	my ($chopcols, $data) = @_;
28	1					8	for my $c (sort {$b <=> $a} @$chopcols) {
	28					37
29	15					922	splice @$_, $c, 1 for @$data;
30							}
31							}
32
33							my %where2offset = (up => -1, self => 0, down => 1);
34							sub header_merge {
35	3			3	1	12	my ($merge_spec, $data) = @_;
36	3					11	for my $spec (@$merge_spec) {
37	10					37	my ($do, $l, $matchfrom, $matchto) = @$spec{qw(do line matchfrom matchto)};
38	10					52	my ($from_row, $to_row) = map $data->[$l + $where2offset{$spec->{$_}}], qw(from to);
39	10		100			42	my ($kept, $fromspec, $justone, $which_index) = ('', ($spec->{fromspec} \|\| ''));
40	10	100	100			58	if (($spec->{tospec} \|\| '') =~ /^index:(\d+)/) {
41	4					8	$justone = 1;
42	4					9	$which_index = $1;
43							}
44	10					29	for my $i (0..$#$to_row) {
45	76		100			223	$kept = $from_row->[$i] \|\| $kept;
46	76	100	100			326	next if defined $matchto and $to_row->[$i] !~ /$matchto/;
47	50	100	100			156	next if defined $matchfrom and $from_row->[$i] !~ /$matchfrom/;
48	48	100				133	my $basic_from =
		100
		100
		100
49							$fromspec eq 'lastnonblank' ? $kept :
50							$fromspec eq 'left' ? $from_row->[$i - 1] :
51							$fromspec =~ /^literal:(.*)/ ? $1 :
52							$from_row->[$justone ? $which_index : $i];
53	48	100				84	my $basic_to = $to_row->[$justone ? $which_index : $i];
54	48	0				124	my $what =
		50
		100
55							$do->[0] eq 'overwrite' ? $basic_from :
56							$do->[0] eq 'prepend' ? $basic_from . $do->[1] . $basic_to :
57							$do->[0] eq 'append' ? $basic_to . $do->[1] . $basic_from :
58							die "Unknown action '$do->[0]'";
59	48	100				78	if ($justone) {
60	4					7	$to_row->[$which_index] = $what;
61	4					16	last;
62							} else {
63	44					79	$to_row->[$i] = $what;
64							}
65							}
66							}
67							}
68
69							sub pk_insert {
70	1			1	1	6	my ($spec, $data, $pk_map, $stopwords) = @_;
71	1					5	my ($ch, $lc, $pkc, $fb) = (@$spec{qw(column_heading local_column pk_column use_fallback)});
72	1					7	my $key_index = key_to_index($data->[0])->{$lc};
73	1	50				8	die "undef index for key '$lc'" if !defined $key_index;
74	1					3	unshift @{ $data->[0] }, $ch;
	1					4
75	1					4	my $exact_map = $pk_map->{$pkc};
76	1					12	for my $row (@$data[ 1..$#$data ]) {
77	196					367	my $key_val = $row->[ $key_index ];
78	196					337	my $pkv = $exact_map->{ $key_val };
79	196	100	66			473	unshift(@$row, $pkv), next if defined $pkv or !$fb;
80	20					47	($pkv) = pk_match($key_val, $pk_map, $stopwords);
81	20					64	unshift(@$row, $pkv);
82							}
83							}
84
85							sub cols_non_empty {
86	1			1	1	17777	my ($data) = @_;
87	1					3	my @col_non_empty;
88	1					3	for my $line (@$data) {
89	199		100			4538	$col_non_empty[$_] \|\|= 0 for 0..$#$line;
90	199					1566	$col_non_empty[$_]++ for grep length $line->[$_], 0..$#$line;
91							}
92	1					11	@col_non_empty;
93							}
94
95							sub non_unique_cols {
96	1			1	1	3	my ($data) = @_;
97	1					3	my ($line, %col2count) = $data->[0];
98	1					6	$col2count{$_}++ for @$line;
99	1					7	delete @col2count{ grep $col2count{$_} == 1, keys %col2count };
100	1					4	\%col2count;
101							}
102
103							sub key_to_index {
104	4			4	1	13	my ($row) = @_;
105	4					143	+{ map +($row->[$_] => $_), 0..$#$row };
106							}
107
108							sub make_pk_map {
109	1			1	1	16685	my ($data, $pk_colkey, $other_colkeys) = @_;
110	1					6	my $k2i = key_to_index($data->[0]);
111	1					8	my @invalid = grep !defined $k2i->{$_}, $pk_colkey, @$other_colkeys;
112	1	50				5	die "Invalid keys (@invalid)" if @invalid;
113	1					2	my $pk_colnum = $k2i->{$pk_colkey};
114	1					2	my %altcol2value2pk;
115	1					5	for my $i (1..$#$data) {
116	250					324	my $row = $data->[$i];
117	250	100				434	next if !length(my $pk_val = $row->[$pk_colnum]);
118	249					349	for my $alt_k ($pk_colkey, @$other_colkeys) {
119	1494	100				2867	next if !length(my $alt_v = $row->[$k2i->{$alt_k}]);
120	1387					2872	$altcol2value2pk{$alt_k}{$alt_v} = $pk_val;
121							}
122							}
123	1					12	\%altcol2value2pk;
124							}
125
126							sub pk_col_counts {
127	1			1	1	5388	my ($data, $pk_map) = @_;
128	1					4	my $k2i = key_to_index($data->[0]);
129	1					3	my (%col2code2exact, @no_exact_match);
130	1					5	for my $i (1..$#$data) {
131	196					322	my ($row, $exact_match) = $data->[$i];
132	196					527	for my $possible_col (keys %$k2i) {
133	3724					5457	my $val = $row->[ $k2i->{$possible_col} ];
134	3724					10590	my @match_codes_yes = grep exists $pk_map->{$_}{$val}, keys %$pk_map;
135	3724					6157	$col2code2exact{$possible_col}{$_}++ for @match_codes_yes;
136	3724		100			7267	$exact_match \|\|= @match_codes_yes;
137							}
138	196	100				498	push @no_exact_match, $row if !$exact_match;
139							}
140	1					15	(\%col2code2exact, \@no_exact_match);
141							}
142
143							sub _match_register {
144	1440			1440		2687	my ($matches, $code, $this_map, $pk_val2count, $pk_col2pk_value2count, $pk_val2from) = @_;
145							$pk_val2count->{$_}++,
146							$pk_col2pk_value2count->{$code}{$_}++
147	1440					4795	for map $this_map->{$_}, @$matches;
148	1440					3058	for (@$matches) {
149							# track longest matched-value per PK, to tie-break on shortest one
150	858					1275	my $this_pk_val = $this_map->{$_};
151							$pk_val2from->{$this_pk_val} = $_ if
152	858	100	100			3654	length($pk_val2from->{$this_pk_val}\|\|'') < length;
153							}
154							}
155
156							sub pk_match {
157	216			216	1	841	my ($value, $pk_map, $stopwords) = @_;
158	216					336	my %stopword; @stopword{@$stopwords} = ();
	216					602
159	216					871	my @val_words = grep length, split /[^A-Za-z]/, $value;
160	216	50				1201	my $val_pat = join '.*', map +(/[A-Z]{2,}/ ? split //, $_ : $_), @val_words;
161	216					964	@val_words = grep !exists $stopword{$_}, map lc, grep length > 2, @val_words;
162	216					419	my (%pk_col2pk_value2count, %pk_val2count, %pk_val2from);
163	216					554	for my $code (keys %$pk_map) {
164	1296					2147	my $this_map = $pk_map->{$code};
165	1			1		817	my @matches = grep /$val_pat/i, keys %$this_map;
	1					32
	1					16
	1296					93713
166	1296					39154	_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from);
167							}
168	216	100				1247	if ((my @abbrev_parts = grep length, split /\s[\(,]\s/, $value) > 1) {
169	12					176	s/(.?)[^A-Za-z]+(.?)/$1.*$2/g for @abbrev_parts;
170	12					39	my $suff_pref_pat = join '.*', reverse @abbrev_parts;
171	12					38	for my $code (keys %$pk_map) {
172	72					125	my $this_map = $pk_map->{$code};
173	72					5873	my @matches = grep /$suff_pref_pat/i, keys %$this_map;
174	72					787	_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count);
175	72					9507	@matches = grep /^$suff_pref_pat/i, keys %$this_map;
176	72					736	_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count);
177							}
178							}
179	216	50				608	if (!keys %pk_col2pk_value2count) {
180	0					0	for my $code (keys %$pk_map) {
181	0					0	my $this_map = $pk_map->{$code};
182	0					0	for my $word (@val_words) {
183	0					0	my @matches = grep /\b\Q$word\E\b/i, keys %$this_map;
184	0					0	_match_register(\@matches, $code, $this_map, \%pk_val2count, \%pk_col2pk_value2count, \%pk_val2from);
185							}
186							}
187							}
188							my ($best) = sort {
189	216					588	$pk_val2count{$b} <=> $pk_val2count{$a}
190							\|\|
191	29	50	66			137	length($pk_val2from{$a}) <=> length($pk_val2from{$b})
192							\|\|
193							$a cmp $b
194							} keys %pk_val2count;
195	216		66			505	my @pk_cols_unique_best = sort grep keys %{ $pk_col2pk_value2count{$_} } == 1 && $pk_col2pk_value2count{$_}{$best}, keys %pk_col2pk_value2count;
196	216					1890	($best, \@pk_cols_unique_best);
197							}
198
199							1;
200
201							=encoding utf8
202
203							=head1 NAME
204
205							Data::Prepare - prepare CSV (etc) data for automatic processing
206
207							=head1 SYNOPSIS
208
209							use Text::CSV qw(csv);
210							use Data::Prepare qw(
211							cols_non_empty non_unique_cols
212							chop_lines chop_cols header_merge
213							);
214							my $data = csv(in => 'unclean.csv', encoding => "UTF-8");
215							chop_cols([0, 2], $data);
216							header_merge($spec, $data);
217							chop_lines(\@lines, $data); # mutates the data
218
219							# or:
220							my @non_empty_counts = cols_non_empty($data);
221							print Dumper(non_unique_cols($data));
222
223							=head1 DESCRIPTION
224
225							A module with utility functions for turning spreadsheets published for
226							human consumption into ones suitable for automatic processing. Intended
227							to be used by the supplied L script. See that script's
228							documentation for a suggested workflow.
229
230							All the functions are exportable, none are exported by default.
231							All the C<$data> inputs are an array-ref-of-array-refs.
232
233							=head1 FUNCTIONS
234
235							=head2 chop_cols
236
237							chop_cols([0, 2], $data);
238
239							Uses C to delete each zero-based column index. The example above
240							deletes the first and third columns.
241
242							=head2 chop_lines
243
244							chop_lines([ 0, (-1) x $n ], $data);
245
246							Uses C to delete each zero-based line index, in the order
247							given. The example above deletes the first, and last C<$n>, lines.
248
249							=head2 header_merge
250
251							header_merge([
252							{ line => 1, from => 'up', fromspec => 'lastnonblank', to => 'self', matchto => 'HH', do => [ 'overwrite' ] },
253							{ line => 1, from => 'self', matchfrom => '.', to => 'down', do => [ 'prepend', ' ' ] },
254							{ line => 2, from => 'self', fromspec => 'left', to => 'self', matchto => 'Year', do => [ 'prepend', '/' ] },
255							{ line => 2, from => 'self', fromspec => 'literal:Country', to => 'self', tospec => 'index:0', do => [ 'overwrite' ] },
256							], $data);
257							# Turns:
258							# [
259							# [ '', 'Proportion of households with', '', '', '' ],
260							# [ '', '(HH1)', 'Year', '(HH2)', 'Year' ],
261							# [ '', 'Radio', 'of data', 'TV', 'of data' ],
262							# ]
263							# into (after a further chop_lines to remove the first two):
264							# [
265							# [
266							# 'Country',
267							# 'Proportion of households with Radio', 'Proportion of households with Radio/Year of data',
268							# 'Proportion of households with TV', 'Proportion of households with TV/Year of data'
269							# ]
270							# ]
271
272							Applies the given transformations to the given data, so you can make the
273							given data have the first row be your desired headers for the columns.
274							As shown in the above example, this does not delete lines so further
275							operations may be needed.
276
277							Broadly, each hash-ref specifies one operation, which acts on a single
278							(specified) line-number. It scans along that line from left to right,
279							unless C matches C in which case only one operation
280							is done.
281
282							The above merge operations in YAML format:
283
284							spec:
285							- do:
286							- overwrite
287							from: up
288							fromspec: lastnonblank
289							line: 2
290							matchto: HH
291							to: self
292							- do:
293							- prepend
294							- ' '
295							from: self
296							line: 2
297							matchfrom: .
298							to: down
299							- do:
300							- prepend
301							- /
302							from: self
303							fromspec: left
304							line: 3
305							matchto: Year
306							to: self
307							- do:
308							- overwrite
309							from: self
310							fromspec: literal:Country
311							line: 3
312							to: self
313							tospec: index:0
314
315							This turns the first three lines of data excerpted from the supplied example
316							data (shown in CSV with spaces inserted for alignment reasons only):
317
318							,Proportion of households with, , ,
319							,(HH1) ,Year ,(HH2),Year
320							,Radio ,of data,TV ,of data
321							Belize,58.7 ,2019 ,78.7 ,2019
322
323							into the following. Note that the first two lines will still be present
324							(not shown), possibly modified, so you will need your chop_lines to
325							remove them. The columns of the third line are shown, one per line,
326							for readability:
327
328							Country,
329							Proportion of households with Radio,
330							Proportion of households with Radio/Year of data,
331							Proportion of households with TV,
332							Proportion of households with TV/Year of data
333
334							This achieves a single row of column-headings, with each column-heading
335							being unique, and sufficiently meaningful.
336
337							=head2 pk_insert
338
339							pk_insert({
340							column_heading => 'ISO3CODE',
341							local_column => 'Country',
342							pk_column => 'official_name_en',
343							}, $data, $pk_map, $stopwords);
344
345							In YAML format, this is the same configuration:
346
347							pk_insert:
348							- files:
349							- examples/CoreHouseholdIndicators.csv
350							spec:
351							column_heading: ISO3CODE
352							local_column: Country
353							pk_column: official_name_en
354							use_fallback: true
355
356							And the C<$pk_map> made with L, inserts the
357							C in front of the current zero-th column, mapping the
358							value of the C column as looked up from the specified column
359							of the C file, and if C is true, also tries
360							L if no exact match is found. In that case, C
361							must be specified in the configuration
362
363							=head2 cols_non_empty
364
365							my @col_non_empty = cols_non_empty($data);
366
367							In the given data, iterates through all rows and returns a list of
368							quantities of non-blank entries in each column. This can be useful to spot
369							columns with only a couple of entries, which are more usefully chopped.
370
371							=head2 non_unique_cols
372
373							my $col2count = non_unique_cols($data);
374
375							Takes the first row of the given data, and returns a hash-ref mapping
376							any non-unique column-names to the number of times they appear.
377
378							=head2 key_to_index
379
380							Given an array-ref (probably the first row of a CSV file, i.e. column
381							headings), returns a hash-ref mapping the cell values to their zero-based
382							index.
383
384							=head2 make_pk_map
385
386							my $altcol2value2pk = make_pk_map($data, $pk_colkey, \@other_colkeys);
387
388							Given C<$data>, the heading of the primary-key column, and an array-ref
389							of headings of alternative key columns, returns a hash-ref mapping each
390							of those alternative key columns (plus the C<$pk_colkey>) to a map from
391							that column's value to the relevant row's primary-key value.
392
393							This is most conveniently represented in YAML format:
394
395							pk_spec:
396							file: examples/country-codes.csv
397							primary_key: ISO3166-1-Alpha-3
398							alt_keys:
399							- ISO3166-1-Alpha-2
400							- UNTERM English Short
401							- UNTERM English Formal
402							- official_name_en
403							- CLDR display name
404							stopwords:
405							- islands
406							- china
407							- northern
408
409							=head2 pk_col_counts
410
411							my ($colname2potential_key2count, $no_exact_match) = pk_col_counts($data, $pk_map);
412
413							Given C<$data> and a primary-key (etc) map created by the above, returns
414							a tuple of a hash-ref mapping each column that gave any matches to a
415							further hash-ref mapping each of the potential key columns given above
416							to how many matches it gave, and an array-ref of rows that had no exact
417							matches.
418
419							=head2 pk_match
420
421							my ($best, $pk_cols_unique_best) = pk_match($value, $pk_map, $stopwords);
422
423							Given a value, C<$pk_map>, and an array-ref of case-insensitive stopwords,
424							returns its best match for the right primary-key value, and an array-ref
425							of which primary-key columns in the C<$pk_map> matched the given value
426							exactly once.
427
428							The latter is useful for analysis purposes to select which primary-key
429							column to use for this data-set.
430
431							The algorithm used for this best-match:
432
433							=over
434
435							=item *
436
437							Splits the value into words (or where a word is two or more capital
438							letters, letters). The search allows any, or no, text, to occur between
439							these entities. Each configured primary-key column's keys are searched
440							for matches.
441
442							=item *
443
444							If there is a separating C<,> or C<(> (as commonly used for
445							abbreviations), splits the value into chunks, reverses them, and then
446							reassembles the chunks as above for a similar search.
447
448							=item *
449
450							Only if there were no matches from the previous steps, splits the value
451							into words. Words that are shorter than three characters, or that occur in
452							the stopword list, are omitted. Then each word is searched for as above.
453
454							=item *
455
456							"Votes" on which primary-key value got the most matches. Tie-breaks on
457							which primary-key value matched on the shortest key in the relevant
458							C<$pk_map> column, and then on the lexically lowest-valued primary-key
459							value, to ensure stable return values.
460
461							=back
462
463							=head1 SEE ALSO
464
465							L
466
467							=head1 LICENSE AND COPYRIGHT
468
469							Copyright (C) Ed J
470
471							This library is free software; you can redistribute it and/or modify
472							it under the same terms as Perl itself.
473
474							=cut