File Coverage

blib/lib/Pg/Explain/StringAnonymizer.pm

Criterion	Covered	Total	%
statement	118	118	100.0
branch	16	20	80.0
condition			n/a
subroutine	22	22	100.0
pod	6	6	100.0
total	162	166	97.5

line	stmt	bran	sub	pod	time	code
1						package Pg::Explain::StringAnonymizer;
2
3						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
4	73		73		71028	use v5.18;
	73				352
5	73		73		344	use strict;
	73				120
	73				1274
6	73		73		283	use warnings;
	73				125
	73				2027
7	73		73		295	use warnings qw( FATAL utf8 );
	73				142
	73				2489
8	73		73		832	use utf8;
	73				133
	73				410
9	73		73		1979	use open qw( :std :utf8 );
	73				1128
	73				432
10	73		73		9484	use Unicode::Normalize qw( NFC );
	73				2035
	73				3631
11	73		73		985	use Unicode::Collate;
	73				7828
	73				1684
12	73		73		813	use Encode qw( decode );
	73				8845
	73				3789
13
14	1		1		7	if ( grep /\P{ASCII}/ => @ARGV ) {
	1				2
	1				8
15						@ARGV = map { decode( 'UTF-8', $_ ) } @ARGV;
16						}
17
18						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
19
20	73		73		30866	use Carp;
	73				127
	73				3668
21	73		73		33610	use Digest::SHA qw( sha1 );
	73				166841
	73				67754
22
23						=head1 NAME
24
25						Pg::Explain::StringAnonymizer - Class to anonymize sets of strings
26
27						=head1 VERSION
28
29						Version 2.2
30
31						=cut
32
33						our $VERSION = '2.2';
34
35						=head1 SYNOPSIS
36
37						This module provides a way to turn defined set of strings into anonymized version of it, that has 4 properties:
38
39						=over
40
41						=item * the same original string should give the same output string (within the same input set)
42
43						=item * strings shouldn't be very long
44
45						=item * it shouldn't be possible to reverse the operation
46
47						=item * generated strings should be easy to read, and easy to distinguish between themselves.
48
49						=back
50
51						Points first and third can be done easily with some hashing function (md5, sha), but generated hashes violate fourth point, and sometimes also second.
52
53						Example of usage:
54
55						my $anonymizer = Pg::Explain::StringAnonymizer->new();
56						$anonymizer->add( 'a', 'b', 'c');
57						$anonymizer->add( 'depesz' );
58						$anonymizer->add( [ "any strings, "are possible" ] );
59						$anonymizer->finalize();
60
61						print $anonymizer->anonymized( 'a' ), "\n";
62
63						my $full_dictionary = $anonymizer->anonymization_dictionary();
64
65						=head1 METHODS
66
67						=head2 new
68
69						Object constructor, doesn't take any arguments.
70
71						=cut
72
73						sub new {
74	16		16	1	725	my $class = shift;
75	16				46	my $self = bless {}, $class;
76	16				74	$self->{ 'strings' } = {};
77	16				50	return $self;
78						}
79
80						=head2 add
81
82						Adds new string(s) to anonymization list.
83
84						Strings can be given either as list of ArrayRef.
85
86						It is important to note, that one cannot add() more elements to anonymized set after finalization (call to finalize() method).
87
88						If such call will be made (add() after finalize()) it will raise exception.
89
90						=cut
91
92						sub add {
93	657		657	1	2507	my $self = shift;
94	657	100			1147	croak( "Cannot run ->add() after finalization.\n" ) if $self->{ 'is_finalized' };
95
96	656				1143	my @input = @_;
97	656	100			1440	@input = @{ $input[ 0 ] } if 'ARRAY' eq ref( $input[ 0 ] );
	3				7
98	656				991	for my $string ( @input ) {
99	677	100			1523	next if $self->{ 'strings' }->{ $string };
100	116				266	$self->{ 'strings' }->{ $string } = $self->_hash( $string );
101						}
102	656				1626	return;
103						}
104
105						=head2 finalize
106
107						Finalizes string set creation, and creates anonymized versions.
108
109						It has to be called after some number of add() calls, so that it will have something to work on.
110
111						After running finalize() one cannot add() more string.
112
113						Also, before finalize() you cannot run anonymized() or anonymization_dictionary() methods.
114
115						=cut
116
117						sub finalize {
118	16		16	1	9081	my $self = shift;
119	16	50			75	return if $self->{ 'is_finalized' };
120	16				37	$self->{ 'is_finalized' } = 1;
121
122						$self->_make_prefixes(
123	16				35	'keys' => [ keys %{ $self->{ 'strings' } } ],
	16				144
124						'level' => 0,
125						);
126
127	16				69	$self->_stringify();
128
129	16				38	my @keys_sorted = sort { length( $b ) <=> length( $a ) } keys %{ $self->{ 'strings' } };
	355				483
	16				90
130	16				53	$self->{ 'keys_re' } = join '\|', map { qr{\Q$_\E} } @keys_sorted;
	116				862
131
132	16				119	return;
133						}
134
135						=head2 anonymized
136
137						Returns anonymized version of given string, or undef if the string wasn't previously added to anonymization set.
138
139						If it will be called before finalize() it will raise exception.
140
141						=cut
142
143						sub anonymized {
144	672		672	1	10084	my $self = shift;
145	672	100			1301	croak( "Cannot run ->anonymized() before finalization.\n" ) unless $self->{ 'is_finalized' };
146	671				894	my $input = shift;
147	671				2389	return $self->{ 'strings' }->{ $input };
148						}
149
150						=head2 anonymize_text
151
152						Anonymize given text using loaded dictionary of substiturions.
153
154						=cut
155
156						sub anonymize_text {
157	1		1	1	3	my $self = shift;
158	1				3	my $text = shift;
159	1				2	my $re = $self->{ 'keys_re' };
160	1				111	$text =~ s{(\b\|\s)($re)(\b\|\s)}{ $1 . $self->{'strings'}->{$2} . $3 }mge;
	29				145
161	1				23	return $text;
162						}
163
164						=head2 anonymization_dictionary
165
166						Returns hash reference containing all input strings and their anonymized versions, like:
167
168						{
169						'original1' => 'anon1',
170						'original2' => 'anon2',
171						...
172						'originalN' => 'anonN',
173						}
174
175						If it will be called before finalize() it will raise exception.
176
177						=cut
178
179						sub anonymization_dictionary {
180	2		2	1	955	my $self = shift;
181	2	100			14	croak( "Cannot run ->anonymization_dictionary() before finalization.\n" ) unless $self->{ 'is_finalized' };
182	1				3	return $self->{ 'strings' };
183						}
184
185						=head1 INTERNAL METHODS
186
187						=head2 _hash
188
189						Converts given string into array of 32 integers in range 0..31.
190
191						This is done by taking sha1 checksum of string, splitting it into 32 5-bit
192						long "segments", and transposing each segment into integer.
193
194						=cut
195
196						sub _hash {
197	116		116		158	my $self = shift;
198	116				158	my $input = shift;
199
200	116				675	my $hash = sha1( $input );
201
202						# sha1() (20 bytes) to 32 integers (0..31) transformation thanks to
203						# mauke and LeoNerd on #perl on irc.freenode.net
204
205	116				420	my $binary_hash = unpack( "B*", $hash );
206	116				860	my @segments = unpack "(a5)*", $binary_hash;
207	116				196	return [ map { oct "0b$_" } @segments ];
	3712				5144
208						}
209
210						=head2 _word
211
212						Returns n-th word from number-to-word translation dictionary.
213
214						=cut
215
216						sub _word {
217	184		184		200	my $self = shift;
218	184				195	my $n = shift;
219	184	50			289	$n = 0 unless defined $n;
220	184	50			290	$n = 0 if $n < 0;
221	184	50			257	$n = 31 if $n > 31;
222	184				617	my @words = qw(
223						alpha bravo charlie delta
224						echo foxtrot golf hotel
225						india juliet kilo lima
226						mike november oscar papa
227						quebec romeo sierra tango
228						uniform victor whiskey xray
229						yankee zulu two three
230						four five six seven
231						);
232	184				451	return $words[ $n ];
233						}
234
235						=head2 _make_prefixes
236
237						Scan given keys, and changes their values (in ->{'strings'} hash) to
238						shortest unique prefix.
239
240						=cut
241
242						sub _make_prefixes {
243	44		44		63	my $self = shift;
244	44				131	my %args = @_;
245
246	44				68	my $S = $self->{ 'strings' };
247
248	44				86	my %unique_ints = ();
249
250	44				55	for my $key ( @{ $args{ 'keys' } } ) {
	44				83
251	184				269	my $KA = $S->{ $key };
252	184				219	my $interesting_int = $KA->[ $args{ 'level' } ];
253	184				321	$unique_ints{ $interesting_int }++;
254						}
255
256						# At this moment, we know how many times given int happened at given
257						# level, so we can make sensible decisions
258
259	44				103	my %to_redo = ();
260
261	44				57	for my $key ( @{ $args{ 'keys' } } ) {
	44				85
262	184				250	my $KA = $S->{ $key };
263	184				226	my $interesting_int = $KA->[ $args{ 'level' } ];
264	184	100			302	if ( 1 == $unique_ints{ $interesting_int } ) {
265	116				135	splice @{ $KA }, 1 + $args{ 'level' };
	116				236
266	116				170	next;
267						}
268	68				74	push @{ $to_redo{ $interesting_int } }, $key;
	68				122
269						}
270
271						# In to_redo, we have blocks of keys, that share prefix (up to given
272						# level), so they have to be further processed.
273
274	44				125	for my $key_group ( values %to_redo ) {
275						$self->_make_prefixes(
276						'keys' => $key_group,
277	28				74	'level' => $args{ 'level' } + 1,
278						);
279						}
280
281	44				122	return;
282						}
283
284						=head2 _stringify
285
286						Converts arrays of ints (prefixes for hashed words) into strings
287
288						=cut
289
290						sub _stringify {
291	16		16		31	my $self = shift;
292
293	16				25	for my $key ( keys %{ $self->{ 'strings' } } ) {
	16				64
294	116				193	my $ints = $self->{ 'strings' }->{ $key };
295	116				133	my @words = map { $self->_word( $_ ) } @{ $ints };
	184				264
	116				183
296	116				315	$self->{ 'strings' }->{ $key } = join( '_', @words );
297						}
298						}
299
300						=head1 AUTHOR
301
302						hubert depesz lubaczewski, C<< >>
303
304						=head1 BUGS
305
306						Please report any bugs or feature requests to C.
307
308						=head1 SUPPORT
309
310						You can find documentation for this module with the perldoc command.
311
312						perldoc Pg::Explain::StringAnonymizer
313
314						=head1 COPYRIGHT & LICENSE
315
316						Copyright 2008-2021 hubert depesz lubaczewski, all rights reserved.
317
318						This program is free software; you can redistribute it and/or modify it
319						under the same terms as Perl itself.
320
321
322						=cut
323
324						1; # End of Pg::Explain::StringAnonymizer