File Coverage

blib/lib/Pg/Explain/StringAnonymizer.pm

Criterion	Covered	Total	%
statement	118	118	100.0
branch	16	20	80.0
condition			n/a
subroutine	22	22	100.0
pod	6	6	100.0
total	162	166	97.5

line	stmt	bran	sub	pod	time	code
1						package Pg::Explain::StringAnonymizer;
2
3						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
4	75		75		84180	use v5.18;
	75				461
5	75		75		482	use strict;
	75				167
	75				1495
6	75		75		382	use warnings;
	75				146
	75				2441
7	75		75		374	use warnings qw( FATAL utf8 );
	75				203
	75				3198
8	75		75		969	use utf8;
	75				165
	75				551
9	75		75		2396	use open qw( :std :utf8 );
	75				1411
	75				607
10	75		75		12445	use Unicode::Normalize qw( NFC );
	75				2252
	75				4641
11	75		75		1210	use Unicode::Collate;
	75				9278
	75				2105
12	75		75		929	use Encode qw( decode );
	75				10397
	75				4649
13
14	1		1		7	if ( grep /\P{ASCII}/ => @ARGV ) {
	1				3
	1				12
15						@ARGV = map { decode( 'UTF-8', $_ ) } @ARGV;
16						}
17
18						# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
19
20	75		75		36427	use Carp;
	75				152
	75				4641
21	75		75		39135	use Digest::SHA qw( sha1 );
	75				203725
	75				84192
22
23						=head1 NAME
24
25						Pg::Explain::StringAnonymizer - Class to anonymize sets of strings
26
27						=head1 VERSION
28
29						Version 2.4
30
31						=cut
32
33						our $VERSION = '2.4';
34
35						=head1 SYNOPSIS
36
37						This module provides a way to turn defined set of strings into anonymized version of it, that has 4 properties:
38
39						=over
40
41						=item * the same original string should give the same output string (within the same input set)
42
43						=item * strings shouldn't be very long
44
45						=item * it shouldn't be possible to reverse the operation
46
47						=item * generated strings should be easy to read, and easy to distinguish between themselves.
48
49						=back
50
51						Points first and third can be done easily with some hashing function (md5, sha), but generated hashes violate fourth point, and sometimes also second.
52
53						Example of usage:
54
55						my $anonymizer = Pg::Explain::StringAnonymizer->new();
56						$anonymizer->add( 'a', 'b', 'c');
57						$anonymizer->add( 'depesz' );
58						$anonymizer->add( [ "any strings, "are possible" ] );
59						$anonymizer->finalize();
60
61						print $anonymizer->anonymized( 'a' ), "\n";
62
63						my $full_dictionary = $anonymizer->anonymization_dictionary();
64
65						=head1 METHODS
66
67						=head2 new
68
69						Object constructor, doesn't take any arguments.
70
71						=cut
72
73						sub new {
74	17		17	1	836	my $class = shift;
75	17				67	my $self = bless {}, $class;
76	17				95	$self->{ 'strings' } = {};
77	17				62	return $self;
78						}
79
80						=head2 add
81
82						Adds new string(s) to anonymization list.
83
84						Strings can be given either as list of ArrayRef.
85
86						It is important to note, that one cannot add() more elements to anonymized set after finalization (call to finalize() method).
87
88						If such call will be made (add() after finalize()) it will raise exception.
89
90						=cut
91
92						sub add {
93	677		677	1	2999	my $self = shift;
94	677	100			1504	croak( "Cannot run ->add() after finalization.\n" ) if $self->{ 'is_finalized' };
95
96	676				1482	my @input = @_;
97	676	100			1384	@input = @{ $input[ 0 ] } if 'ARRAY' eq ref( $input[ 0 ] );
	3				9
98	676				1261	for my $string ( @input ) {
99	699	100			3476	next if $self->{ 'strings' }->{ $string };
100	130				364	$self->{ 'strings' }->{ $string } = $self->_hash( $string );
101						}
102	676				2000	return;
103						}
104
105						=head2 finalize
106
107						Finalizes string set creation, and creates anonymized versions.
108
109						It has to be called after some number of add() calls, so that it will have something to work on.
110
111						After running finalize() one cannot add() more string.
112
113						Also, before finalize() you cannot run anonymized() or anonymization_dictionary() methods.
114
115						=cut
116
117						sub finalize {
118	17		17	1	10987	my $self = shift;
119	17	50			72	return if $self->{ 'is_finalized' };
120	17				63	$self->{ 'is_finalized' } = 1;
121
122						$self->_make_prefixes(
123	17				49	'keys' => [ keys %{ $self->{ 'strings' } } ],
	17				228
124						'level' => 0,
125						);
126
127	17				104	$self->_stringify();
128
129	17				54	my @keys_sorted = sort { length( $b ) <=> length( $a ) } keys %{ $self->{ 'strings' } };
	382				616
	17				155
130	17				71	$self->{ 'keys_re' } = join '\|', map { qr{\Q$_\E} } @keys_sorted;
	130				1127
131
132	17				135	return;
133						}
134
135						=head2 anonymized
136
137						Returns anonymized version of given string, or undef if the string wasn't previously added to anonymization set.
138
139						If it will be called before finalize() it will raise exception.
140
141						=cut
142
143						sub anonymized {
144	694		694	1	11926	my $self = shift;
145	694	100			1635	croak( "Cannot run ->anonymized() before finalization.\n" ) unless $self->{ 'is_finalized' };
146	693				1105	my $input = shift;
147	693				3035	return $self->{ 'strings' }->{ $input };
148						}
149
150						=head2 anonymize_text
151
152						Anonymize given text using loaded dictionary of substiturions.
153
154						=cut
155
156						sub anonymize_text {
157	3		3	1	7	my $self = shift;
158	3				7	my $text = shift;
159	3				6	my $re = $self->{ 'keys_re' };
160	3				213	$text =~ s{(\b\|\s)($re)(\b\|\s)}{ $1 . $self->{'strings'}->{$2} . $3 }mge;
	30				179
161	3				27	return $text;
162						}
163
164						=head2 anonymization_dictionary
165
166						Returns hash reference containing all input strings and their anonymized versions, like:
167
168						{
169						'original1' => 'anon1',
170						'original2' => 'anon2',
171						...
172						'originalN' => 'anonN',
173						}
174
175						If it will be called before finalize() it will raise exception.
176
177						=cut
178
179						sub anonymization_dictionary {
180	2		2	1	1138	my $self = shift;
181	2	100			16	croak( "Cannot run ->anonymization_dictionary() before finalization.\n" ) unless $self->{ 'is_finalized' };
182	1				4	return $self->{ 'strings' };
183						}
184
185						=head1 INTERNAL METHODS
186
187						=head2 _hash
188
189						Converts given string into array of 32 integers in range 0..31.
190
191						This is done by taking sha1 checksum of string, splitting it into 32 5-bit
192						long "segments", and transposing each segment into integer.
193
194						=cut
195
196						sub _hash {
197	130		130		214	my $self = shift;
198	130				225	my $input = shift;
199
200	130				847	my $hash = sha1( $input );
201
202						# sha1() (20 bytes) to 32 integers (0..31) transformation thanks to
203						# mauke and LeoNerd on #perl on irc.freenode.net
204
205	130				598	my $binary_hash = unpack( "B*", $hash );
206	130				1174	my @segments = unpack "(a5)*", $binary_hash;
207	130				301	return [ map { oct "0b$_" } @segments ];
	4160				7163
208						}
209
210						=head2 _word
211
212						Returns n-th word from number-to-word translation dictionary.
213
214						=cut
215
216						sub _word {
217	202		202		286	my $self = shift;
218	202				263	my $n = shift;
219	202	50			368	$n = 0 unless defined $n;
220	202	50			374	$n = 0 if $n < 0;
221	202	50			351	$n = 31 if $n > 31;
222	202				826	my @words = qw(
223						alpha bravo charlie delta
224						echo foxtrot golf hotel
225						india juliet kilo lima
226						mike november oscar papa
227						quebec romeo sierra tango
228						uniform victor whiskey xray
229						yankee zulu two three
230						four five six seven
231						);
232	202				627	return $words[ $n ];
233						}
234
235						=head2 _make_prefixes
236
237						Scan given keys, and changes their values (in ->{'strings'} hash) to
238						shortest unique prefix.
239
240						=cut
241
242						sub _make_prefixes {
243	47		47		102	my $self = shift;
244	47				156	my %args = @_;
245
246	47				131	my $S = $self->{ 'strings' };
247
248	47				101	my %unique_ints = ();
249
250	47				72	for my $key ( @{ $args{ 'keys' } } ) {
	47				115
251	202				341	my $KA = $S->{ $key };
252	202				362	my $interesting_int = $KA->[ $args{ 'level' } ];
253	202				492	$unique_ints{ $interesting_int }++;
254						}
255
256						# At this moment, we know how many times given int happened at given
257						# level, so we can make sensible decisions
258
259	47				99	my %to_redo = ();
260
261	47				69	for my $key ( @{ $args{ 'keys' } } ) {
	47				105
262	202				325	my $KA = $S->{ $key };
263	202				303	my $interesting_int = $KA->[ $args{ 'level' } ];
264	202	100			428	if ( 1 == $unique_ints{ $interesting_int } ) {
265	130				180	splice @{ $KA }, 1 + $args{ 'level' };
	130				330
266	130				256	next;
267						}
268	72				101	push @{ $to_redo{ $interesting_int } }, $key;
	72				168
269						}
270
271						# In to_redo, we have blocks of keys, that share prefix (up to given
272						# level), so they have to be further processed.
273
274	47				124	for my $key_group ( values %to_redo ) {
275						$self->_make_prefixes(
276						'keys' => $key_group,
277	30				103	'level' => $args{ 'level' } + 1,
278						);
279						}
280
281	47				152	return;
282						}
283
284						=head2 _stringify
285
286						Converts arrays of ints (prefixes for hashed words) into strings
287
288						=cut
289
290						sub _stringify {
291	17		17		37	my $self = shift;
292
293	17				35	for my $key ( keys %{ $self->{ 'strings' } } ) {
	17				82
294	130				259	my $ints = $self->{ 'strings' }->{ $key };
295	130				173	my @words = map { $self->_word( $_ ) } @{ $ints };
	202				357
	130				217
296	130				471	$self->{ 'strings' }->{ $key } = join( '_', @words );
297						}
298						}
299
300						=head1 AUTHOR
301
302						hubert depesz lubaczewski, C<< >>
303
304						=head1 BUGS
305
306						Please report any bugs or feature requests to C.
307
308						=head1 SUPPORT
309
310						You can find documentation for this module with the perldoc command.
311
312						perldoc Pg::Explain::StringAnonymizer
313
314						=head1 COPYRIGHT & LICENSE
315
316						Copyright 2008-2021 hubert depesz lubaczewski, all rights reserved.
317
318						This program is free software; you can redistribute it and/or modify it
319						under the same terms as Perl itself.
320
321
322						=cut
323
324						1; # End of Pg::Explain::StringAnonymizer