File Coverage

blib/lib/Text/Record/Deduper.pm

Criterion	Covered	Total	%
statement	225	326	69.0
branch	64	114	56.1
condition	4	18	22.2
subroutine	22	27	81.4
pod	7	7	100.0
total	322	492	65.4

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Text::Record::Deduper - Separate complete, partial and near duplicate text records
4
5							=head1 SYNOPSIS
6
7							use Text::Record::Deduper;
8
9							my $deduper = new Text::Record::Deduper;
10
11							# Find and remove entire lines that are duplicated
12							$deduper->dedupe_file("orig.txt");
13
14							# Dedupe comma separated records, duplicates defined by several fields
15							$deduper->field_separator(',');
16							$deduper->add_key(field_number => 1, ignore_case => 1 );
17							$deduper->add_key(field_number => 2, ignore_whitespace => 1);
18							# unique records go to file names_uniqs.csv, dupes to names_dupes.csv
19							$deduper->dedupe_file('names.csv');
20
21							# Find 'near' dupes by allowing for given name aliases
22							my %nick_names = (Bob => 'Robert',Rob => 'Robert');
23							my $near_deduper = new Text::Record::Deduper();
24							$near_deduper->add_key(field_number => 2, alias => \%nick_names) or die;
25							$near_deduper->dedupe_file('names.txt');
26
27							# Create a text report, names_report.txt to identify all duplicates
28							$near_deduper->report_file('names.txt',all_records => 1);
29
30							# Find 'near' dupes in an array of records, returning references
31							# to a unique and a duplicate array
32							my ($uniqs,$dupes) = $near_deduper->dedupe_array(\@some_records);
33
34							# Create a report on unique and duplicate records
35							$deduper->report_file("orig.txt",all_records => 0);
36
37
38							=head1 DESCRIPTION
39
40							This module allows you to take a text file of records and split it into
41							a file of unique and a file of duplicate records. Deduping of arrays is
42							also possible.
43
44							Records are defined as a set of fields. Fields may be separated by spaces,
45							commas, tabs or any other delimiter. Records are separated by a new line.
46
47							If no options are specifed, a duplicate will be created only when all the
48							fields in a record (the entire line) are duplicated.
49
50							By specifying options a duplicate record is defined by which fields or partial
51							fields must not occur more than once per record. There are also options to
52							ignore case sensitivity, leading and trailing white space.
53
54							Additionally 'near' or 'fuzzy' duplicates can be defined. This is done by creating
55							aliases, such as Bob => Robert.
56
57							This module is useful for finding duplicates that have been created by
58							multiple data entry, or merging of similar records
59
60
61							=head1 METHODS
62
63							=head2 new
64
65							The C method creates an instance of a deduping object. This must be
66							called before any of the following methods are invoked.
67
68							=head2 field_separator
69
70							Sets the token to use as the field delimiter. Accepts any character as well as
71							Perl escaped characters such as "\t" etc. If this method ins not called the
72							deduper assumes you have fixed width fields .
73
74							$deduper->field_separator(',');
75
76
77							=head2 add_key
78
79							Lets you add a field to the definition of a duplicate record. If no keys
80							have been added, the entire record will become the key, so that only records
81							duplicated in their entirity are removed.
82
83							$deduper->add_key
84							(
85							field_number => 1,
86							key_length => 5,
87							ignore_case => 1,
88							ignore_whitespace => 1,
89							alias => \%nick_names
90							);
91
92							=over 4
93
94							=item field_number
95
96							Specifies the number of the field in the record to add to the key (1,2 ...).
97							Note that this option only applies to character separated data. You will get a
98							warning if you try to specify a field_number for fixed width data.
99
100							=item start_pos
101
102							Specifies the position of the field in characters to add to the key. Note that
103							this option only applies to fixed width data. You will get a warning if you
104							try to specify a start_pos for character separated data. You must also specify
105							a key_length.
106
107							Note that the first column is numbered 1, not 0.
108
109
110							=item key_length
111
112							The length of a key field. This must be specifed if you are using fixed width
113							data (along with a start_pos). It is optional for character separated data.
114
115							=item ignore_case
116
117							When defining a duplicate, ignore the case of characters, so Robert and ROBERT
118							are equivalent.
119
120							=item ignore_whitespace
121
122							When defining a duplicate, ignore white space that leasd or trails a field's data.
123
124							=item alias
125
126							When defining a duplicate, allow for aliases substitution. For example
127
128							my %nick_names = (Bob => 'Robert',Rob => 'Robert');
129							$near_deduper->add_key(field_number => 2, alias => \%nick_names) or die;
130
131							Whenever field 2 contains 'Bob', it will be treated as a duplicate of a record
132							where field 2 contains 'Robert'.
133
134							=back
135
136
137							=head2 dedupe_file
138
139							This method takes a file name F as it's only argument. The file is
140							processed to detect duplicates, as defined by the methods above. Unique records
141							are place in a file named F and duplicates in a file named
142							F. Note that If either of this output files exist, they are
143							over written The orignal file is left intact.
144
145							$deduper->dedupe_file("orig.txt");
146
147
148							=head2 dedupe_array
149
150							This method takes an array reference as it's only argument. The array is
151							processed to detect duplicates, as defined by the methods above. Two array
152							references are retuned, the first to the set of unique records and the second
153							to the set of duplicates.
154
155							Note that the memory constraints of your system may prevent you from processing
156							very large arrays.
157
158							my ($unique_records,duplicate_records) = $deduper->dedupe_array(\@some_records);
159
160
161							=head2 report_file
162
163							This method takes a file name F as it's initial argument.
164
165							A text report is produced with the following columns
166
167							record number : the line number of the record
168
169							key : the key values that define record uniqueness
170
171							type: the type of record
172							unique : record only occurs once
173							identical : record occurs more than once, first occurence has parent record number of 0
174							alias : record occurs more than once, after alias substitutions have been applied
175
176							parent record number : the line number of the record that THIS record is a duplicate of.
177
178							By default, the report file name is F.
179
180							Various setup options may be defined in a hash that is passed as an optional argument to
181							the C method. Note that all the arguments are optional. They include
182
183							=over 4
184
185							=item all_records
186
187							When this option is set to a positive value, all records will be included in
188							the report. If this value is not set, only the duplicate records will be included
189							in the report
190
191							=back
192
193
194							$deduper->report_file("orig.txt",all_records => 0)
195
196							=head2 report_array
197
198							This method takes an array as it's initial argument. The behaviour is the same as
199							C above except that the report file is named F
200
201							=head1 EXAMPLES
202
203							=head2 Dedupe an array of single records
204
205							Given an array of strings:
206
207							my @emails =
208							(
209							'John.Smith@xyz.com',
210							'Bob.Smith@xyz.com',
211							'John.Brown@xyz.com.au,
212							'John.Smith@xyz.com'
213							);
214
215							use Text::Record::Deduper;
216
217							my $deduper = new Text::Record::Deduper();
218							my ($uniq,$dupe);
219							($uniq,$dupe) = $deduper->dedupe_array(\@emails);
220
221							The array reference $uniq now contains
222
223							'John.Smith@xyz.com',
224							'Bob.Smith@xyz.com',
225							'John.Brown@xyz.com.au'
226
227							The array reference $dupe now contains
228
229							'John.Smith@xyz.com'
230
231
232							=head2 Dedupe a file of fixed width records
233
234							Given a text file F with space separated values and duplicates defined
235							by the second and third columns:
236
237							100 Bob Smith
238							101 Robert Smith
239							102 John Brown
240							103 Jack White
241							104 Bob Smythe
242							105 Robert Smith
243
244
245							use Text::Record::Deduper;
246
247							my %nick_names = (Bob => 'Robert',Rob => 'Robert');
248							my $near_deduper = new Text::Record::Deduper();
249							$near_deduper->add_key(start_pos => 5, key_length => 9, ignore_whitespace => 1, alias => \%nick_names) or die;
250							$near_deduper->add_key(start_pos => 14, key_length => 9,) or die;
251							$near_deduper->dedupe_file("names.txt");
252							$near_deduper->report_file("names.txt");
253
254
255							Text::Record::Deduper will produce a file of unique records, F
256							in the same directory as F.
257
258							101 Robert Smith
259							102 John Brown
260							103 Jack White
261							104 Bob Smythe
262
263
264							and a file of duplicates, F in the same directory as F
265
266							100 Bob Smith
267							105 Robert Smith
268
269							The original file, F is left intact.
270
271							A report file F, is created in the same directory as F
272
273							Number Key Type Parent Parent Key
274							--------------------------------------------------------------------------------
275							1 Bob_Smith alias 2 Robert_Smith
276							2 Robert_Smith identical 0
277							3 John_Brown unique 0
278							4 Jack_White unique 0
279							5 Bob_Smythe unique 0
280							6 Robert_Smith identical 2 Robert_Smith
281
282
283							=head1 TO DO
284
285							Allow for multi line records
286							Add batch mode driven by config file or command line options
287							Allow option to warn user when over writing output files
288							Allow user to customise suffix for uniq and dupe output files
289
290
291							=head1 SEE ALSO
292
293							sort(3), uniq(3), L, L, L
294
295
296							=head1 AUTHOR
297
298							Text::Record::Deduper was written by Kim Ryan
299
300
301							=head1 COPYRIGHT AND LICENSE
302
303							Copyright (C) 2011 Kim Ryan.
304
305
306							This library is free software; you can redistribute it and/or modify
307							it under the same terms as Perl itself.
308
309
310							=cut
311
312							package Text::Record::Deduper;
313	1			1		25002	use FileHandle;
	1					13962
	1					7
314	1			1		439	use File::Basename;
	1					2
	1					102
315	1			1		966	use Text::ParseWords;
	1					1524
	1					70
316	1			1		1258	use Data::Dumper;
	1					11616
	1					84
317
318
319
320	1			1		9	use strict;
	1					2
	1					36
321	1			1		6	use warnings;
	1					2
	1					4009
322
323							require Exporter;
324
325							our @ISA = qw(Exporter);
326							our $VERSION = '0.07';
327
328
329							#-------------------------------------------------------------------------------
330							# Create a new instance of a deduping object.
331
332							sub new
333							{
334	3			3	1	2091	my $class = shift;
335	3					8	my %args = @_;
336
337	3					9	my $deduper = {};
338	3					8	bless($deduper,$class);
339
340							# Default to no separator, until we find otherwise
341	3					13	$deduper->{field_separator} = '';
342
343	3					9	return ($deduper);
344							}
345							#-------------------------------------------------------------------------------
346							# Create a new instance of a deduping object.
347
348							sub field_separator
349							{
350	1			1	1	7	my $deduper = shift;
351
352	1					2	my ($field_separator) = @_;
353
354							# Escape pipe symbol so it does get interpreted as alternation character
355							# when splitting fields in _get_key_fields
356	1	50				5	$field_separator eq '\|' and $field_separator = '\\|';
357
358							# add more error checking here
359
360	1					3	$deduper->{field_separator} = $field_separator;
361	1					2	return($deduper);
362							}
363							#-------------------------------------------------------------------------------
364							# Dewfine a key field in the record
365							sub add_key
366							{
367	4			4	1	25	my $deduper = shift;
368	4					18	my %args = @_;
369
370
371	4					9	$deduper->{key_counter}++;
372
373	4	100				14	if ( $args{field_number} )
		50
374							{
375	2	50				8	unless ( $deduper->{field_separator} )
376							{
377	0					0	warn "Cannot use field_number on fixed width lines";
378	0					0	return;
379							}
380							}
381							elsif ( $args{start_pos} )
382							{
383	2	50				9	if ( $deduper->{field_separator} )
384							{
385	0					0	warn "Cannot use start_pos on character separated records";
386	0					0	return;
387							}
388							else
389							{
390	2	50				9	unless ( $args{key_length} )
391							{
392	0					0	warn "No key_length defined for start_pos: $args{start_pos}";
393	0					0	return;
394							}
395							}
396							}
397
398	4					13	foreach my $current_key (keys %args)
399							{
400	15	100				38	if ($current_key eq 'ignore_case' )
401							{
402	4					13	$deduper->{ignore_case}{$deduper->{key_counter}} = 1;
403							}
404	15	100				36	if ($current_key eq 'ignore_whitespace' )
405							{
406	3					11	$deduper->{ignore_whitespace}{$deduper->{key_counter}} = 1;
407							}
408	15	100				33	if ($current_key eq 'alias' )
409							{
410	2	50				6	if ( $args{ignore_case} )
411							{
412							# if ignore case, fold all of alias to upper case
413	2					4	my %current_alias = %{ $args{$current_key} };
	2					10
414	2					5	my %corrected_alias;
415	2					7	foreach my $current_alias_key ( keys %current_alias )
416							{
417	6					30	$corrected_alias{uc($current_alias_key)} = uc($current_alias{$current_alias_key});
418
419							}
420	2					10	$deduper->{alias}{$deduper->{key_counter}} = \%corrected_alias;
421							}
422							else
423							{
424	0					0	$deduper->{alias}{$deduper->{key_counter}} = $args{$current_key};
425							}
426							}
427	15	100				69	if ($current_key =~ /field_number\|start_pos\|key_length/ )
428							{
429	6					26	$deduper->{key}{$deduper->{key_counter}}{$current_key} = $args{$current_key};
430							}
431							}
432
433	4					25	return ($deduper);
434							}
435							#-------------------------------------------------------------------------------
436							#
437							sub dedupe_file
438							{
439	0			0	1	0	my $deduper = shift;
440	0					0	my ($input_file_name) = @_;
441
442	0	0	0			0	unless ( -T $input_file_name and -s $input_file_name )
443							{
444	0					0	warn("Could not find input file: $input_file_name");
445	0					0	return(0);
446							}
447
448	0					0	my $input_fh = new FileHandle "<$input_file_name";
449	0	0				0	unless ($input_fh)
450							{
451	0					0	warn "Could not open input file: $input_file_name";
452	0					0	return(0);
453							}
454
455	0					0	my ($file_name,$path,$suffix) = File::Basename::fileparse($input_file_name,qr{\..*});
456	0					0	my $file_name_unique_records = "$path$file_name\_uniqs$suffix";
457	0					0	my $file_name_duplicate_records = "$path$file_name\_dupes$suffix";
458
459							# TO DO!!! test for overwriting of previous Deduper output
460	0					0	my $unique_fh = new FileHandle ">$file_name_unique_records";
461	0	0				0	unless($unique_fh)
462							{
463	0					0	warn "Could not open file: $file_name_unique_records: $!";
464	0					0	return(0);
465							}
466
467	0					0	my $dupes_fh = new FileHandle ">$file_name_duplicate_records";
468	0	0				0	unless ( $dupes_fh )
469							{
470	0					0	warn "Could not open file: $file_name_duplicate_records: $!";
471	0					0	return(0);
472							}
473
474	0					0	my ($master_record_index) = $deduper->_analyse('file',undef,$input_fh);
475	0					0	$deduper->_separate('file',$master_record_index,undef,$input_fh,$unique_fh,$dupes_fh);
476
477	0					0	$input_fh->close;
478	0					0	$unique_fh->close;
479	0					0	$dupes_fh->close;
480							}
481							#-------------------------------------------------------------------------------
482
483							sub _rewind_file
484							{
485	0			0		0	my ($input_fh) = @_;
486	0					0	$input_fh->seek(0,0); # rewind file
487							}
488							#-------------------------------------------------------------------------------
489							#
490							sub dedupe_array
491							{
492	3			3	1	17	my $deduper = shift;
493
494	3					5	my ($input_array_ref) = @_;
495	3					10	my ($master_record_index) = $deduper->_analyse('array',$input_array_ref,undef);
496	3					11	my ($uniq_array_ref,$dupe_array_ref) = $deduper->_separate('array',$master_record_index,$input_array_ref,undef);
497
498	3					21	return($uniq_array_ref,$dupe_array_ref);
499							}
500							#-------------------------------------------------------------------------------
501							# Produce a text report on deduping statistics
502							sub report_file
503							{
504
505	0			0	1	0	my $deduper = shift;
506	0					0	my ($input_file_name,%report_options) = @_;
507
508	0	0	0			0	unless ( -T $input_file_name and -s $input_file_name )
509							{
510	0					0	warn("Could not find input file: $input_file_name");
511	0					0	return(0);
512							}
513
514	0					0	my $input_fh = new FileHandle "<$input_file_name";
515	0	0				0	unless ($input_fh)
516							{
517	0					0	warn "Could not open input file: $input_file_name";
518	0					0	return(0);
519							}
520
521	0					0	my ($master_record_index_ref) = $deduper->_analyse('file',undef,$input_fh);
522
523	0					0	my $report_file_name;
524	0	0				0	if ( $report_options{file_name} )
525							{
526							# user has specified name and path to report file
527	0					0	$report_file_name = $report_options{file_name};
528							}
529							else
530							{
531							# use base of input file, append _report.txt to report file name
532	0					0	my ($file_name,$path,$suffix) = File::Basename::fileparse($input_file_name,qr{\..*});
533	0					0	$report_file_name = "$path$file_name\_report\.txt";
534							}
535
536
537	0					0	$deduper->_report($master_record_index_ref,$report_file_name,%report_options);
538	0					0	$input_fh->close;
539
540							}
541							#-------------------------------------------------------------------------------
542							#
543							sub report_array
544							{
545
546	0			0	1	0	my $deduper = shift;
547	0					0	my ($input_array_ref,%report_options) = @_;
548
549	0					0	my ($master_record_index_ref) = $deduper->_analyse('array',$input_array_ref,undef);
550
551	0					0	my $report_file_name;
552	0	0				0	if ( $report_options{file_name} )
553							{
554							# user has specified name and path to report file
555	0					0	$report_file_name = $report_options{file_name};
556							}
557							else
558							{
559							# TO DO, make name more unique, eg add time stamp
560	0					0	$report_file_name = "./deduper_array_report.txt";
561							}
562
563	0					0	$deduper->_report($master_record_index_ref,$report_file_name,%report_options);
564
565							}
566							#-------------------------------------------------------------------------------
567							#
568							sub _report
569							{
570	0			0		0	my $deduper = shift;
571	0					0	my ($master_record_index_ref,$report_file_name,%report_options) = @_;
572
573
574	0					0	my $report_fh = new FileHandle ">$report_file_name";
575	0	0				0	unless($report_fh)
576							{
577	0					0	warn "Could not open report file: $report_file_name: $!";
578	0					0	return(0);
579							}
580
581							# TO DO, report format, side be side or interleaved?
582							# options, all records or dupes (default), group all dupes even first
583							# full record dump, not just key
584
585	0					0	my $current_line = sprintf("%6s %-30.30s %-10.10s %6s %-30.30s\n",
586							'Number', 'Key','Type','Parent','Parent Key');
587	0					0	$report_fh->print($current_line);
588	0					0	$report_fh->print('-' x 80,"\n");
589
590	0					0	foreach my $record_num ( sort { $a <=> $b } keys %$master_record_index_ref )
	0					0
591							{
592
593	0	0	0			0	if ( $report_options{all_records} or
			0
594							($master_record_index_ref->{$record_num}->{type} ne 'unique' and
595							$master_record_index_ref->{$record_num}->{parent} > 0 ) )
596							{
597	0					0	my $parent_record_key = '';
598	0	0				0	if ( $master_record_index_ref->{$record_num}->{parent} )
599							{
600	0					0	$parent_record_key = $master_record_index_ref->{$master_record_index_ref->{$record_num}->{parent}}->{key};
601							}
602	0					0	my $current_line = sprintf("%6d %-30.30s %-10.10s %6d %-30.30s\n",
603							$record_num,
604							$master_record_index_ref->{$record_num}->{key},
605							$master_record_index_ref->{$record_num}->{type},
606							$master_record_index_ref->{$record_num}->{parent},
607							$parent_record_key);
608
609	0					0	$report_fh->print($current_line);
610							}
611							}
612	0					0	$report_fh->close;
613							}
614							#-------------------------------------------------------------------------------
615							#
616							sub _analyse
617							{
618	3			3		5	my $deduper = shift;
619	3					13	my ($storage_type,$input_array_ref,$input_fh) = @_;
620
621	3					5	my $current_record_number = 0;
622	3					4	my $current_line;
623	3					4	my $finished = 0;
624
625
626	3					4	my %alias_candidates;
627	3	50				11	if ( $deduper->{alias} )
628							{
629	3					12	my %all_alias_values = $deduper->_get_all_alias_values;
630	3					8	while ( not $finished )
631							{
632	16					39	($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
633	16					25	$current_record_number++;
634	16					43	my $alias_candidate_key = $deduper->_alias_candidate($current_line,%all_alias_values);
635	16	100	100			81	if ( $alias_candidate_key and not $alias_candidates{$alias_candidate_key} )
636							{
637	2					9	$alias_candidates{$alias_candidate_key} = $current_record_number;
638							}
639							}
640							}
641							# print(Dumper(\%alias_candidates));
642							# die;
643
644	3					4	my %seen_exact_dupes;
645	3					5	my $unique_ref = [];
646	3					6	my $dupe_ref = [];
647	3					4	$current_record_number = 0;
648	3					4	my %master_record_index;
649
650	3					4	$finished = 0;
651	3	50	33			12	if ( $storage_type eq 'file' and $deduper->{alias} )
652							{
653	0					0	_rewind_file($input_fh);
654							}
655
656	3					8	while ( not $finished )
657							{
658	16					32	($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
659	16					20	$current_record_number++;
660
661	16					17	my $dupe_type;
662	16					36	my %record_keys = $deduper->_get_key_fields($current_line);
663
664
665	16					47	%record_keys = $deduper->_transform_key_fields(%record_keys);
666	16					59	my $full_key = _assemble_full_key(%record_keys);
667
668	16					23	my $parent_record_number;
669	16	100				44	if ( $parent_record_number = $deduper->_alias_dupe(\%alias_candidates,%record_keys) )
		100
670							{
671	4					5	$dupe_type = 'alias';
672							}
673							# add soundex dupe
674							# add string approx dupe
675							elsif ( $parent_record_number = _exact_dupe($current_line,$full_key,%seen_exact_dupes) )
676							{
677	5					8	$dupe_type = 'identical';
678							}
679							else
680							{
681	7					10	$dupe_type = 'unique';
682							# retain the record number of dupe, useful for detailed reporting and grouping
683	7					15	$seen_exact_dupes{$full_key} = $current_record_number;
684	7					8	$parent_record_number = 0;
685							}
686
687	16					41	_classify_record($dupe_type,$parent_record_number,$current_record_number,$full_key,\%master_record_index);
688							}
689	3					14	return(\%master_record_index);
690
691							}
692
693							#-------------------------------------------------------------------------------
694							sub _alias_candidate
695							{
696	16			16		19	my $deduper = shift;
697	16					31	my ($current_line,%all_alias_values) = @_;
698
699	16					39	my %record_keys = $deduper->_get_key_fields($current_line);
700	16					54	%record_keys = $deduper->_transform_key_fields(%record_keys);
701
702	16					31	my $alias_candidate_key = '';
703	16					41	foreach my $current_key ( sort keys %record_keys )
704							{
705
706	20					32	my $current_key_data = $record_keys{$current_key};
707	20	100				49	if ( $deduper->{alias}{$current_key} )
708							{
709	16	50				38	not $all_alias_values{$current_key} and next;
710	16	100				19	if ( grep(/^$current_key_data$/,@{ $all_alias_values{$current_key} }) )
	16					204
711							{
712	4					12	$alias_candidate_key .= $current_key_data . ':';
713							}
714							else
715							{
716	12					44	return(0);
717							}
718							}
719							else
720							{
721	4					13	$alias_candidate_key .= $current_key_data . ':';
722							}
723							}
724	4					16	return($alias_candidate_key);
725							}
726							#-------------------------------------------------------------------------------
727							#
728							sub _get_all_alias_values
729							{
730	3			3		4	my $deduper = shift;
731
732	3					4	my %all_alias_values;
733
734	3					5	foreach my $key_number ( sort keys %{$deduper->{alias}} )
	3					13
735							{
736	3					4	my %current_alias = %{ $deduper->{alias}{$key_number} };
	3					35
737	3					5	my (@current_alias_values,%seen_alias_values);
738	3					8	foreach my $current_alias_value ( values %current_alias )
739							{
740	9	100				26	unless ( $seen_alias_values{$current_alias_value} )
741							{
742	6					8	push(@current_alias_values,$current_alias_value);
743	6					18	$seen_alias_values{$current_alias_value}++;
744							}
745							}
746	3					20	$all_alias_values{$key_number}= [ @current_alias_values ];
747							}
748	3					12	return(%all_alias_values)
749							}
750
751							#-------------------------------------------------------------------------------
752							#
753
754							sub _get_key_fields
755							{
756
757	32			32		36	my $deduper = shift;
758	32					45	my ($current_line) = @_;
759
760	32					34	my %record_keys;
761
762
763	32	50				69	if ( $deduper->{key} )
764							{
765	32	100				89	if ( $deduper->{field_separator} )
766							{
767
768							# The ParseWords module will not handle single quotes within fields,
769							# so add an escape sequence between any apostrophe bounded by a
770							# letter on each side. Note that this applies even if there are no
771							# quotes in your data, the module needs balanced quotes.
772	12	100				40	if ( $current_line =~ /\w'\w/ )
773							{
774							# check for names with apostrophes, like O'Reilly
775	4					32	$current_line =~ s/(\w)'(\w)/$1\\'$2/g;
776							}
777
778							# Use ParseWords module to spearate delimited field.
779							# '0' option means don't return any quotes enclosing a field
780	12					45	my (@field_data) = &Text::ParseWords::parse_line($deduper->{field_separator},0,$current_line);
781
782
783	12					1427	foreach my $key_number ( sort keys %{$deduper->{key}} )
	12					49
784							{
785	24					55	my $current_field_data = $field_data[$deduper->{key}->{$key_number}->{field_number} - 1];
786	24	50				52	unless ( $current_field_data )
787							{
788							# A record has less fields then we were expecting, so no
789							# point searching for anymore.
790	0					0	print("Short record\n");
791	0					0	print("Current line : $current_line\n");
792	0					0	print("All fields :", @field_data,"\n");
793	0					0	last;
794							# TO DO, add a warning if user specifies records that must have
795							# a full set of fields??
796							}
797
798	24	50				56	if ( $deduper->{key}->{$key_number}->{key_length} )
799							{
800	0					0	$current_field_data = substr($current_field_data,0,$deduper->{key}->{$key_number}->{key_length});
801							}
802	24					81	$record_keys{$key_number} = $current_field_data;
803							}
804							}
805							else
806							{
807	20					25	foreach my $key_number ( sort keys %{$deduper->{key}} )
	20					63
808							{
809	40					196	my $current_field_data = substr($current_line,$deduper->{key}->{$key_number}->{start_pos} - 1,
810							$deduper->{key}->{$key_number}->{key_length});
811	40	50				61	if ( $current_field_data )
812							{
813	40					107	$record_keys{$key_number} = $current_field_data;
814							}
815							else
816							{
817	0					0	print("Short record\n");
818	0					0	print("Current line : $current_line\n");
819	0					0	last;
820							# TO DO, add a warning if user specifies records must have
821							# a full set of fields??
822							}
823							}
824							}
825							}
826							else
827							{
828							# no key fileds defined, use whole line as key
829	0					0	$record_keys{1} = $current_line;
830							}
831	32					314	return(%record_keys);
832							}
833							#-------------------------------------------------------------------------------
834							#
835
836							sub _transform_key_fields
837							{
838	32			32		43	my $deduper = shift;
839	32					64	my (%record_keys) = @_;
840
841	32	50				84	if ( $deduper->{ignore_whitespace} )
842							{
843	32					35	foreach my $key_number ( keys %{$deduper->{ignore_whitespace}} )
	32					75
844							{
845							# strip out leading and/or trailing whitespace
846	44					91	$record_keys{$key_number} =~ s/^\s+//;
847	44					162	$record_keys{$key_number} =~ s/\s+$//;
848							}
849							}
850
851	32	50				96	if ( $deduper->{ignore_case} )
852							{
853							# Transform every field where ignore_case was specified
854
855	32					34	foreach my $key_number ( keys %{$deduper->{ignore_case}} )
	32					77
856							{
857							# If this key is case insensitive, fold data to upper case
858	64					144	$record_keys{$key_number} = uc($record_keys{$key_number});
859							}
860							}
861	32					167	return(%record_keys);
862							}
863							#-------------------------------------------------------------------------------
864							#
865
866							sub _assemble_full_key
867							{
868	16			16		33	my (%record_keys) = @_;
869	16					16	my $full_key;
870							my @each_key;
871	16					37	foreach my $current_key ( sort keys %record_keys )
872							{
873	32					59	push(@each_key,$record_keys{$current_key});
874							}
875	16					38	$full_key = join('_',@each_key);
876	16					46	return($full_key);
877
878							}
879							#-------------------------------------------------------------------------------
880							#
881
882							sub _alias_dupe
883							{
884	16			16		17	my $deduper = shift;
885	16					34	my ($alias_candidates_ref,%record_keys) = @_;
886
887
888	16					19	my $alias_dupe = 0;
889	16	50				41	if ( $deduper->{alias} )
890							{
891	16					15	my $alias_was_substituted = 0;
892
893	16					18	foreach my $key_number ( keys %{$deduper->{alias}} )
	16					38
894							{
895	16					13	my %current_alias = %{ $deduper->{alias}{$key_number} };
	16					66
896	16					35	foreach my $current_alias_key ( keys %current_alias )
897							{
898	39	100				123	if ( $record_keys{$key_number} eq $current_alias_key )
899							{
900	8					9	$alias_was_substituted = 1;
901	8					12	$record_keys{$key_number} = $current_alias{$current_alias_key};
902	8					27	last;
903							}
904							}
905							}
906	16	100				43	if ( $alias_was_substituted )
907							{
908	8					10	my $full_key;
909	8					18	foreach my $current_key ( sort keys %record_keys )
910							{
911	16					33	$full_key .= $record_keys{$current_key} . ':';
912							}
913	8	100				22	if ( $alias_candidates_ref->{$full_key} )
914							{
915	4					8	$alias_dupe = $alias_candidates_ref->{$full_key};
916							}
917							}
918							}
919							# returns the number of the orignal unique record for which this current record is an alias dupe of
920	16					63	return($alias_dupe);
921							}
922							#-------------------------------------------------------------------------------
923							#
924
925							sub _exact_dupe
926							{
927	12			12		16	my $deduper = shift;
928	12					27	my ($full_key,%seen_exact_dupes) = @_;
929							# problem with unitialized value, set to undef??
930	12	100				23	if ( $seen_exact_dupes{$full_key} )
931							{
932	5					17	return($seen_exact_dupes{$full_key});
933							}
934							else
935							{
936	7					20	return(0);
937							}
938							}
939
940							#-------------------------------------------------------------------------------
941							#
942
943							sub _read_one_record
944							{
945	48			48		78	my ($storage_type,$current_record_number,$input_array_ref,$input_fh) = @_;
946
947	48					51	my $finished = 0;
948	48					51	my $current_line;
949
950	48	50				116	if ( $storage_type eq 'file' )
		50
951							{
952	0	0				0	if ( $current_line = $input_fh->getline )
953							{
954	0					0	chomp($current_line);
955	0	0				0	if ( $input_fh->eof )
956							{
957	0					0	$finished = 1;
958							}
959							}
960							else
961							{
962	0					0	warn "Could not read line from input file";
963	0					0	$finished = 1;
964							}
965							}
966							elsif ( $storage_type eq 'array' )
967							{
968	48					73	$current_line = @$input_array_ref[$current_record_number];
969	48					70	my $last_element = @$input_array_ref - 1;
970	48	100				128	if ( $current_record_number == $last_element )
		50
971							{
972	9					13	$finished = 1;
973							}
974							elsif ( $current_record_number > $last_element )
975							{
976	0					0	warn "You are trying to access beyond the input array boundaries";
977	0					0	$finished = 1;
978							}
979							}
980							else
981							{
982	0					0	warn "Illegal storage type";
983	0					0	$finished = 1;
984							}
985	48					125	return($current_line,$finished);
986							}
987
988
989							#-------------------------------------------------------------------------------
990							#
991
992							sub _classify_record
993							{
994	16			16		27	my ($dupe_type,$parent_record_number,$current_record_number,$full_key,$master_record_index) = @_;
995	16					50	$master_record_index->{$current_record_number}{key} = $full_key;
996	16					29	$master_record_index->{$current_record_number}{parent} = $parent_record_number;
997	16					28	$master_record_index->{$current_record_number}{type} = $dupe_type;
998
999							# If there is a parent, update it now, so that is marked as the first
1000							# dupe in a set (of current type alias, indentical etc). Note that
1001							# a record can be parent to several record types, eg alias and indentical
1002							# Currently only updating from last child, but may want to record all in future.
1003
1004	16	100				50	if ( $parent_record_number )
1005							{
1006	9					17	$master_record_index->{$parent_record_number}{type} = $dupe_type;
1007	9					102	$master_record_index->{$parent_record_number}{parent} = 0;
1008							}
1009							}
1010
1011							#-------------------------------------------------------------------------------
1012							#
1013
1014							sub _separate
1015							{
1016	3			3		36	my $deduper = shift;
1017	3					7	my ($storage_type,$master_record_index_ref,$input_array_ref,$input_fh,$unique_fh,$dupes_fh) = @_;
1018
1019	3	50				10	if ( $storage_type eq 'file' )
1020							{
1021	0					0	_rewind_file($input_fh);
1022							}
1023
1024	3					5	my $unique_ref = [];
1025	3					4	my $dupe_ref = [];
1026
1027
1028	3					4	my $current_record_number = 0;
1029	3					4	my $current_line;
1030	3					4	my $finished = 0;
1031
1032
1033	3					7	while ( not $finished )
1034							{
1035	16					20	my $current_line;
1036	16					26	($current_line,$finished) = _read_one_record($storage_type,$current_record_number,$input_array_ref,$input_fh);
1037	16					22	$current_record_number++;
1038	16					31	my $dupe_type = $master_record_index_ref->{$current_record_number}{type};
1039	16					26	my $parent_record_number = $master_record_index_ref->{$current_record_number}{parent};
1040
1041							# The first duplicate in a set of 1 or more dupes (the parent), is treated as a unique record
1042							# TO DO!!! allow user to define this initial dupe as not unique, and group with it's childeren dupes
1043							# TO DO!!! separate out to alias, soundex dupes to their own file if needed
1044	16	100				37	if ( $parent_record_number == 0 )
1045							{
1046	7					11	$dupe_type = 'unique';
1047							}
1048
1049	16					25	_write_one_record($storage_type,$dupe_type,$current_line,$unique_ref,$dupe_ref,$input_fh,$unique_fh,$dupes_fh);
1050							}
1051	3					10	return($unique_ref,$dupe_ref);
1052
1053							}
1054							#-------------------------------------------------------------------------------
1055							#
1056
1057							sub _write_one_record
1058							{
1059	16			16		31	my ($storage_type,$dupe_type,$current_line,$unique_ref,$dupe_ref,$input_fh,$unique_fh,$dupes_fh) = @_;
1060
1061	16	50				45	if ( $storage_type eq 'file' )
		50
1062							{
1063
1064	0	0				0	if ( $dupe_type eq 'unique' )
1065							{
1066	0					0	$unique_fh->print("$current_line\n");
1067							}
1068							else
1069							{
1070	0					0	$dupes_fh->print("$current_line\n");
1071							}
1072							}
1073							elsif ( $storage_type eq 'array' )
1074							{
1075	16	100				28	if ( $dupe_type eq 'unique' )
1076							{
1077	7					25	push(@$unique_ref,$current_line);
1078							}
1079							else
1080							{
1081							# TO DO!!! separate out to alias, soundex dupes etc if needed
1082	9					31	push(@$dupe_ref,$current_line);
1083							}
1084							}
1085							}
1086
1087
1088							1;
1089