File Coverage

blib/lib/Lingua/EN/MatchNames.pm
Criterion Covered Total %
statement 81 99 81.8
branch 46 78 58.9
condition 10 21 47.6
subroutine 12 13 92.3
pod 0 3 0.0
total 149 214 69.6


line stmt bran cond sub pod time code
1            
2             =head1 NAME
3            
4             Lingua::EN::MatchNames - Smart matching for human names.
5            
6             =head1 SYNOPSIS
7            
8             use Lingua::EN::MatchNames;
9            
10             $score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 );
11            
12             =head1 DESCRIPTION
13            
14             You have two databases of person records that need to be synchronized or matched up,
15             but they use different keys--maybe one uses SSN and the other uses employee id.
16             The only fields you have to match on are first and last name.
17            
18             That's what this module is for.
19            
20             Just feed the first and last names to the C function, and it returns
21             C for no possible match, and a percentage of certainty (rank) otherwise.
22             The ranking system isn't very scientific, and gender isn't considered, though
23             it probably should be.
24            
25             The C function, checks for:
26            
27             =over 4
28            
29             =item * inconsistent case (MacHenry = Machenry = MACHENRY)
30            
31             =item * inconsistent symbols (O'Brien = Obrien = O BRIEN)
32            
33             =item * misspellings (Grene = Green)
34            
35             =item * last name hyphenation (Smith-Curry = Curry)
36            
37             =item * similar phonetics (Hanson = Hansen)
38            
39             =item * nicknames (Midge = Peggy = Margaret)
40            
41             =item * extraneous initials (H. Ross = Ross)
42            
43             =item * extraneous suffixes (Reed, Jr. = Reed II = Reed)
44            
45             =item * and more...
46            
47             =back
48            
49             =head2 Preliminary Tests:
50            
51             Homer Simpson HOMER SIMPOSN: 77
52             Marge Simpson MIDGE SIMPSON: 81
53             Brian Lalonde BRYAN LA LONDE: 82
54             Brian Lalonde RYAN LALAND: 72
55             Peggy MacHenry Midge Machenry: 81
56             Liz Grene Elizabeth Green: 72
57             Chuck Reed, Jr. Charles Reed II: 82
58             Kathy O'Brien Catherine Obrien: 81
59             Lizzie Hanson Lisa Hanson: 91
60             H. Ross Perot Ross PEROT: 88
61             Kathy Smith-Curry KATIE CURRY: 81
62             Dina Johnson-Warner Dinah J-Warner: 80
63             Leela Miles-Conrad Leela MilesConrad: 86
64             C. Renee Smythe Cathy Smythe: 71
65             Victoria (Honey) Rider HONEY RIDER: 88
66             Bart Simpson El Barto Simpson: 80
67             Bart Simpson Lisa Simpson: (no match)
68             Arthur Dent Zaphod Beeblebrox: (no match)
69            
70             =head1 WARNING
71            
72             The scoring in this version is utterly arbitrary.
73             I made all of the numbers up.
74             The certainty percentages should be OK relative to each other, but
75             would be better if someone could give me some statistical data.
76            
77             Be sure and B this against your data first!
78             Your data may not look like my test data.
79            
80             And although I hope this is useful to many, I do not provide any
81             kind of warranty (expressed or implied), and do not suggest the
82             suitability of this module to any particular purpose.
83             This module probably should not be used for life support or military
84             purposes, and it B not be used for unsolicited commercial email
85             or other bulk advertising.
86            
87             =head1 AUTHOR
88            
89             Brian Lalonde, Ebrian@webcoder.infoE
90            
91             =head1 REQUIREMENTS
92            
93             Lingua::EN::NameParse,
94             Lingua::EN::Nickname,
95             Parse::RecDescent,
96             String::Approx,
97             Text::Metaphone,
98             Text::Soundex
99            
100             =head1 SEE ALSO
101            
102             perl(1),
103             L,
104             L,
105             L,
106             L,
107             L
108            
109             =cut
110            
111             package Lingua::EN::MatchNames;
112             require Exporter;
113 1     1   473 use Carp;
  1         1  
  1         59  
114 1     1   608 use Lingua::EN::NameParse;
  1         43968  
  1         58  
115 1     1   856 use Lingua::EN::Nickname;
  1         22453  
  1         235  
116 1     1   732 use String::Approx 'amatch';
  1         4085  
  1         81  
117 1     1   493 use Text::Metaphone;
  1         455  
  1         53  
118 1     1   400 use Text::Soundex;
  1         2310  
  1         143  
119 1     1   5 use strict;
  1         1  
  1         26  
120 1     1   3 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         68  
121 1     1   4 use vars qw($debug);
  1         1  
  1         1074  
122            
123             $VERSION= '1.34';
124             @ISA= qw(Exporter);
125             @EXPORT= qw(name_eq);
126             @EXPORT_OK= qw(fname_eq lname_eq);
127             %EXPORT_TAGS=
128             (
129             ALL => [ @EXPORT, @EXPORT_OK ],
130             );
131            
132             sub _nparse($)
133             {
134 0     0   0 local $_= shift;
135 0 0       0 my $nparse= new Lingua::EN::NameParse( auto_clean => 1, force_case => 1 )
136             or carp "Unable to set up name parser.\n$!\n";
137 0         0 $nparse->parse($_);
138 0         0 my %name= $nparse->components;
139 0 0       0 return($name{given_name_1},$name{surname_1}.
140             ( $name{surname_2} ? '-'.$name{surname_2} : '' ));
141             }
142            
143             sub fname_eq
144             {
145 20     20 0 23 my($name0,$name1,$match)= @_;
146 20 50 33     56 return unless $name0 and $name1;
147 20 100       29 return 100 if $name0 eq $name1;
148             # recurse offset nicknames
149 19 50       28 if($name0=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name1,$1); }
  1 100       5  
150 18 0       33 if($name0=~ m/"(\w+)"/) { return $match if $match= fname_eq($name1,$1); }
  0 50       0  
151 18 0       28 if($name1=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name0,$1); }
  0 50       0  
152 18 0       27 if($name1=~ m/"(\w+)"/) { return $match if $match= fname_eq($name0,$1); }
  0 50       0  
153             # strip leading/trailing initial(s) (98%)
154 18         42 $name0=~ s/\W*\b\w\b\W*//g;
155 18         23 $name1=~ s/\W*\b\w\b\W*//g;
156 18 100       25 return 98 if $name0 eq $name1;
157             # recurse separate parts
158 17 50       44 if($name0=~ /\W/)
    100          
159             { # split parts, find best match
160 0         0 my($match)= sort { $b <=> $a } map {fname_eq($name1,$_)} split /\W+/, $name0;
  0         0  
  0         0  
161 0 0       0 return $match if $match;
162             }
163             elsif($name1=~ /\W/)
164             { # split parts, find best match
165 1         4 my($match)= sort { $b <=> $a } map {fname_eq($name0,$_)} split /\W+/, $name1;
  0         0  
  2         10  
166 1 50       44 return $match if $match;
167             }
168             # all caps, no symbols (95%)
169 16         17 ($name0= uc $name0)=~ y/A-Z//cd;
170 16         17 ($name1= uc $name1)=~ y/A-Z//cd;
171 16 100       28 return 95 if $name0 eq $name1;
172             # nickname (80%)
173 14 100       32 return int 0.8*$match if $match= nickname_eq($name0,$name1);
174             # fuzzy approx (15%)
175 5 100 66     584 return 35 if amatch($name0,$name1) and amatch($name1,$name0);
176             # simple trucation
177 3 50 33     143 return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/;
178             # a single initial
179 3         9 ($name0,$name1)= @_;
180 3 50       11 for($name0=~ m/\b(\w)\b/) { return 5 if $name1=~ /^$_/i; }
  1         9  
181 2 0       4 for($name1=~ m/\b(\w)\b/) { return 5 if $name0=~ /^$_/i; }
  0         0  
182 2         7 return;
183             }
184            
185             sub lname_eq
186             {
187 34     34 0 34 my($name0,$name1)= @_;
188 34 50 33     105 return unless $name0 and $name1;
189 34 100       56 return 100 if $name0 eq $name1;
190             # strip trailing suffixes (95%)
191 31         53 $name0=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
192 31         56 $name1=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;
193 31 50       43 return 95 if $name0 eq $name1;
194             # recurse hyphenated components
195 31 100       77 if($name0=~ /-/)
    50          
196             { # split hyphenation on hyphen ONLY
197 7         13 my($match)= sort { $b <=> $a } map {lname_eq($name1,$_)} split /-/, $name0;
  2         3  
  14         25  
198 7 50       26 return $match if $match;
199             }
200             elsif($name1=~ /-/)
201             { # split hyphenation on hyphen ONLY
202 0         0 my($match)= sort { $b <=> $a } map {lname_eq($name0,$_)} split /-/, $name1;
  0         0  
  0         0  
203 0 0       0 return $match if $match;
204             }
205             # all caps, no symbols (85%)
206 24         40 ($name0= uc $name0)=~ y/A-Z//cd;
207 24         27 ($name1= uc $name1)=~ y/A-Z//cd;
208 24 100       90 return 85 if $name0 eq $name1;
209             # metaphone (70%)
210 12 100       55 return 70 if Metaphone($name0) eq Metaphone($name1);
211             # soundex (40%)
212 9 50       28 return 40 if soundex($name0) eq soundex($name1);
213             # fuzzy approx (15%)
214 9 50 33     23 return 25 if amatch($name0,$name1) and amatch($name1,$name0);
215             # nonstandard 'hyphenation'/simple truncation
216 9         226 ($name0,$name1)= map {(my$n=$_)=~s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;$n=~y/A-Za-z\-//cd;$n} @_;
  18         32  
  18         25  
  18         30  
217 9 100 66     56 return int 0.8*lname_eq($name0,$name1) if $name0=~ s/(\B[A-Z][a-z]+)/-$1/g
218             or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g;
219 7 100 66     160 return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i;
220 6         19 return;
221             }
222            
223             sub name_eq
224             {
225 18 50   18 0 887 my($nomF0,$nomL0,$nomF1,$nomL1,$Frank,$Lrank)=
226             ( @_ < 4 ? (_nparse($_[0]),_nparse($_[1])) : @_ );
227 18 100       30 return unless $Lrank= lname_eq $nomL0, $nomL1;
228 17 100       26 return unless $Frank= fname_eq $nomF0, $nomF1;
229 16         1329 return int $Lrank*0.7 + $Frank*0.3; # another ratio I just made up
230             }
231            
232             1