| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  |  | 
| 2 |  |  |  |  |  |  | =head1 NAME | 
| 3 |  |  |  |  |  |  |  | 
| 4 |  |  |  |  |  |  | Lingua::EN::MatchNames - Smart matching for human names. | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | use Lingua::EN::MatchNames; | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | $score= name_eq( $firstn_0, $lastn_0, $firstn_1, $lastn_1 ); | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | You have two databases of person records that need to be synchronized or matched up, | 
| 15 |  |  |  |  |  |  | but they use different keys--maybe one uses SSN and the other uses employee id. | 
| 16 |  |  |  |  |  |  | The only fields you have to match on are first and last name. | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | That's what this module is for. | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | Just feed the first and last names to the C function, and it returns | 
| 21 |  |  |  |  |  |  | C for no possible match, and a percentage of certainty (rank) otherwise. | 
| 22 |  |  |  |  |  |  | The ranking system isn't very scientific, and gender isn't considered, though | 
| 23 |  |  |  |  |  |  | it probably should be. | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | The C function, checks for: | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | =over 4 | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =item * inconsistent case (MacHenry = Machenry = MACHENRY) | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | =item * inconsistent symbols (O'Brien = Obrien = O BRIEN) | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | =item * misspellings (Grene = Green) | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | =item * last name hyphenation (Smith-Curry = Curry) | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =item * similar phonetics (Hanson = Hansen) | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | =item * nicknames (Midge = Peggy = Margaret) | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | =item * extraneous initials (H. Ross = Ross) | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | =item * extraneous suffixes (Reed, Jr. = Reed II = Reed) | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | =item * and more... | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | =back | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | =head2 Preliminary Tests: | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | Homer Simpson HOMER SIMPOSN: 77 | 
| 52 |  |  |  |  |  |  | Marge Simpson MIDGE SIMPSON: 81 | 
| 53 |  |  |  |  |  |  | Brian Lalonde BRYAN LA LONDE: 82 | 
| 54 |  |  |  |  |  |  | Brian Lalonde RYAN LALAND: 72 | 
| 55 |  |  |  |  |  |  | Peggy MacHenry Midge Machenry: 81 | 
| 56 |  |  |  |  |  |  | Liz Grene Elizabeth Green: 72 | 
| 57 |  |  |  |  |  |  | Chuck Reed, Jr. Charles Reed II: 82 | 
| 58 |  |  |  |  |  |  | Kathy O'Brien Catherine Obrien: 81 | 
| 59 |  |  |  |  |  |  | Lizzie Hanson Lisa Hanson: 91 | 
| 60 |  |  |  |  |  |  | H. Ross Perot Ross PEROT: 88 | 
| 61 |  |  |  |  |  |  | Kathy Smith-Curry KATIE CURRY: 81 | 
| 62 |  |  |  |  |  |  | Dina Johnson-Warner Dinah J-Warner: 80 | 
| 63 |  |  |  |  |  |  | Leela Miles-Conrad Leela MilesConrad: 86 | 
| 64 |  |  |  |  |  |  | C. Renee Smythe Cathy Smythe: 71 | 
| 65 |  |  |  |  |  |  | Victoria (Honey) Rider HONEY RIDER: 88 | 
| 66 |  |  |  |  |  |  | Bart Simpson El Barto Simpson: 80 | 
| 67 |  |  |  |  |  |  | Bart Simpson Lisa Simpson: (no match) | 
| 68 |  |  |  |  |  |  | Arthur Dent Zaphod Beeblebrox: (no match) | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | =head1 WARNING | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | The scoring in this version is utterly arbitrary. | 
| 73 |  |  |  |  |  |  | I made all of the numbers up. | 
| 74 |  |  |  |  |  |  | The certainty percentages should be OK relative to each other, but | 
| 75 |  |  |  |  |  |  | would be better if someone could give me some statistical data. | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | Be sure and B this against your data first! | 
| 78 |  |  |  |  |  |  | Your data may not look like my test data. | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | And although I hope this is useful to many, I do not provide any | 
| 81 |  |  |  |  |  |  | kind of warranty (expressed or implied), and do not suggest the | 
| 82 |  |  |  |  |  |  | suitability of this module to any particular purpose. | 
| 83 |  |  |  |  |  |  | This module probably should not be used for life support or military | 
| 84 |  |  |  |  |  |  | purposes, and it B not be used for unsolicited commercial email | 
| 85 |  |  |  |  |  |  | or other bulk advertising. | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | =head1 AUTHOR | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | Brian Lalonde, Ebrian@webcoder.infoE | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | =head1 REQUIREMENTS | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | Lingua::EN::NameParse, | 
| 94 |  |  |  |  |  |  | Lingua::EN::Nickname, | 
| 95 |  |  |  |  |  |  | Parse::RecDescent, | 
| 96 |  |  |  |  |  |  | String::Approx, | 
| 97 |  |  |  |  |  |  | Text::Metaphone, | 
| 98 |  |  |  |  |  |  | Text::Soundex | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | perl(1), | 
| 103 |  |  |  |  |  |  | L, | 
| 104 |  |  |  |  |  |  | L, | 
| 105 |  |  |  |  |  |  | L, | 
| 106 |  |  |  |  |  |  | L, | 
| 107 |  |  |  |  |  |  | L | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | =cut | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | package Lingua::EN::MatchNames; | 
| 112 |  |  |  |  |  |  | require Exporter; | 
| 113 | 1 |  |  | 1 |  | 473 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 59 |  | 
| 114 | 1 |  |  | 1 |  | 608 | use Lingua::EN::NameParse; | 
|  | 1 |  |  |  |  | 43968 |  | 
|  | 1 |  |  |  |  | 58 |  | 
| 115 | 1 |  |  | 1 |  | 856 | use Lingua::EN::Nickname; | 
|  | 1 |  |  |  |  | 22453 |  | 
|  | 1 |  |  |  |  | 235 |  | 
| 116 | 1 |  |  | 1 |  | 732 | use String::Approx 'amatch'; | 
|  | 1 |  |  |  |  | 4085 |  | 
|  | 1 |  |  |  |  | 81 |  | 
| 117 | 1 |  |  | 1 |  | 493 | use Text::Metaphone; | 
|  | 1 |  |  |  |  | 455 |  | 
|  | 1 |  |  |  |  | 53 |  | 
| 118 | 1 |  |  | 1 |  | 400 | use Text::Soundex; | 
|  | 1 |  |  |  |  | 2310 |  | 
|  | 1 |  |  |  |  | 143 |  | 
| 119 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 26 |  | 
| 120 | 1 |  |  | 1 |  | 3 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 68 |  | 
| 121 | 1 |  |  | 1 |  | 4 | use vars qw($debug); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 1074 |  | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | $VERSION=     '1.34'; | 
| 124 |  |  |  |  |  |  | @ISA=         qw(Exporter); | 
| 125 |  |  |  |  |  |  | @EXPORT=      qw(name_eq); | 
| 126 |  |  |  |  |  |  | @EXPORT_OK=   qw(fname_eq lname_eq); | 
| 127 |  |  |  |  |  |  | %EXPORT_TAGS= | 
| 128 |  |  |  |  |  |  | ( | 
| 129 |  |  |  |  |  |  | ALL => [ @EXPORT, @EXPORT_OK ], | 
| 130 |  |  |  |  |  |  | ); | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | sub _nparse($) | 
| 133 |  |  |  |  |  |  | { | 
| 134 | 0 |  |  | 0 |  | 0 | local $_= shift; | 
| 135 | 0 | 0 |  |  |  | 0 | my $nparse= new Lingua::EN::NameParse( auto_clean => 1, force_case => 1 ) | 
| 136 |  |  |  |  |  |  | or carp "Unable to set up name parser.\n$!\n"; | 
| 137 | 0 |  |  |  |  | 0 | $nparse->parse($_); | 
| 138 | 0 |  |  |  |  | 0 | my %name= $nparse->components; | 
| 139 | 0 | 0 |  |  |  | 0 | return($name{given_name_1},$name{surname_1}. | 
| 140 |  |  |  |  |  |  | ( $name{surname_2} ? '-'.$name{surname_2} : '' )); | 
| 141 |  |  |  |  |  |  | } | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | sub fname_eq | 
| 144 |  |  |  |  |  |  | { | 
| 145 | 20 |  |  | 20 | 0 | 23 | my($name0,$name1,$match)= @_; | 
| 146 | 20 | 50 | 33 |  |  | 56 | return unless $name0 and $name1; | 
| 147 | 20 | 100 |  |  |  | 29 | return 100 if $name0 eq $name1; | 
| 148 |  |  |  |  |  |  | # recurse offset nicknames | 
| 149 | 19 | 50 |  |  |  | 28 | if($name0=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name1,$1); } | 
|  | 1 | 100 |  |  |  | 5 |  | 
| 150 | 18 | 0 |  |  |  | 33 | if($name0=~ m/"(\w+)"/)   { return $match if $match= fname_eq($name1,$1); } | 
|  | 0 | 50 |  |  |  | 0 |  | 
| 151 | 18 | 0 |  |  |  | 28 | if($name1=~ m/\((\w+)\)/) { return $match if $match= fname_eq($name0,$1); } | 
|  | 0 | 50 |  |  |  | 0 |  | 
| 152 | 18 | 0 |  |  |  | 27 | if($name1=~ m/"(\w+)"/)   { return $match if $match= fname_eq($name0,$1); } | 
|  | 0 | 50 |  |  |  | 0 |  | 
| 153 |  |  |  |  |  |  | # strip leading/trailing initial(s) (98%) | 
| 154 | 18 |  |  |  |  | 42 | $name0=~ s/\W*\b\w\b\W*//g; | 
| 155 | 18 |  |  |  |  | 23 | $name1=~ s/\W*\b\w\b\W*//g; | 
| 156 | 18 | 100 |  |  |  | 25 | return 98 if $name0 eq $name1; | 
| 157 |  |  |  |  |  |  | # recurse separate parts | 
| 158 | 17 | 50 |  |  |  | 44 | if($name0=~ /\W/) | 
|  |  | 100 |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | { # split parts, find best match | 
| 160 | 0 |  |  |  |  | 0 | my($match)= sort { $b <=> $a } map {fname_eq($name1,$_)} split /\W+/, $name0; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 161 | 0 | 0 |  |  |  | 0 | return $match if $match; | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  | elsif($name1=~ /\W/) | 
| 164 |  |  |  |  |  |  | { # split parts, find best match | 
| 165 | 1 |  |  |  |  | 4 | my($match)= sort { $b <=> $a } map {fname_eq($name0,$_)} split /\W+/, $name1; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 2 |  |  |  |  | 10 |  | 
| 166 | 1 | 50 |  |  |  | 44 | return $match if $match; | 
| 167 |  |  |  |  |  |  | } | 
| 168 |  |  |  |  |  |  | # all caps, no symbols (95%) | 
| 169 | 16 |  |  |  |  | 17 | ($name0= uc $name0)=~ y/A-Z//cd; | 
| 170 | 16 |  |  |  |  | 17 | ($name1= uc $name1)=~ y/A-Z//cd; | 
| 171 | 16 | 100 |  |  |  | 28 | return 95 if $name0 eq $name1; | 
| 172 |  |  |  |  |  |  | # nickname (80%) | 
| 173 | 14 | 100 |  |  |  | 32 | return int 0.8*$match if $match= nickname_eq($name0,$name1); | 
| 174 |  |  |  |  |  |  | # fuzzy approx (15%) | 
| 175 | 5 | 100 | 66 |  |  | 584 | return 35 if amatch($name0,$name1) and amatch($name1,$name0); | 
| 176 |  |  |  |  |  |  | # simple trucation | 
| 177 | 3 | 50 | 33 |  |  | 143 | return 10 if $name0=~ /^$name1|$name1$/ or $name1=~ /^$name0|$name0$/; | 
| 178 |  |  |  |  |  |  | # a single initial | 
| 179 | 3 |  |  |  |  | 9 | ($name0,$name1)= @_; | 
| 180 | 3 | 50 |  |  |  | 11 | for($name0=~ m/\b(\w)\b/) { return 5 if $name1=~ /^$_/i; } | 
|  | 1 |  |  |  |  | 9 |  | 
| 181 | 2 | 0 |  |  |  | 4 | for($name1=~ m/\b(\w)\b/) { return 5 if $name0=~ /^$_/i; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 182 | 2 |  |  |  |  | 7 | return; | 
| 183 |  |  |  |  |  |  | } | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | sub lname_eq | 
| 186 |  |  |  |  |  |  | { | 
| 187 | 34 |  |  | 34 | 0 | 34 | my($name0,$name1)= @_; | 
| 188 | 34 | 50 | 33 |  |  | 105 | return unless $name0 and $name1; | 
| 189 | 34 | 100 |  |  |  | 56 | return 100 if $name0 eq $name1; | 
| 190 |  |  |  |  |  |  | # strip trailing suffixes (95%) | 
| 191 | 31 |  |  |  |  | 53 | $name0=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//; | 
| 192 | 31 |  |  |  |  | 56 | $name1=~ s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//; | 
| 193 | 31 | 50 |  |  |  | 43 | return 95 if $name0 eq $name1; | 
| 194 |  |  |  |  |  |  | # recurse hyphenated components | 
| 195 | 31 | 100 |  |  |  | 77 | if($name0=~ /-/) | 
|  |  | 50 |  |  |  |  |  | 
| 196 |  |  |  |  |  |  | { # split hyphenation on hyphen ONLY | 
| 197 | 7 |  |  |  |  | 13 | my($match)= sort { $b <=> $a } map {lname_eq($name1,$_)} split /-/, $name0; | 
|  | 2 |  |  |  |  | 3 |  | 
|  | 14 |  |  |  |  | 25 |  | 
| 198 | 7 | 50 |  |  |  | 26 | return $match if $match; | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  | elsif($name1=~ /-/) | 
| 201 |  |  |  |  |  |  | { # split hyphenation on hyphen ONLY | 
| 202 | 0 |  |  |  |  | 0 | my($match)= sort { $b <=> $a } map {lname_eq($name0,$_)} split /-/, $name1; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 203 | 0 | 0 |  |  |  | 0 | return $match if $match; | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  | # all caps, no symbols (85%) | 
| 206 | 24 |  |  |  |  | 40 | ($name0= uc $name0)=~ y/A-Z//cd; | 
| 207 | 24 |  |  |  |  | 27 | ($name1= uc $name1)=~ y/A-Z//cd; | 
| 208 | 24 | 100 |  |  |  | 90 | return 85 if $name0 eq $name1; | 
| 209 |  |  |  |  |  |  | # metaphone (70%) | 
| 210 | 12 | 100 |  |  |  | 55 | return 70 if Metaphone($name0) eq Metaphone($name1); | 
| 211 |  |  |  |  |  |  | # soundex (40%) | 
| 212 | 9 | 50 |  |  |  | 28 | return 40 if soundex($name0) eq soundex($name1); | 
| 213 |  |  |  |  |  |  | # fuzzy approx (15%) | 
| 214 | 9 | 50 | 33 |  |  | 23 | return 25 if amatch($name0,$name1) and amatch($name1,$name0); | 
| 215 |  |  |  |  |  |  | # nonstandard 'hyphenation'/simple truncation | 
| 216 | 9 |  |  |  |  | 226 | ($name0,$name1)= map {(my$n=$_)=~s/\s+([IVX]+|,.*|[JS]r\.?)\s*$//;$n=~y/A-Za-z\-//cd;$n} @_; | 
|  | 18 |  |  |  |  | 32 |  | 
|  | 18 |  |  |  |  | 25 |  | 
|  | 18 |  |  |  |  | 30 |  | 
| 217 | 9 | 100 | 66 |  |  | 56 | return int 0.8*lname_eq($name0,$name1) if $name0=~ s/(\B[A-Z][a-z]+)/-$1/g | 
| 218 |  |  |  |  |  |  | or $name1=~ s/(\B[A-Z][a-z]+)/-$1/g; | 
| 219 | 7 | 100 | 66 |  |  | 160 | return 10 if $name0=~ /^$name1|$name1$/i or $name1=~ /^$name0|$name0$/i; | 
| 220 | 6 |  |  |  |  | 19 | return; | 
| 221 |  |  |  |  |  |  | } | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | sub name_eq | 
| 224 |  |  |  |  |  |  | { | 
| 225 | 18 | 50 |  | 18 | 0 | 887 | my($nomF0,$nomL0,$nomF1,$nomL1,$Frank,$Lrank)= | 
| 226 |  |  |  |  |  |  | ( @_ < 4 ? (_nparse($_[0]),_nparse($_[1])) : @_ ); | 
| 227 | 18 | 100 |  |  |  | 30 | return unless $Lrank= lname_eq $nomL0, $nomL1; | 
| 228 | 17 | 100 |  |  |  | 26 | return unless $Frank= fname_eq $nomF0, $nomF1; | 
| 229 | 16 |  |  |  |  | 1329 | return int $Lrank*0.7 + $Frank*0.3; # another ratio I just made up | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | 1 |