| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::EN::SimilarNames::Levenshtein; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
2029
|
use MooseX::Declare; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
use Text::LevenshteinXS qw(distance); |
|
5
|
|
|
|
|
|
|
use Math::Combinatorics; |
|
6
|
|
|
|
|
|
|
use strict; |
|
7
|
|
|
|
|
|
|
use warnings; |
|
8
|
|
|
|
|
|
|
use 5.010; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 Name |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Lingua::EN::SimilarNames::Levenshtein - Compare people first and last names. |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 Synopsis |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $people = [ |
|
19
|
|
|
|
|
|
|
[ 'John', 'Wayne' ], |
|
20
|
|
|
|
|
|
|
[ 'Sundance', 'Kid' ], |
|
21
|
|
|
|
|
|
|
[ 'Jose', 'Wales' ], |
|
22
|
|
|
|
|
|
|
[ 'John', 'Wall' ], |
|
23
|
|
|
|
|
|
|
]; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my @people_objects = map { |
|
26
|
|
|
|
|
|
|
Person->new( |
|
27
|
|
|
|
|
|
|
first_name => $_->[0], |
|
28
|
|
|
|
|
|
|
last_name => $_->[1], |
|
29
|
|
|
|
|
|
|
) |
|
30
|
|
|
|
|
|
|
} @{$people}; |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Build list of name pairs within 5 character edits of each other |
|
33
|
|
|
|
|
|
|
my $similar_people = SimilarNames->new( |
|
34
|
|
|
|
|
|
|
list_of_people => \@people_objects, |
|
35
|
|
|
|
|
|
|
maximum_distance => 5 |
|
36
|
|
|
|
|
|
|
); |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Get the people name pairs as an ArrayRef[ArrayRef[ArrayRef[Str]]] |
|
39
|
|
|
|
|
|
|
print Dumper $similar_people->list_of_similar_name_pairs; |
|
40
|
|
|
|
|
|
|
# which results in: |
|
41
|
|
|
|
|
|
|
[ |
|
42
|
|
|
|
|
|
|
[ [ "Jose", "Wales" ], [ "John", "Wall" ] ], |
|
43
|
|
|
|
|
|
|
[ [ "Jose", "Wales" ], [ "John", "Wayne" ] ], |
|
44
|
|
|
|
|
|
|
[ [ "John", "Wall" ], [ "John", "Wayne" ] ] |
|
45
|
|
|
|
|
|
|
] |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 Description |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
Given a list of people objects, find the people whose names are within a |
|
50
|
|
|
|
|
|
|
specified edit distance. |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 Classes |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 Person |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This class defines people objects with first and last name attributes. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
class Person { |
|
63
|
|
|
|
|
|
|
has 'first_name' => (isa => 'Str', is => 'ro', default => ''); |
|
64
|
|
|
|
|
|
|
has 'last_name' => (isa => 'Str', is => 'ro', default => ''); |
|
65
|
|
|
|
|
|
|
has 'full_name' => ( |
|
66
|
|
|
|
|
|
|
isa => 'Str', |
|
67
|
|
|
|
|
|
|
is => 'ro', |
|
68
|
|
|
|
|
|
|
lazy_build => 1, |
|
69
|
|
|
|
|
|
|
); |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
method say_name() { |
|
72
|
|
|
|
|
|
|
say $self->full_name; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
method _build_full_name { |
|
76
|
|
|
|
|
|
|
return $self->first_name . ' ' . $self->last_name; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head2 CompareTwoNames |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
This class defines comparator objects. Given two Person objects, |
|
83
|
|
|
|
|
|
|
it computes the edit distance between their names. |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=cut |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
class CompareTwoNames { |
|
88
|
|
|
|
|
|
|
has 'one_person' => (isa => 'Person', is => 'rw'); |
|
89
|
|
|
|
|
|
|
has 'another_person' => (isa => 'Person', is => 'rw'); |
|
90
|
|
|
|
|
|
|
has 'distance_between' => ( |
|
91
|
|
|
|
|
|
|
isa => 'Int', |
|
92
|
|
|
|
|
|
|
is => 'ro', |
|
93
|
|
|
|
|
|
|
lazy_build => 1, |
|
94
|
|
|
|
|
|
|
); |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
method _build_distance_between() { |
|
97
|
|
|
|
|
|
|
return Text::LevenshteinXS::distance($self->one_person->first_name, |
|
98
|
|
|
|
|
|
|
$self->another_person->first_name) + |
|
99
|
|
|
|
|
|
|
Text::LevenshteinXS::distance($self->one_person->last_name, |
|
100
|
|
|
|
|
|
|
$self->another_person->last_name); |
|
101
|
|
|
|
|
|
|
}; |
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 SimilarNames |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
This class takes a list of Person objects and uses CompareTwoNames to |
|
107
|
|
|
|
|
|
|
generate a list of people with similar names based on an edit distance range. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
One can get at the list of Person object pairs with similar name via the |
|
110
|
|
|
|
|
|
|
C<list_of_people_with_similar_names> attribute. Alternatively, one can |
|
111
|
|
|
|
|
|
|
get at list of the names pairs themselves (no Person object) via the |
|
112
|
|
|
|
|
|
|
C<list_of_similar_name_pairs> attribute. |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
class SimilarNames { |
|
117
|
|
|
|
|
|
|
has 'list_of_people' => ( |
|
118
|
|
|
|
|
|
|
isa => 'ArrayRef[Person]', |
|
119
|
|
|
|
|
|
|
is => 'ro', |
|
120
|
|
|
|
|
|
|
lazy_build => 1 |
|
121
|
|
|
|
|
|
|
); |
|
122
|
|
|
|
|
|
|
has 'minimum_distance' => (isa => 'Int', is => 'rw', default => 1); |
|
123
|
|
|
|
|
|
|
has 'maximum_distance' => (isa => 'Int', is => 'rw', default => 3); |
|
124
|
|
|
|
|
|
|
has 'list_of_people_with_similar_names' => ( |
|
125
|
|
|
|
|
|
|
isa => 'ArrayRef[ArrayRef[Person]]', |
|
126
|
|
|
|
|
|
|
is => 'ro', |
|
127
|
|
|
|
|
|
|
lazy_build => 1 |
|
128
|
|
|
|
|
|
|
); |
|
129
|
|
|
|
|
|
|
has 'list_of_similar_name_pairs' => ( |
|
130
|
|
|
|
|
|
|
isa => 'ArrayRef[ArrayRef[ArrayRef[Str]]]', |
|
131
|
|
|
|
|
|
|
is => 'ro', |
|
132
|
|
|
|
|
|
|
lazy_build => 1 |
|
133
|
|
|
|
|
|
|
); |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
method _build_list_of_people_with_similar_names() { |
|
136
|
|
|
|
|
|
|
my $people_tuples = Math::Combinatorics->new( |
|
137
|
|
|
|
|
|
|
count => 2, # This could be abstracted |
|
138
|
|
|
|
|
|
|
data => $self->list_of_people, |
|
139
|
|
|
|
|
|
|
); |
|
140
|
|
|
|
|
|
|
my @list_of_people_with_similar_names; |
|
141
|
|
|
|
|
|
|
while (my ($first_person, $second_person) = $people_tuples->next_combination()) { |
|
142
|
|
|
|
|
|
|
my $name_comparison = CompareTwoNames->new( |
|
143
|
|
|
|
|
|
|
one_person => $first_person, |
|
144
|
|
|
|
|
|
|
another_person => $second_person, |
|
145
|
|
|
|
|
|
|
); |
|
146
|
|
|
|
|
|
|
my $distance_between_names = $name_comparison->distance_between(); |
|
147
|
|
|
|
|
|
|
if ( ($distance_between_names >= $self->minimum_distance) |
|
148
|
|
|
|
|
|
|
&& ($distance_between_names <= $self->maximum_distance)) |
|
149
|
|
|
|
|
|
|
{ |
|
150
|
|
|
|
|
|
|
push @list_of_people_with_similar_names, [ $first_person, $second_person ]; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
return \@list_of_people_with_similar_names |
|
155
|
|
|
|
|
|
|
}; |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
method _build_list_of_similar_name_pairs() { |
|
158
|
|
|
|
|
|
|
my @list_of_similar_name_pairs; |
|
159
|
|
|
|
|
|
|
foreach my $pair_of_people (@{ $self->list_of_people_with_similar_names }) { |
|
160
|
|
|
|
|
|
|
push @list_of_similar_name_pairs, |
|
161
|
|
|
|
|
|
|
[ |
|
162
|
|
|
|
|
|
|
[ $pair_of_people->[0]->first_name, $pair_of_people->[0]->last_name ], |
|
163
|
|
|
|
|
|
|
[ $pair_of_people->[1]->first_name, $pair_of_people->[1]->last_name ] |
|
164
|
|
|
|
|
|
|
]; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
return \@list_of_similar_name_pairs |
|
167
|
|
|
|
|
|
|
}; |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
__END__ |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 Accessors |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head2 list_of_similar_name_pairs |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
This is called on a SimilarNames object to return a list of similar |
|
177
|
|
|
|
|
|
|
name pairs for the list of Person objects passed in. It uses the Levenshtein |
|
178
|
|
|
|
|
|
|
edit distance. This means the names are close to one another in spelling. |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head2 list_of_people_with_similar_names |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
This accessor is similar to the C<list_of_similar_name_pairs> but returns a |
|
183
|
|
|
|
|
|
|
list of Person object pairs instead of the names. |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head1 Authors |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Mateu X. Hunter C<hunter@missoula.org> |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 Copyright |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Copyright 2010, Mateu X. Hunter |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 License |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
You may distribute this code under the same terms as Perl itself. |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 Code Repository |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
http://github.com/mateu/Lingua-EN-SimilarNames-Levenshtein |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=cut |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1 |