File Coverage

blib/lib/Lingua/EN/SimilarNames/Levenshtein.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Lingua::EN::SimilarNames::Levenshtein;
2              
3 1     1   2029 use MooseX::Declare;
  0            
  0            
4             use Text::LevenshteinXS qw(distance);
5             use Math::Combinatorics;
6             use strict;
7             use warnings;
8             use 5.010;
9              
10             our $VERSION = '0.10';
11              
12             =head1 Name
13            
14             Lingua::EN::SimilarNames::Levenshtein - Compare people first and last names.
15              
16             =head1 Synopsis
17              
18             my $people = [
19             [ 'John', 'Wayne' ],
20             [ 'Sundance', 'Kid' ],
21             [ 'Jose', 'Wales' ],
22             [ 'John', 'Wall' ],
23             ];
24            
25             my @people_objects = map {
26             Person->new(
27             first_name => $_->[0],
28             last_name => $_->[1],
29             )
30             } @{$people};
31            
32             # Build list of name pairs within 5 character edits of each other
33             my $similar_people = SimilarNames->new(
34             list_of_people => \@people_objects,
35             maximum_distance => 5
36             );
37            
38             # Get the people name pairs as an ArrayRef[ArrayRef[ArrayRef[Str]]]
39             print Dumper $similar_people->list_of_similar_name_pairs;
40             # which results in:
41             [
42             [ [ "Jose", "Wales" ], [ "John", "Wall" ] ],
43             [ [ "Jose", "Wales" ], [ "John", "Wayne" ] ],
44             [ [ "John", "Wall" ], [ "John", "Wayne" ] ]
45             ]
46            
47             =head1 Description
48            
49             Given a list of people objects, find the people whose names are within a
50             specified edit distance.
51            
52             =cut
53              
54             =head1 Classes
55              
56             =head2 Person
57              
58             This class defines people objects with first and last name attributes.
59              
60             =cut
61              
62             class Person {
63             has 'first_name' => (isa => 'Str', is => 'ro', default => '');
64             has 'last_name' => (isa => 'Str', is => 'ro', default => '');
65             has 'full_name' => (
66             isa => 'Str',
67             is => 'ro',
68             lazy_build => 1,
69             );
70              
71             method say_name() {
72             say $self->full_name;
73             }
74              
75             method _build_full_name {
76             return $self->first_name . ' ' . $self->last_name;
77             }
78             }
79              
80             =head2 CompareTwoNames
81              
82             This class defines comparator objects. Given two Person objects,
83             it computes the edit distance between their names.
84              
85             =cut
86              
87             class CompareTwoNames {
88             has 'one_person' => (isa => 'Person', is => 'rw');
89             has 'another_person' => (isa => 'Person', is => 'rw');
90             has 'distance_between' => (
91             isa => 'Int',
92             is => 'ro',
93             lazy_build => 1,
94             );
95              
96             method _build_distance_between() {
97             return Text::LevenshteinXS::distance($self->one_person->first_name,
98             $self->another_person->first_name) +
99             Text::LevenshteinXS::distance($self->one_person->last_name,
100             $self->another_person->last_name);
101             };
102             }
103              
104             =head2 SimilarNames
105              
106             This class takes a list of Person objects and uses CompareTwoNames to
107             generate a list of people with similar names based on an edit distance range.
108              
109             One can get at the list of Person object pairs with similar name via the
110             C<list_of_people_with_similar_names> attribute. Alternatively, one can
111             get at list of the names pairs themselves (no Person object) via the
112             C<list_of_similar_name_pairs> attribute.
113              
114             =cut
115              
116             class SimilarNames {
117             has 'list_of_people' => (
118             isa => 'ArrayRef[Person]',
119             is => 'ro',
120             lazy_build => 1
121             );
122             has 'minimum_distance' => (isa => 'Int', is => 'rw', default => 1);
123             has 'maximum_distance' => (isa => 'Int', is => 'rw', default => 3);
124             has 'list_of_people_with_similar_names' => (
125             isa => 'ArrayRef[ArrayRef[Person]]',
126             is => 'ro',
127             lazy_build => 1
128             );
129             has 'list_of_similar_name_pairs' => (
130             isa => 'ArrayRef[ArrayRef[ArrayRef[Str]]]',
131             is => 'ro',
132             lazy_build => 1
133             );
134              
135             method _build_list_of_people_with_similar_names() {
136             my $people_tuples = Math::Combinatorics->new(
137             count => 2, # This could be abstracted
138             data => $self->list_of_people,
139             );
140             my @list_of_people_with_similar_names;
141             while (my ($first_person, $second_person) = $people_tuples->next_combination()) {
142             my $name_comparison = CompareTwoNames->new(
143             one_person => $first_person,
144             another_person => $second_person,
145             );
146             my $distance_between_names = $name_comparison->distance_between();
147             if ( ($distance_between_names >= $self->minimum_distance)
148             && ($distance_between_names <= $self->maximum_distance))
149             {
150             push @list_of_people_with_similar_names, [ $first_person, $second_person ];
151             }
152             }
153              
154             return \@list_of_people_with_similar_names
155             };
156              
157             method _build_list_of_similar_name_pairs() {
158             my @list_of_similar_name_pairs;
159             foreach my $pair_of_people (@{ $self->list_of_people_with_similar_names }) {
160             push @list_of_similar_name_pairs,
161             [
162             [ $pair_of_people->[0]->first_name, $pair_of_people->[0]->last_name ],
163             [ $pair_of_people->[1]->first_name, $pair_of_people->[1]->last_name ]
164             ];
165             }
166             return \@list_of_similar_name_pairs
167             };
168             }
169              
170             __END__
171              
172             =head1 Accessors
173            
174             =head2 list_of_similar_name_pairs
175              
176             This is called on a SimilarNames object to return a list of similar
177             name pairs for the list of Person objects passed in. It uses the Levenshtein
178             edit distance. This means the names are close to one another in spelling.
179              
180             =head2 list_of_people_with_similar_names
181              
182             This accessor is similar to the C<list_of_similar_name_pairs> but returns a
183             list of Person object pairs instead of the names.
184              
185             =head1 Authors
186            
187             Mateu X. Hunter C<hunter@missoula.org>
188            
189             =head1 Copyright
190            
191             Copyright 2010, Mateu X. Hunter
192            
193             =head1 License
194            
195             You may distribute this code under the same terms as Perl itself.
196              
197             =head1 Code Repository
198              
199             http://github.com/mateu/Lingua-EN-SimilarNames-Levenshtein
200              
201             =cut
202              
203             1