File Coverage

blib/lib/Lingua/EN/NameLookup.pm
Criterion Covered Total %
statement 85 94 90.4
branch 23 26 88.4
condition 1 3 33.3
subroutine 13 14 92.8
pod 9 9 100.0
total 131 146 89.7


line stmt bran cond sub pod time code
1             # $Header$
2             # $Log$
3             #
4             package Lingua::EN::NameLookup;
5 6     6   3134 use warnings;
  6         9  
  6         154  
6 6     6   28 use strict;
  6         9  
  6         164  
7              
8 6     6   26 use vars qw($VERSION);
  6         14  
  6         357  
9             $VERSION = '0.01';
10              
11 6     6   5189 use Text::Soundex;
  6         22108  
  6         817  
12 6     6   44 use Carp;
  6         11  
  6         6334  
13              
14             sub new {
15 6     6 1 165 my $this = shift;
16 6   33     52 my $class = ref($this) || $this;
17 6         15 my $self = {};
18 6         19 bless $self, $class;
19 6         18 return $self;
20             }
21              
22             sub lookup {
23 9     9 1 67 my ($self,$name) = @_;
24 9         32 my $code = soundex($name);
25 9 100       24 return 0 if (!$code);
26 8 100       32 return 0 if (!exists($self->{$code}));
27 5         7 foreach my $item (sort @{ $self->{$code}}) {
  5         26  
28 22 50       38 return 0 if ($item gt $name);
29 22 100       55 return 1 if ($item eq $name);
30             }
31 1         4 return 0;
32             }
33              
34             sub ilookup {
35 6     6 1 50 my ($self,$name) = @_;
36 6         26 my $code = soundex($name);
37 6 100       17 return 0 if (!$code);
38 5 100       20 return 0 if (!exists($self->{$code}));
39 4         6 foreach my $item (sort @{ $self->{$code}}) {
  4         23  
40 8 100       91 return 1 if ($item =~ /$name/i);
41             }
42 0         0 return 0;
43             }
44            
45             sub add {
46 2     2 1 13 my ($self,$name) = @_;
47 2         5 my $code = soundex($name);
48 2 50       8 if (!exists($self->{$code})) {
49 2         8 $self->{$code} = [ $name ];
50             } else {
51 0         0 my @array = @{ $self->{$code}};
  0         0  
52 0         0 push @array,$name;
53 0         0 $self->{$code} = [ sort(@array) ];
54             }
55             }
56            
57             sub dump {
58 2     2 1 27 my ($self,$filename) = @_;
59 2 100       261 unless (open(_DICT,">$filename")) {
60 1         56 warn "Can't open $filename for dump $!";
61 1         19 return 0;
62             }
63 1         1303 foreach my $family ( sort keys %$self ) {
64 1988         2071 print _DICT "$family ",join(":",sort(@{ $self->{$family} })),"\n";
  1988         6569  
65             }
66 1         185 close(_DICT);
67 1         12 return 1;
68             }
69              
70             sub load {
71 7     7 1 64 my ($self,$filename) = @_;
72 7         1846 %$self = ();
73 7 100       387 unless (open(_DICT,"$filename")) {
74 1         36 warn "Can't open $filename for load $!";
75 1         16 return 0;
76             }
77 6         779 while (<_DICT>) {
78 11928         13292 chomp;
79 11928 50       45335 next unless s/^([A-Z]\d{3})\s*//;
80 11928         94716 $self->{$1} = [ split(/:/) ];
81             }
82 6         248 close(_DICT);
83 6         48 return 1;
84             }
85              
86             sub init {
87 2     2 1 23 my ($self,$filename) = @_;
88 2         2675 %$self = ();
89 2 100       90 unless (open(_DICT,"$filename")) {
90 1         61 warn "Can't open $filename for init $!";
91 1         19 return 0;
92             }
93 1         33 while (<_DICT>) {
94 11761         12945 chomp;
95 11761         37534 my $code = soundex($_);
96 11761         12827 push @{ $self->{"$code"} }, "$_";
  11761         52272  
97             }
98 1         50 close(_DICT);
99 1         9 return 1;
100             }
101              
102             sub print {
103 0     0 1 0 my ($self) = @_;
104 0         0 foreach my $family ( sort keys %$self ) {
105 0         0 print "$family: ",join(" ",sort(@{ $self->{$family} })),"\n"
  0         0  
106             }
107             }
108              
109             sub report {
110 1     1 1 12 my ($self) = @_;
111 1         3 my $key_count = 0;
112 1         2 my $longest_array = 0;
113 1         2 my $name_count = 0;
114 1         397 foreach my $key (keys %$self) {
115 1988         1771 $key_count++;
116 1988         2640 my @array = @{$self->{$key}};
  1988         6082  
117 1988         2010 my $array_length = $#array;
118 1988         6077 $name_count += $array_length;
119 1988 100       4298 $longest_array = $array_length if ($array_length > $longest_array);
120             }
121 1         208 $longest_array++;
122 1         9 return ($key_count, $name_count, $longest_array);
123             }
124              
125             1;
126              
127             =pod
128             =head1 Name
129              
130             Lingua::EN::NameLookup - a simple dictionary search and manipulation class.
131              
132             =head1 Synopsis
133              
134             use Lingua::EN::NameLookup;
135             $dict = new Lingua::EN::NameLookup;
136             $dict->load("mydict.dat");
137             $res = $dict->lookup("FOO");
138             $res = $dict->ilookup("Foo");
139             $dict->add("Bar");
140             $dict->dump("mynewdict.dat");
141              
142             =head1 Description
143              
144             This class provides the ability to search and manipulate a simple dictionary.
145             It was originally designed for checking surnames encountered during the
146             preparation of census indices. It works best with small data sets and where the
147             names in the data set generate many distinct soundex values. The dictionary is
148             maintained in memory and hence the memory usage depends on the number of names.
149              
150             =head2 Technique
151              
152             Here's how data is stored in the dictionary:
153              
154             Firstly the soundex value of the name is calculated. If there is no key in the
155             hash with the soundex then the name is stored as a one element array. If there
156             already is a key in the hash with the soundex the name is added to the end of
157             the existing array. Then the array is sorted and stored back in the hash. Hence
158             for a name such as BARLOW we might have the following in the hash:
159              
160             B640 => (BARIL, BARLEY, BARLOW, BERLE,...)
161              
162             Here's how we look up a name:
163              
164             First the soundex of the name is calculated. If there is no key in the hash
165             with that soundex then the name is not in the dictionary. If there is a key in
166             the hash with that soundex then the array is retrieved and searched for the
167             name. Since we know that the array is sorted then the search can terminate as
168             soon as an array element greater than the name being searched for is found as
169             we then know that it cannot be in the array. This speeds things up when the
170             individual arrays are large.
171              
172             =head1 Methods
173              
174             =head2 new
175              
176             Creates a dictionary object and initialises it (to be empty). Options are
177             passed as keyword value pairs. Recognised options are:
178              
179             =head2 lookup($name)
180              
181             Looks up the name in the dictionary, returns true if it is found or false if it
182             is not found.
183              
184             =head2 ilookup($name)
185              
186             Looks up the name in the dictionary but with a case insensitive match, returns
187             true if it is found or false if it is not found. Not as efficient as lookup.
188              
189             =head2 add($name)
190              
191             Add one name to the dictionary. Probably called after B q.v. has failed
192             to find a name.
193              
194             =head2 dump($file)
195              
196             Dumps the dictionary to a file suitable for subsequent reading by B q.v.
197             Each line of the file looks like:
198              
199             soundex name1:name2:name3...
200              
201             If the file cannot be opened for writing then this method will croak.
202              
203             =head2 load($file)
204              
205             Load the dictionary from a file produced by B q.v. This is more efficient
206             than using the B method as it saves having to calculate the soundex for
207             each name. Each line of the file looks like:
208              
209             soundex name1:name2:name3...
210              
211             If the file cannot be opened for reading then this method will croak.
212              
213             =head2 init($file)
214              
215             Initialise the dictionary from a file containing one name on each line.
216              
217             If the file cannot be opened for reading then this method will croak.
218              
219             =head2 print
220              
221             Produce a human readable form of the dictionary on standard output. This method
222             was originally designed for debugging but may have other uses.
223              
224             =head2 report
225              
226             Returns a list containing the number of keys in the hash, the number of names
227             in the hash and the length of the longest has entry. This method was originally
228             designed for performance testing but may have other uses.
229              
230             =head1 Copyright
231              
232             Copyright (c) 2002 Pete Barlow . All rights reserved.
233             This program is free software; you can redistribute it and/or modify it under
234             the same terms as Perl itself.