line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ============================================================================ |
2
|
|
|
|
|
|
|
package Text::Phonetic::Phonix; |
3
|
|
|
|
|
|
|
# ============================================================================ |
4
|
4
|
|
|
4
|
|
87089
|
use utf8; |
|
4
|
|
|
|
|
16
|
|
|
4
|
|
|
|
|
21
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
391
|
use Moo; |
|
4
|
|
|
|
|
7631
|
|
|
4
|
|
|
|
|
18
|
|
7
|
|
|
|
|
|
|
extends qw(Text::Phonetic); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION = $Text::Phonetic::VERSION; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VOVEL = '[AEIOU]'; |
12
|
|
|
|
|
|
|
our $VOVEL_WITHY = '[AEIOUY]'; |
13
|
|
|
|
|
|
|
our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]'; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @VALUES = ( |
16
|
|
|
|
|
|
|
[qr/[AEIOUHWY]/,0], |
17
|
|
|
|
|
|
|
[qr/[BP]/,1], |
18
|
|
|
|
|
|
|
[qr/[CGJKQ]/,2], |
19
|
|
|
|
|
|
|
[qr/[DT]/,3], |
20
|
|
|
|
|
|
|
[qr/L/,4], |
21
|
|
|
|
|
|
|
[qr/[MN]/,5], |
22
|
|
|
|
|
|
|
[qr/R/,6], |
23
|
|
|
|
|
|
|
[qr/[FV]/,7], |
24
|
|
|
|
|
|
|
[qr/[SXZ]/,8], |
25
|
|
|
|
|
|
|
); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
our @RULES = ( |
28
|
|
|
|
|
|
|
[qr/DG/,'G'], |
29
|
|
|
|
|
|
|
[qr/C([OAU])/,'K1'], |
30
|
|
|
|
|
|
|
[qr/C[YI]/,'SI'], |
31
|
|
|
|
|
|
|
[qr/CE/,'SE'], |
32
|
|
|
|
|
|
|
[qr/^CL($VOVEL)/,'KL1'], |
33
|
|
|
|
|
|
|
[qr/CK/,'K'], |
34
|
|
|
|
|
|
|
[qr/[GJ]C$/,'K'], |
35
|
|
|
|
|
|
|
[qr/^CH?R($VOVEL)/,'KR1'], |
36
|
|
|
|
|
|
|
[qr/^WR/,'R'], |
37
|
|
|
|
|
|
|
[qr/NC/,'NK'], |
38
|
|
|
|
|
|
|
[qr/CT/,'KT'], |
39
|
|
|
|
|
|
|
[qr/PH/,'F'], |
40
|
|
|
|
|
|
|
[qr/AA/,'AR'], #neu |
41
|
|
|
|
|
|
|
[qr/SCH/,'SH'], |
42
|
|
|
|
|
|
|
[qr/BTL/,'TL'], |
43
|
|
|
|
|
|
|
[qr/GHT/,'T'], |
44
|
|
|
|
|
|
|
[qr/AUGH/,'ARF'], |
45
|
|
|
|
|
|
|
[qr/($VOVEL)LJ($VOVEL)/,'1LD2'], |
46
|
|
|
|
|
|
|
[qr/LOUGH/,'LOW'], |
47
|
|
|
|
|
|
|
[qr/^Q/,'KW'], |
48
|
|
|
|
|
|
|
[qr/^KN/,'N'], |
49
|
|
|
|
|
|
|
[qr/GN$/,'N'], |
50
|
|
|
|
|
|
|
[qr/GHN/,'N'], |
51
|
|
|
|
|
|
|
[qr/GNE$/,'N'], |
52
|
|
|
|
|
|
|
[qr/GHNE/,'NE'], |
53
|
|
|
|
|
|
|
[qr/GNES$/,'NS'], |
54
|
|
|
|
|
|
|
[qr/^GN/,'N'], |
55
|
|
|
|
|
|
|
[qr/(\w)GN($CONSONANT)/,'1N2'], |
56
|
|
|
|
|
|
|
[qr/^PS/,'S'], |
57
|
|
|
|
|
|
|
[qr/^PT/,'T'], |
58
|
|
|
|
|
|
|
[qr/^CZ/,'C'], |
59
|
|
|
|
|
|
|
[qr/($VOVEL)WZ(\w)/,'1Z2'], |
60
|
|
|
|
|
|
|
[qr/(\w)CZ(\w)/,'1CH2'], |
61
|
|
|
|
|
|
|
[qr/LZ/,'LSH'], |
62
|
|
|
|
|
|
|
[qr/RZ/,'RSH'], |
63
|
|
|
|
|
|
|
[qr/(\w)Z($VOVEL)/,'1S2'], |
64
|
|
|
|
|
|
|
[qr/ZZ/,'TS'], |
65
|
|
|
|
|
|
|
[qr/($CONSONANT)Z(\w)/,'1TS2'], |
66
|
|
|
|
|
|
|
[qr/HROUGH/,'REW'], |
67
|
|
|
|
|
|
|
[qr/OUGH/,'OF'], |
68
|
|
|
|
|
|
|
[qr/($VOVEL)Q($VOVEL)/,'1KW2'], |
69
|
|
|
|
|
|
|
[qr/($VOVEL)J($VOVEL)/,'1Y2'], |
70
|
|
|
|
|
|
|
[qr/^YJ($VOVEL)/,'Y1'], |
71
|
|
|
|
|
|
|
[qr/^GH/,'G'], |
72
|
|
|
|
|
|
|
[qr/($VOVEL)E$/,'1GH'], |
73
|
|
|
|
|
|
|
[qr/^CY/,'S'], |
74
|
|
|
|
|
|
|
[qr/NX/,'NKS'], |
75
|
|
|
|
|
|
|
[qr/^PF/,'F'], |
76
|
|
|
|
|
|
|
[qr/DT$/,'T'], |
77
|
|
|
|
|
|
|
[qr/(T|D)L$/,'1IL'], |
78
|
|
|
|
|
|
|
[qr/YTH/,'ITH'], |
79
|
|
|
|
|
|
|
[qr/^TS?J($VOVEL)/,'CH1'], |
80
|
|
|
|
|
|
|
[qr/^TS($VOVEL)/,'T1'], |
81
|
|
|
|
|
|
|
[qr/TCH/,'CH'], # old che |
82
|
|
|
|
|
|
|
[qr/($VOVEL)WSK/,'1VSIKE'], |
83
|
|
|
|
|
|
|
[qr/^[PM]N($VOVEL)/,'N1'], |
84
|
|
|
|
|
|
|
[qr/($VOVEL)STL/,'1SL'], |
85
|
|
|
|
|
|
|
[qr/TNT$/,'ENT'], |
86
|
|
|
|
|
|
|
[qr/EAUX$/,'OH'], |
87
|
|
|
|
|
|
|
[qr/EXCI/,'ECS'], |
88
|
|
|
|
|
|
|
[qr/X/,'ECS'], |
89
|
|
|
|
|
|
|
[qr/NED$/,'ND'], |
90
|
|
|
|
|
|
|
[qr/JR/,'DR'], |
91
|
|
|
|
|
|
|
[qr/EE$/,'EA'], |
92
|
|
|
|
|
|
|
[qr/ZS/,'S'], |
93
|
|
|
|
|
|
|
[qr/($VOVEL)H?R($CONSONANT)/,'1AH2'], |
94
|
|
|
|
|
|
|
[qr/($VOVEL)HR$/,'1AH'], |
95
|
|
|
|
|
|
|
[qr/RE$/,'AR'], |
96
|
|
|
|
|
|
|
[qr/($VOVEL)R$/,'1AH'], |
97
|
|
|
|
|
|
|
[qr/LLE/,'LE'], |
98
|
|
|
|
|
|
|
[qr/($CONSONANT)LE(S?)$/,'1ILE2'], |
99
|
|
|
|
|
|
|
[qr/E$/,''], |
100
|
|
|
|
|
|
|
[qr/ES$/,'S'], |
101
|
|
|
|
|
|
|
[qr/($VOVEL)SS/,'1AS'], |
102
|
|
|
|
|
|
|
[qr/($VOVEL)MB$/,'1M'], |
103
|
|
|
|
|
|
|
[qr/MPTS/,'MPS'], |
104
|
|
|
|
|
|
|
[qr/MPS/,'MS'], |
105
|
|
|
|
|
|
|
[qr/MPT/,'MT'], |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
); |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
#sub _do_compare { |
110
|
|
|
|
|
|
|
# my $obj = shift; |
111
|
|
|
|
|
|
|
# my $result1 = shift; |
112
|
|
|
|
|
|
|
# my $result2 = shift; |
113
|
|
|
|
|
|
|
# |
114
|
|
|
|
|
|
|
# # Main values are different |
115
|
|
|
|
|
|
|
# return 0 unless ($result1->[0] eq $result2->[0]); |
116
|
|
|
|
|
|
|
# |
117
|
|
|
|
|
|
|
# # Ending values the same |
118
|
|
|
|
|
|
|
# return 75 if ($result1->[1] eq $result2->[1]); |
119
|
|
|
|
|
|
|
# |
120
|
|
|
|
|
|
|
# # Ending values differ in length, and are same for the shorter |
121
|
|
|
|
|
|
|
# my $length1 = length $result1->[1]; |
122
|
|
|
|
|
|
|
# my $length2 = length $result2->[1]; |
123
|
|
|
|
|
|
|
# if ($length1 > $length2 |
124
|
|
|
|
|
|
|
# && $length1 - $length2 == 1) { |
125
|
|
|
|
|
|
|
# return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]); |
126
|
|
|
|
|
|
|
# }elsif ($length2 > $length1 |
127
|
|
|
|
|
|
|
# && $length2 - $length1 == 1) { |
128
|
|
|
|
|
|
|
# return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]); |
129
|
|
|
|
|
|
|
# } |
130
|
|
|
|
|
|
|
# |
131
|
|
|
|
|
|
|
# return 25; |
132
|
|
|
|
|
|
|
#} |
133
|
|
|
|
|
|
|
#The algorithm always returns either a scalar value or an array reference with |
134
|
|
|
|
|
|
|
#two elements. The fist element represents the sound of the name without the |
135
|
|
|
|
|
|
|
#ending sound, and the second element represents the ending sound. To get a |
136
|
|
|
|
|
|
|
#full representation of the name you need to concat the two elements. |
137
|
|
|
|
|
|
|
# |
138
|
|
|
|
|
|
|
#If you want to compare two names the following rules apply: |
139
|
|
|
|
|
|
|
# |
140
|
|
|
|
|
|
|
#=over |
141
|
|
|
|
|
|
|
# |
142
|
|
|
|
|
|
|
#=item * If the ending sound values of an entered name and a retrieved name are |
143
|
|
|
|
|
|
|
#the same, the retrieved name is a LIKELY candidate. |
144
|
|
|
|
|
|
|
# |
145
|
|
|
|
|
|
|
#=item * If an entered name has an ending-sound value, and the retrieved name |
146
|
|
|
|
|
|
|
#does not, then the retrieved name is a LEAST-LIKELY candidate. |
147
|
|
|
|
|
|
|
# |
148
|
|
|
|
|
|
|
#=item * If the two ending-sound values are the same for the length of the |
149
|
|
|
|
|
|
|
#shorter, and the difference in length between the two ending-sound is one |
150
|
|
|
|
|
|
|
#digit only, then the retrieved name isa LESS-LIKELY candidate. |
151
|
|
|
|
|
|
|
# |
152
|
|
|
|
|
|
|
#=item * All other cases result in LEAST-LIKELY candidates. |
153
|
|
|
|
|
|
|
# |
154
|
|
|
|
|
|
|
#=back |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub _do_encode { |
157
|
26
|
|
|
26
|
|
54
|
my ($self,$string) = @_; |
158
|
|
|
|
|
|
|
|
159
|
26
|
|
|
|
|
29
|
my ($original_string, $first_char); |
160
|
26
|
|
|
|
|
38
|
$original_string = $string; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# To uppercase and remove other characters |
163
|
26
|
|
|
|
|
49
|
$string = uc($string); |
164
|
26
|
|
|
|
|
48
|
$string =~ tr/A-Z//cd; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# RULE 1: Replcace rule |
167
|
26
|
|
|
|
|
44
|
foreach my $rule (@RULES) { |
168
|
2028
|
|
|
|
|
2503
|
my $regexp = $rule->[0]; |
169
|
2028
|
|
|
|
|
2197
|
my $replace = $rule->[1]; |
170
|
2028
|
|
|
|
|
3835
|
$string =~ s/$regexp/_replace($replace,$1,$2)/ge; |
|
40
|
|
|
|
|
85
|
|
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# RULE 2: Fetch first character |
174
|
26
|
|
|
|
|
68
|
$first_char = substr($string,0,1,''); |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# RULE 3: Exceptions for first character rule |
177
|
26
|
100
|
100
|
|
|
40
|
if (grep { $first_char eq $_ } qw(A E I O U Y)) { |
|
156
|
100
|
|
|
|
306
|
|
178
|
1
|
|
|
|
|
2
|
$first_char = 'v'; |
179
|
1
|
|
|
|
|
22
|
$string =~ s/^$VOVEL_WITHY//; |
180
|
|
|
|
|
|
|
} elsif ($first_char eq 'W' || $first_char eq 'H') { |
181
|
|
|
|
|
|
|
#$string =~ s/^[WH]//; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# RULE 4 |
185
|
26
|
|
|
|
|
45
|
$string =~ s/ES$/S/; |
186
|
|
|
|
|
|
|
# RULE 5 |
187
|
26
|
|
|
|
|
119
|
$string =~ s/($VOVEL_WITHY)$/$1E/; |
188
|
|
|
|
|
|
|
# RULE 6 |
189
|
|
|
|
|
|
|
#$string =~ s/\w$//; # This rule seems kind of strict |
190
|
|
|
|
|
|
|
# RULE 7-8 |
191
|
|
|
|
|
|
|
# if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) { |
192
|
|
|
|
|
|
|
# # RULE 13 |
193
|
|
|
|
|
|
|
# $last_string = _transform($2); |
194
|
|
|
|
|
|
|
# } |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# RULE 9-11 |
197
|
26
|
|
|
|
|
49
|
$string = _transform($string); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# RULE 12 |
200
|
26
|
|
|
|
|
47
|
$string = $first_char.$string; |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#$string .= $last_string if (defined $last_string); |
203
|
26
|
|
|
|
|
62
|
$string .= '0' x (8-length $string); |
204
|
26
|
|
|
|
|
44
|
$string = substr($string,0,8); |
205
|
|
|
|
|
|
|
|
206
|
26
|
|
|
|
|
113
|
return $string; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub _transform { |
210
|
26
|
|
|
26
|
|
37
|
my $string = shift; |
211
|
26
|
50
|
|
|
|
57
|
return unless defined $string; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# RULE 9 |
214
|
26
|
|
|
|
|
96
|
$string =~ s/([AEIOUYHW])//g; |
215
|
|
|
|
|
|
|
# RULE 10 |
216
|
26
|
|
|
|
|
120
|
$string =~ s/($CONSONANT+)\1/$1/g; |
217
|
|
|
|
|
|
|
# RULE 11 |
218
|
26
|
|
|
|
|
53
|
foreach my $value (@VALUES) { |
219
|
234
|
|
|
|
|
317
|
my $regexp = $value->[0]; |
220
|
234
|
|
|
|
|
534
|
$string =~ s/$regexp/$value->[1]/g; |
221
|
|
|
|
|
|
|
} |
222
|
26
|
|
|
|
|
62
|
return $string; |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub _replace { |
226
|
40
|
|
|
40
|
|
49
|
my $replace = shift; |
227
|
40
|
|
|
|
|
76
|
my $pos1 = shift; |
228
|
40
|
|
|
|
|
49
|
my $pos2 = shift; |
229
|
|
|
|
|
|
|
|
230
|
40
|
100
|
|
|
|
106
|
$replace =~ s/1/$pos1/ if (defined $pos1); |
231
|
40
|
100
|
|
|
|
77
|
$replace =~ s/2/$pos2/ if (defined $pos2); |
232
|
|
|
|
|
|
|
|
233
|
40
|
|
|
|
|
118
|
return $replace; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
1; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=encoding utf8 |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=pod |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
=head1 NAME |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
Text::Phonetic::Phonix - Phonix algorithm |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head1 DESCRIPTION |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix |
249
|
|
|
|
|
|
|
has been incorporated into a number of WAIS implementations, including |
250
|
|
|
|
|
|
|
freeWAIS. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
There seem to be two variants of the Phonix algorithm. One which also includes |
253
|
|
|
|
|
|
|
the first letter in the numeric code, and one that doesn't. This module is |
254
|
|
|
|
|
|
|
using the later variant. |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head1 AUTHOR |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
Maroš Kollár |
259
|
|
|
|
|
|
|
CPAN ID: MAROS |
260
|
|
|
|
|
|
|
maros [at] k-1.com |
261
|
|
|
|
|
|
|
http://www.k-1.com |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head1 COPYRIGHT |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Text::Phonetic::Phonix is Copyright (c) 2006,2007 Maroš. Kollár. |
266
|
|
|
|
|
|
|
All rights reserved. |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
This program is free software; you can redistribute |
269
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
The full text of the license can be found in the |
272
|
|
|
|
|
|
|
LICENSE file included with this module. |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=head1 SEE ALSO |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=cut |