File Coverage

blib/lib/Text/Phonetic/Phonix.pm

Criterion	Covered	Total	%
statement	42	42	100.0
branch	9	10	90.0
condition	3	3	100.0
subroutine	5	5	100.0
pod			n/a
total	59	60	98.3

line	stmt	bran	cond	sub	time	code
1						# ============================================================================
2						package Text::Phonetic::Phonix;
3						# ============================================================================
4	4			4	99179	use utf8;
	4				10
	4				22
5
6	4			4	606	use Moo;
	4				8915
	4				21
7						extends qw(Text::Phonetic);
8
9						__PACKAGE__->meta->make_immutable;
10
11						our $VERSION = $Text::Phonetic::VERSION;
12
13						our $VOVEL = '[AEIOU]';
14						our $VOVEL_WITHY = '[AEIOUY]';
15						our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]';
16
17						our @VALUES = (
18						[qr/[AEIOUHWY]/,0],
19						[qr/[BP]/,1],
20						[qr/[CGJKQ]/,2],
21						[qr/[DT]/,3],
22						[qr/L/,4],
23						[qr/[MN]/,5],
24						[qr/R/,6],
25						[qr/[FV]/,7],
26						[qr/[SXZ]/,8],
27						);
28
29						our @RULES = (
30						[qr/DG/,'G'],
31						[qr/C([OAU])/,'K1'],
32						[qr/C[YI]/,'SI'],
33						[qr/CE/,'SE'],
34						[qr/^CL($VOVEL)/,'KL1'],
35						[qr/CK/,'K'],
36						[qr/[GJ]C$/,'K'],
37						[qr/^CH?R($VOVEL)/,'KR1'],
38						[qr/^WR/,'R'],
39						[qr/NC/,'NK'],
40						[qr/CT/,'KT'],
41						[qr/PH/,'F'],
42						[qr/AA/,'AR'], #neu
43						[qr/SCH/,'SH'],
44						[qr/BTL/,'TL'],
45						[qr/GHT/,'T'],
46						[qr/AUGH/,'ARF'],
47						[qr/($VOVEL)LJ($VOVEL)/,'1LD2'],
48						[qr/LOUGH/,'LOW'],
49						[qr/^Q/,'KW'],
50						[qr/^KN/,'N'],
51						[qr/GN$/,'N'],
52						[qr/GHN/,'N'],
53						[qr/GNE$/,'N'],
54						[qr/GHNE/,'NE'],
55						[qr/GNES$/,'NS'],
56						[qr/^GN/,'N'],
57						[qr/(\w)GN($CONSONANT)/,'1N2'],
58						[qr/^PS/,'S'],
59						[qr/^PT/,'T'],
60						[qr/^CZ/,'C'],
61						[qr/($VOVEL)WZ(\w)/,'1Z2'],
62						[qr/(\w)CZ(\w)/,'1CH2'],
63						[qr/LZ/,'LSH'],
64						[qr/RZ/,'RSH'],
65						[qr/(\w)Z($VOVEL)/,'1S2'],
66						[qr/ZZ/,'TS'],
67						[qr/($CONSONANT)Z(\w)/,'1TS2'],
68						[qr/HROUGH/,'REW'],
69						[qr/OUGH/,'OF'],
70						[qr/($VOVEL)Q($VOVEL)/,'1KW2'],
71						[qr/($VOVEL)J($VOVEL)/,'1Y2'],
72						[qr/^YJ($VOVEL)/,'Y1'],
73						[qr/^GH/,'G'],
74						[qr/($VOVEL)E$/,'1GH'],
75						[qr/^CY/,'S'],
76						[qr/NX/,'NKS'],
77						[qr/^PF/,'F'],
78						[qr/DT$/,'T'],
79						[qr/(T\|D)L$/,'1IL'],
80						[qr/YTH/,'ITH'],
81						[qr/^TS?J($VOVEL)/,'CH1'],
82						[qr/^TS($VOVEL)/,'T1'],
83						[qr/TCH/,'CH'], # old che
84						[qr/($VOVEL)WSK/,'1VSIKE'],
85						[qr/^[PM]N($VOVEL)/,'N1'],
86						[qr/($VOVEL)STL/,'1SL'],
87						[qr/TNT$/,'ENT'],
88						[qr/EAUX$/,'OH'],
89						[qr/EXCI/,'ECS'],
90						[qr/X/,'ECS'],
91						[qr/NED$/,'ND'],
92						[qr/JR/,'DR'],
93						[qr/EE$/,'EA'],
94						[qr/ZS/,'S'],
95						[qr/($VOVEL)H?R($CONSONANT)/,'1AH2'],
96						[qr/($VOVEL)HR$/,'1AH'],
97						[qr/RE$/,'AR'],
98						[qr/($VOVEL)R$/,'1AH'],
99						[qr/LLE/,'LE'],
100						[qr/($CONSONANT)LE(S?)$/,'1ILE2'],
101						[qr/E$/,''],
102						[qr/ES$/,'S'],
103						[qr/($VOVEL)SS/,'1AS'],
104						[qr/($VOVEL)MB$/,'1M'],
105						[qr/MPTS/,'MPS'],
106						[qr/MPS/,'MS'],
107						[qr/MPT/,'MT'],
108
109						);
110
111						#sub _do_compare {
112						# my $obj = shift;
113						# my $result1 = shift;
114						# my $result2 = shift;
115						#
116						# # Main values are different
117						# return 0 unless ($result1->[0] eq $result2->[0]);
118						#
119						# # Ending values the same
120						# return 75 if ($result1->[1] eq $result2->[1]);
121						#
122						# # Ending values differ in length, and are same for the shorter
123						# my $length1 = length $result1->[1];
124						# my $length2 = length $result2->[1];
125						# if ($length1 > $length2
126						# && $length1 - $length2 == 1) {
127						# return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]);
128						# }elsif ($length2 > $length1
129						# && $length2 - $length1 == 1) {
130						# return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]);
131						# }
132						#
133						# return 25;
134						#}
135						#The algorithm always returns either a scalar value or an array reference with
136						#two elements. The fist element represents the sound of the name without the
137						#ending sound, and the second element represents the ending sound. To get a
138						#full representation of the name you need to concat the two elements.
139						#
140						#If you want to compare two names the following rules apply:
141						#
142						#=over
143						#
144						#=item * If the ending sound values of an entered name and a retrieved name are
145						#the same, the retrieved name is a LIKELY candidate.
146						#
147						#=item * If an entered name has an ending-sound value, and the retrieved name
148						#does not, then the retrieved name is a LEAST-LIKELY candidate.
149						#
150						#=item * If the two ending-sound values are the same for the length of the
151						#shorter, and the difference in length between the two ending-sound is one
152						#digit only, then the retrieved name isa LESS-LIKELY candidate.
153						#
154						#=item * All other cases result in LEAST-LIKELY candidates.
155						#
156						#=back
157
158						sub _do_encode {
159	26			26	70	my ($self,$string) = @_;
160
161	26				59	my ($original_string, $first_char);
162	26				44	$original_string = $string;
163
164						# To uppercase and remove other characters
165	26				64	$string = uc($string);
166	26				60	$string =~ tr/A-Z//cd;
167
168						# RULE 1: Replcace rule
169	26				58	foreach my $rule (@RULES) {
170	2028				2941	my $regexp = $rule->[0];
171	2028				2671	my $replace = $rule->[1];
172	2028				4345	$string =~ s/$regexp/_replace($replace,$1,$2)/ge;
	40				97
173						}
174
175						# RULE 2: Fetch first character
176	26				68	$first_char = substr($string,0,1,'');
177
178						# RULE 3: Exceptions for first character rule
179	26	100	100		55	if (grep { $first_char eq $_ } qw(A E I O U Y)) {
	156	100			369
180	1				2	$first_char = 'v';
181	1				23	$string =~ s/^$VOVEL_WITHY//;
182						} elsif ($first_char eq 'W' \|\| $first_char eq 'H') {
183						#$string =~ s/^[WH]//;
184						}
185
186						# RULE 4
187	26				47	$string =~ s/ES$/S/;
188						# RULE 5
189	26				139	$string =~ s/($VOVEL_WITHY)$/$1E/;
190						# RULE 6
191						#$string =~ s/\w$//; # This rule seems kind of strict
192						# RULE 7-8
193						# if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) {
194						# # RULE 13
195						# $last_string = _transform($2);
196						# }
197
198						# RULE 9-11
199	26				55	$string = _transform($string);
200
201						# RULE 12
202	26				65	$string = $first_char.$string;
203
204						#$string .= $last_string if (defined $last_string);
205	26				63	$string .= '0' x (8-length $string);
206	26				50	$string = substr($string,0,8);
207
208	26				94	return $string;
209						}
210
211						sub _transform {
212	26			26	43	my $string = shift;
213	26	50			62	return unless defined $string;
214
215						# RULE 9
216	26				103	$string =~ s/([AEIOUYHW])//g;
217						# RULE 10
218	26				132	$string =~ s/($CONSONANT+)\1/$1/g;
219						# RULE 11
220	26				58	foreach my $value (@VALUES) {
221	234				385	my $regexp = $value->[0];
222	234				634	$string =~ s/$regexp/$value->[1]/g;
223						}
224	26				69	return $string;
225						}
226
227						sub _replace {
228	40			40	61	my $replace = shift;
229	40				79	my $pos1 = shift;
230	40				62	my $pos2 = shift;
231
232	40	100			107	$replace =~ s/1/$pos1/ if (defined $pos1);
233	40	100			89	$replace =~ s/2/$pos2/ if (defined $pos2);
234
235	40				120	return $replace;
236						}
237
238						1;
239
240						=encoding utf8
241
242						=pod
243
244						=head1 NAME
245
246						Text::Phonetic::Phonix - Phonix algorithm
247
248						=head1 DESCRIPTION
249
250						Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix
251						has been incorporated into a number of WAIS implementations, including
252						freeWAIS.
253
254						There seem to be two variants of the Phonix algorithm. One which also includes
255						the first letter in the numeric code, and one that doesn't. This module is
256						using the later variant.
257
258						=head1 AUTHOR
259
260						Maroš Kollár
261						CPAN ID: MAROS
262						maros [at] k-1.com
263						http://www.k-1.com
264
265						=head1 COPYRIGHT
266
267						Text::Phonetic::Phonix is Copyright (c) 2006,2007 Maroš. Kollár.
268						All rights reserved.
269
270						This program is free software; you can redistribute
271						it and/or modify it under the same terms as Perl itself.
272
273						The full text of the license can be found in the
274						LICENSE file included with this module.
275
276						=head1 SEE ALSO
277
278
279						=cut