File Coverage

blib/lib/Text/Phonetic/Phonix.pm

Criterion	Covered	Total	%
statement	42	42	100.0
branch	9	10	90.0
condition	3	3	100.0
subroutine	5	5	100.0
pod			n/a
total	59	60	98.3

line	stmt	bran	cond	sub	time	code
1						# ============================================================================
2						package Text::Phonetic::Phonix;
3						# ============================================================================
4	4			4	87089	use utf8;
	4				16
	4				21
5
6	4			4	391	use Moo;
	4				7631
	4				18
7						extends qw(Text::Phonetic);
8
9						our $VERSION = $Text::Phonetic::VERSION;
10
11						our $VOVEL = '[AEIOU]';
12						our $VOVEL_WITHY = '[AEIOUY]';
13						our $CONSONANT = '[BCDFGHJLMNPQRSTVXZXY]';
14
15						our @VALUES = (
16						[qr/[AEIOUHWY]/,0],
17						[qr/[BP]/,1],
18						[qr/[CGJKQ]/,2],
19						[qr/[DT]/,3],
20						[qr/L/,4],
21						[qr/[MN]/,5],
22						[qr/R/,6],
23						[qr/[FV]/,7],
24						[qr/[SXZ]/,8],
25						);
26
27						our @RULES = (
28						[qr/DG/,'G'],
29						[qr/C([OAU])/,'K1'],
30						[qr/C[YI]/,'SI'],
31						[qr/CE/,'SE'],
32						[qr/^CL($VOVEL)/,'KL1'],
33						[qr/CK/,'K'],
34						[qr/[GJ]C$/,'K'],
35						[qr/^CH?R($VOVEL)/,'KR1'],
36						[qr/^WR/,'R'],
37						[qr/NC/,'NK'],
38						[qr/CT/,'KT'],
39						[qr/PH/,'F'],
40						[qr/AA/,'AR'], #neu
41						[qr/SCH/,'SH'],
42						[qr/BTL/,'TL'],
43						[qr/GHT/,'T'],
44						[qr/AUGH/,'ARF'],
45						[qr/($VOVEL)LJ($VOVEL)/,'1LD2'],
46						[qr/LOUGH/,'LOW'],
47						[qr/^Q/,'KW'],
48						[qr/^KN/,'N'],
49						[qr/GN$/,'N'],
50						[qr/GHN/,'N'],
51						[qr/GNE$/,'N'],
52						[qr/GHNE/,'NE'],
53						[qr/GNES$/,'NS'],
54						[qr/^GN/,'N'],
55						[qr/(\w)GN($CONSONANT)/,'1N2'],
56						[qr/^PS/,'S'],
57						[qr/^PT/,'T'],
58						[qr/^CZ/,'C'],
59						[qr/($VOVEL)WZ(\w)/,'1Z2'],
60						[qr/(\w)CZ(\w)/,'1CH2'],
61						[qr/LZ/,'LSH'],
62						[qr/RZ/,'RSH'],
63						[qr/(\w)Z($VOVEL)/,'1S2'],
64						[qr/ZZ/,'TS'],
65						[qr/($CONSONANT)Z(\w)/,'1TS2'],
66						[qr/HROUGH/,'REW'],
67						[qr/OUGH/,'OF'],
68						[qr/($VOVEL)Q($VOVEL)/,'1KW2'],
69						[qr/($VOVEL)J($VOVEL)/,'1Y2'],
70						[qr/^YJ($VOVEL)/,'Y1'],
71						[qr/^GH/,'G'],
72						[qr/($VOVEL)E$/,'1GH'],
73						[qr/^CY/,'S'],
74						[qr/NX/,'NKS'],
75						[qr/^PF/,'F'],
76						[qr/DT$/,'T'],
77						[qr/(T\|D)L$/,'1IL'],
78						[qr/YTH/,'ITH'],
79						[qr/^TS?J($VOVEL)/,'CH1'],
80						[qr/^TS($VOVEL)/,'T1'],
81						[qr/TCH/,'CH'], # old che
82						[qr/($VOVEL)WSK/,'1VSIKE'],
83						[qr/^[PM]N($VOVEL)/,'N1'],
84						[qr/($VOVEL)STL/,'1SL'],
85						[qr/TNT$/,'ENT'],
86						[qr/EAUX$/,'OH'],
87						[qr/EXCI/,'ECS'],
88						[qr/X/,'ECS'],
89						[qr/NED$/,'ND'],
90						[qr/JR/,'DR'],
91						[qr/EE$/,'EA'],
92						[qr/ZS/,'S'],
93						[qr/($VOVEL)H?R($CONSONANT)/,'1AH2'],
94						[qr/($VOVEL)HR$/,'1AH'],
95						[qr/RE$/,'AR'],
96						[qr/($VOVEL)R$/,'1AH'],
97						[qr/LLE/,'LE'],
98						[qr/($CONSONANT)LE(S?)$/,'1ILE2'],
99						[qr/E$/,''],
100						[qr/ES$/,'S'],
101						[qr/($VOVEL)SS/,'1AS'],
102						[qr/($VOVEL)MB$/,'1M'],
103						[qr/MPTS/,'MPS'],
104						[qr/MPS/,'MS'],
105						[qr/MPT/,'MT'],
106
107						);
108
109						#sub _do_compare {
110						# my $obj = shift;
111						# my $result1 = shift;
112						# my $result2 = shift;
113						#
114						# # Main values are different
115						# return 0 unless ($result1->[0] eq $result2->[0]);
116						#
117						# # Ending values the same
118						# return 75 if ($result1->[1] eq $result2->[1]);
119						#
120						# # Ending values differ in length, and are same for the shorter
121						# my $length1 = length $result1->[1];
122						# my $length2 = length $result2->[1];
123						# if ($length1 > $length2
124						# && $length1 - $length2 == 1) {
125						# return 50 if (substr($result1->[1],0,$length2) eq $result2->[1]);
126						# }elsif ($length2 > $length1
127						# && $length2 - $length1 == 1) {
128						# return 50 if (substr($result2->[1],0,$length1) eq $result1->[1]);
129						# }
130						#
131						# return 25;
132						#}
133						#The algorithm always returns either a scalar value or an array reference with
134						#two elements. The fist element represents the sound of the name without the
135						#ending sound, and the second element represents the ending sound. To get a
136						#full representation of the name you need to concat the two elements.
137						#
138						#If you want to compare two names the following rules apply:
139						#
140						#=over
141						#
142						#=item * If the ending sound values of an entered name and a retrieved name are
143						#the same, the retrieved name is a LIKELY candidate.
144						#
145						#=item * If an entered name has an ending-sound value, and the retrieved name
146						#does not, then the retrieved name is a LEAST-LIKELY candidate.
147						#
148						#=item * If the two ending-sound values are the same for the length of the
149						#shorter, and the difference in length between the two ending-sound is one
150						#digit only, then the retrieved name isa LESS-LIKELY candidate.
151						#
152						#=item * All other cases result in LEAST-LIKELY candidates.
153						#
154						#=back
155
156						sub _do_encode {
157	26			26	54	my ($self,$string) = @_;
158
159	26				29	my ($original_string, $first_char);
160	26				38	$original_string = $string;
161
162						# To uppercase and remove other characters
163	26				49	$string = uc($string);
164	26				48	$string =~ tr/A-Z//cd;
165
166						# RULE 1: Replcace rule
167	26				44	foreach my $rule (@RULES) {
168	2028				2503	my $regexp = $rule->[0];
169	2028				2197	my $replace = $rule->[1];
170	2028				3835	$string =~ s/$regexp/_replace($replace,$1,$2)/ge;
	40				85
171						}
172
173						# RULE 2: Fetch first character
174	26				68	$first_char = substr($string,0,1,'');
175
176						# RULE 3: Exceptions for first character rule
177	26	100	100		40	if (grep { $first_char eq $_ } qw(A E I O U Y)) {
	156	100			306
178	1				2	$first_char = 'v';
179	1				22	$string =~ s/^$VOVEL_WITHY//;
180						} elsif ($first_char eq 'W' \|\| $first_char eq 'H') {
181						#$string =~ s/^[WH]//;
182						}
183
184						# RULE 4
185	26				45	$string =~ s/ES$/S/;
186						# RULE 5
187	26				119	$string =~ s/($VOVEL_WITHY)$/$1E/;
188						# RULE 6
189						#$string =~ s/\w$//; # This rule seems kind of strict
190						# RULE 7-8
191						# if ($string =~ s/($VOVEL_WITHY)([A-Z]+)$/$2/) {
192						# # RULE 13
193						# $last_string = _transform($2);
194						# }
195
196						# RULE 9-11
197	26				49	$string = _transform($string);
198
199						# RULE 12
200	26				47	$string = $first_char.$string;
201
202						#$string .= $last_string if (defined $last_string);
203	26				62	$string .= '0' x (8-length $string);
204	26				44	$string = substr($string,0,8);
205
206	26				113	return $string;
207						}
208
209						sub _transform {
210	26			26	37	my $string = shift;
211	26	50			57	return unless defined $string;
212
213						# RULE 9
214	26				96	$string =~ s/([AEIOUYHW])//g;
215						# RULE 10
216	26				120	$string =~ s/($CONSONANT+)\1/$1/g;
217						# RULE 11
218	26				53	foreach my $value (@VALUES) {
219	234				317	my $regexp = $value->[0];
220	234				534	$string =~ s/$regexp/$value->[1]/g;
221						}
222	26				62	return $string;
223						}
224
225						sub _replace {
226	40			40	49	my $replace = shift;
227	40				76	my $pos1 = shift;
228	40				49	my $pos2 = shift;
229
230	40	100			106	$replace =~ s/1/$pos1/ if (defined $pos1);
231	40	100			77	$replace =~ s/2/$pos2/ if (defined $pos2);
232
233	40				118	return $replace;
234						}
235
236						1;
237
238						=encoding utf8
239
240						=pod
241
242						=head1 NAME
243
244						Text::Phonetic::Phonix - Phonix algorithm
245
246						=head1 DESCRIPTION
247
248						Phonix is an improved version of Soundex, developed by T.N. Gadd. Phonix
249						has been incorporated into a number of WAIS implementations, including
250						freeWAIS.
251
252						There seem to be two variants of the Phonix algorithm. One which also includes
253						the first letter in the numeric code, and one that doesn't. This module is
254						using the later variant.
255
256						=head1 AUTHOR
257
258						Maroš Kollár
259						CPAN ID: MAROS
260						maros [at] k-1.com
261						http://www.k-1.com
262
263						=head1 COPYRIGHT
264
265						Text::Phonetic::Phonix is Copyright (c) 2006,2007 Maroš. Kollár.
266						All rights reserved.
267
268						This program is free software; you can redistribute
269						it and/or modify it under the same terms as Perl itself.
270
271						The full text of the license can be found in the
272						LICENSE file included with this module.
273
274						=head1 SEE ALSO
275
276
277						=cut