File Coverage

blib/lib/Lingua/JA/Gairaigo/Fuzzy.pm

Criterion	Covered	Total	%
statement	117	131	89.3
branch	55	76	72.3
condition	36	51	70.5
subroutine	13	13	100.0
pod	1	8	12.5
total	222	279	79.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Gairaigo::Fuzzy;
2	1			1		55205	use warnings;
	1					2
	1					38
3	1			1		6	use strict;
	1					3
	1					82
4							require Exporter;
5							our @ISA = qw(Exporter);
6							our @EXPORT_OK = qw/same_gairaigo/;
7							our %EXPORT_TAGS = (
8							all => \@EXPORT_OK,
9							);
10							our $VERSION = '0.08';
11	1			1		514	use utf8;
	1					17
	1					7
12
13	1			1		509	use Text::Fuzzy 'fuzzy_index';
	1					1511
	1					84
14	1			1		495	use Lingua::JA::Moji 'kana2romaji';
	1					62177
	1					1769
15
16							sub same_gairaigo
17							{
18	13			13	1	927	my ($kana, $n, $debug) = @_;
19	13	100				57	if ($kana eq $n) {
20	1					6	return 1;
21							}
22	12	100				40	if (chouon ($kana, $n)) {
23	11					43	my $gotcha = usual_suspect ($kana, $n, $debug);
24	11	100				39	if ($gotcha) {
25	10					89	return 1;
26							}
27							}
28	2					14	return undef;
29							}
30
31							# Check a few likely things
32
33							sub usual_suspect
34							{
35	11			11	0	34	my ($kana, $n, $debug) = @_;
36
37							# The following is an undocumented routine in Text::Fuzzy.
38
39	11					53	my ($dist, $edits) = fuzzy_index ($kana, $n, 1);
40
41							# Is this a likely candidate?
42
43	11					14750	my $gotcha;
44
45	11	100				84	if ($edits =~ /ii\|dd\|rr/) {
46
47							# A double delete, double insertion, or double replace means
48							# this is unlikely to be the same word.
49
50	1	50				7	if ($debug) {
51	0					0	print "$kana has double delete, insert, or replace; rejecting.\n";
52							}
53
54	1					5	return undef;
55							}
56	10					58	my @kana = split //, $kana;
57	10					73	my @nkana = split //, $n;
58	10					40	my @edits = split //, $edits;
59
60	10	50				36	if ($debug) {
61	0					0	printf ("Lengths of a and b: %d %d\n", scalar (@kana), scalar (@nkana));
62							}
63
64							# $i is the offset in @kana, and $j is the offset in @nkana. Note
65							# that @kana and @nkana may have different lengths and the offsets
66							# are adjusted as we look though what edit is necessary to change
67							# "$kana" to "$n".
68
69	10					27	my $i = 0;
70	10					21	my $j = 0;
71
72	10					33	for my $edit (@edits) {
73
74	62	50				154	if ($debug) {
75	0					0	print "Offsets i = $i, j = $j, edit = $edit\n";
76							}
77	62	100				252	if ($edit eq 'r') {
		100
		50
		50
78
79							# Replaced $k with $q.
80
81	6					21	my $k = $kana[$i];
82	6					16	my $q = $nkana[$j];
83	6	50				17	if ($debug) {
84	0					0	print "Replace $k with $q\n";
85							}
86	6	100	66			105	if ($k =~ /[ーィイ]/ && $q =~ /[ーィイ]/) {
87
88							# Check whether the previous kana ends in "e", so it
89							# is something like "ヘイ" and "ヘー".
90
91	3	100				17	if (ends_in_e (\@kana, $i)) {
92	2					6	$gotcha = 1;
93							}
94	3	50	66			37	if (($k eq 'ー' && $q eq 'イ') \|\|
			33
			66
95							($q eq 'ー' && $k eq 'イ')) {
96	3	100				15	if (ends_in_i (\@kana, $i)) {
97	1					4	$gotcha = 1;
98							}
99							}
100							}
101	6	100	100			56	if ($k =~ /[ーッ]/ && $q =~ /[ーッ]/) {
102
103							# A chouon has been replaced with a sokuon, or
104							# vice-versa.
105
106	1					4	$gotcha = 1;
107							}
108	6	100	100			60	if (($k eq 'ー' && $q eq 'ウ') \|\|
			66
			66
109							($q eq 'ー' && $k eq 'ウ')) {
110	2	50				10	if (ends_in_ou (\@kana, $i)) {
111	2					5	$gotcha = 1;
112							}
113							}
114
115							# Whatever we had, increment $i and $j equally because a
116							# character was replaced.
117
118	6					17	$i++;
119	6					18	$j++;
120							}
121							elsif ($edit eq 'd') {
122
123							# Character $k was deleted from $kana to get $n, so we
124							# just increment $i.
125
126	5					19	my $k = $kana[$i];
127	5	50	100			39	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			66
128
129							# A chouon, nakaguro, or sokuon was deleted from $kana
130							# to get $n.
131
132	5					13	$gotcha = 1;
133							}
134							# Check we are not at the end of the string.
135	5	50				20	if ($j < scalar (@kana)) {
136	5					14	my $q = $kana[$j];
137	5	100				39	if ($q =~ /[ーィイ]/) {
138	2	50				9	if (ends_in_e (\@kana, $i)) {
139	0					0	$gotcha = 1;
140							}
141							}
142							}
143	5					16	$i++;
144							}
145							elsif ($edit eq 'i') {
146
147							# Character $k was inserted into $n, so we just increment
148							# $j, not $i.
149
150	0					0	my $k = $nkana[$j];
151	0	0	0			0	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			0
152
153							# A chouon, nakaguro, or sokuon was inserted into
154							# $kana to get $n.
155
156	0					0	$gotcha = 1;
157							}
158	0					0	$j++;
159							}
160							elsif ($edit eq 'k') {
161
162							# The two strings are the same at this point, so do not do
163							# any checking but just increment the offsets.
164
165	51					102	$i++;
166	51					107	$j++;
167							}
168							}
169
170							# Check we did not make a mistake scanning the two strings.
171
172	10	50				40	if ($i != scalar @kana) {
173	0					0	warn "Bug: Mismatch $i";
174							}
175	10	50				38	if ($j != scalar @nkana) {
176	0					0	warn "Bug: Mismatch $j";
177							}
178	10					61	return $gotcha;
179							}
180
181							# Work out whether the kana before the one at $i ends in "e".
182
183							sub ends_in_e
184							{
185	5			5	0	20	my ($kana_ref, $i) = @_;
186	5					15	my $prev;
187	5	50				20	if ($i >= 1) {
188	5					17	$prev = $kana_ref->[$i - 1];
189	5					28	$prev = kana2romaji ($prev);
190	5	100				4630	if ($prev =~ /e$/) {
191	2					14	return 1;
192							}
193							}
194	3					17	return undef;
195							}
196
197							# Work out whether the kana before the one at $i ends in "ou".
198
199							sub ends_in_ou
200							{
201	2			2	0	6	my ($kana_ref, $i) = @_;
202	2					6	my $prev;
203	2	50				9	if ($i >= 1) {
204	2					7	$prev = $kana_ref->[$i - 1];
205	2					9	$prev = kana2romaji ($prev);
206	2	50				1113	if ($prev =~ /[ou]$/) {
207	2					11	return 1;
208							}
209							}
210	0					0	return undef;
211							}
212							# Work out whether the kana before the one at $i ends in "e".
213
214							sub ends_in_i
215							{
216	3			3	0	13	my ($kana_ref, $i) = @_;
217	3					7	my $prev;
218	3	50				24	if ($i >= 1) {
219	3					13	$prev = $kana_ref->[$i - 1];
220	3					14	$prev = kana2romaji ($prev);
221	3	100				1745	if ($prev =~ /i$/) {
222	1					8	return 1;
223							}
224							}
225	2					11	return undef;
226							}
227							# Work out whether $x and $y differ in the ways we expect.
228
229							# The name "chouon" is a misnomer.
230
231							sub chouon
232							{
233	12			12	0	33	my ($x, $y) = @_;
234	12					51	my %xa = alph ($x);
235	12					46	my %ya = alph ($y);
236	12					46	my $found;
237	12					46	my $mismatch = check (\%xa, \%ya, \$found);
238	12	100				42	if ($mismatch) {
239	1					7	return undef;
240							}
241	11					35	$mismatch = check (\%ya, \%xa, \$found);
242	11	50				42	if ($mismatch) {
243	0					0	return undef;
244							}
245	11	50				33	if ($found) {
246	11					74	return 1;
247							}
248	0					0	return undef;
249							}
250
251							# Given a word $x, make an alphabet of its consituent letters.
252
253							sub alph
254							{
255	24			24	0	67	my ($x) = @_;
256	24					50	my %xa;
257	24					125	my @xl = split //, $x;
258	24					166	@xa{@xl} = @xl;
259	24					216	return %xa;
260							}
261
262							# Go through the keys of $ya, and check whether the keys which are not
263							# in $xa are the right kind of keys.
264
265							sub check
266							{
267	23			23	0	70	my ($xa, $ya, $found) = @_;
268	23					52	my $ok;
269	23					137	for my $k (keys %$ya) {
270	129	100				355	next if $xa->{$k};
271	18	100	100			148	if ($k eq 'ー' \|\|
			66
			100
			100
			100
272							$k eq 'イ' \|\|
273							$k eq 'ィ' \|\|
274							$k eq '・' \|\|
275							$k eq 'ッ' \|\|
276							$k eq 'ウ') {
277	17					41	$ok = 1;
278	17					46	next;
279							}
280	1					8	return $k;
281							}
282	22	100				69	if ($ok) {
283	16					39	$$found = $ok;
284							}
285	22					65	return;
286							}
287
288							1;