File Coverage

blib/lib/Lingua/JA/Gairaigo/Fuzzy.pm

Criterion	Covered	Total	%
statement	117	131	89.3
branch	55	76	72.3
condition	36	51	70.5
subroutine	13	13	100.0
pod	1	8	12.5
total	222	279	79.5

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Gairaigo::Fuzzy;
2	1			1		12407	use warnings;
	1					1
	1					24
3	1			1		3	use strict;
	1					1
	1					56
4							require Exporter;
5							our @ISA = qw(Exporter);
6							our @EXPORT_OK = qw/same_gairaigo/;
7							our %EXPORT_TAGS = (
8							all => \@EXPORT_OK,
9							);
10							our $VERSION = '0.07';
11	1			1		491	use utf8;
	1					10
	1					3
12
13	1			1		405	use Text::Fuzzy 'fuzzy_index';
	1					906
	1					48
14	1			1		481	use Lingua::JA::Moji 'kana2romaji';
	1					26864
	1					826
15
16							sub same_gairaigo
17							{
18	13			13	1	62	my ($kana, $n, $debug) = @_;
19	13	100				24	if ($kana eq $n) {
20	1					3	return 1;
21							}
22	12	100				22	if (chouon ($kana, $n)) {
23	11					14	my $gotcha = usual_suspect ($kana, $n, $debug);
24	11	100				17	if ($gotcha) {
25	10					31	return 1;
26							}
27							}
28	2					5	return undef;
29							}
30
31							# Check a few likely things
32
33							sub usual_suspect
34							{
35	11			11	0	11	my ($kana, $n, $debug) = @_;
36
37							# The following is an undocumented routine in Text::Fuzzy.
38
39	11					22	my ($dist, $edits) = fuzzy_index ($kana, $n, 1);
40
41							# Is this a likely candidate?
42
43	11					4147	my $gotcha;
44
45	11	100				32	if ($edits =~ /ii\|dd\|rr/) {
46
47							# A double delete, double insertion, or double replace means
48							# this is unlikely to be the same word.
49
50	1	50				4	if ($debug) {
51	0					0	print "$kana has double delete, insert, or replace; rejecting.\n";
52							}
53
54	1					2	return undef;
55							}
56	10					20	my @kana = split //, $kana;
57	10					14	my @nkana = split //, $n;
58	10					17	my @edits = split //, $edits;
59
60	10	50				13	if ($debug) {
61	0					0	printf ("Lengths of a and b: %d %d\n", scalar (@kana), scalar (@nkana));
62							}
63
64							# $i is the offset in @kana, and $j is the offset in @nkana. Note
65							# that @kana and @nkana may have different lengths and the offsets
66							# are adjusted as we look though what edit is necessary to change
67							# "$kana" to "$n".
68
69	10					9	my $i = 0;
70	10					7	my $j = 0;
71
72	10					11	for my $edit (@edits) {
73
74	62	50				60	if ($debug) {
75	0					0	print "Offsets i = $i, j = $j, edit = $edit\n";
76							}
77	62	100				150	if ($edit eq 'r') {
		100
		50
		50
78
79							# Replaced $k with $q.
80
81	6					8	my $k = $kana[$i];
82	6					4	my $q = $nkana[$j];
83	6	50				8	if ($debug) {
84	0					0	print "Replace $k with $q\n";
85							}
86	6	100	66			38	if ($k =~ /[ーィイ]/ && $q =~ /[ーィイ]/) {
87
88							# Check whether the previous kana ends in "e", so it
89							# is something like "ヘイ" and "ヘー".
90
91	3	100				7	if (ends_in_e (\@kana, $i)) {
92	2					3	$gotcha = 1;
93							}
94	3	50	66			20	if (($k eq 'ー' && $q eq 'イ') \|\|
			33
			66
95							($q eq 'ー' && $k eq 'イ')) {
96	3	100				7	if (ends_in_i (\@kana, $i)) {
97	1					1	$gotcha = 1;
98							}
99							}
100							}
101	6	100	100			25	if ($k =~ /[ーッ]/ && $q =~ /[ーッ]/) {
102
103							# A chouon has been replaced with a sokuon, or
104							# vice-versa.
105
106	1					1	$gotcha = 1;
107							}
108	6	100	100			30	if (($k eq 'ー' && $q eq 'ウ') \|\|
			66
			66
109							($q eq 'ー' && $k eq 'ウ')) {
110	2	50				4	if (ends_in_ou (\@kana, $i)) {
111	2					1	$gotcha = 1;
112							}
113							}
114
115							# Whatever we had, increment $i and $j equally because a
116							# character was replaced.
117
118	6					5	$i++;
119	6					8	$j++;
120							}
121							elsif ($edit eq 'd') {
122
123							# Character $k was deleted from $kana to get $n, so we
124							# just increment $i.
125
126	5					6	my $k = $kana[$i];
127	5	50	100			26	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			66
128
129							# A chouon, nakaguro, or sokuon was deleted from $kana
130							# to get $n.
131
132	5					4	$gotcha = 1;
133							}
134							# Check we are not at the end of the string.
135	5	50				9	if ($j < scalar (@kana)) {
136	5					3	my $q = $kana[$j];
137	5	100				15	if ($q =~ /[ーィイ]/) {
138	2	50				5	if (ends_in_e (\@kana, $i)) {
139	0					0	$gotcha = 1;
140							}
141							}
142							}
143	5					7	$i++;
144							}
145							elsif ($edit eq 'i') {
146
147							# Character $k was inserted into $n, so we just increment
148							# $j, not $i.
149
150	0					0	my $k = $nkana[$j];
151	0	0	0			0	if ($k eq 'ー' \|\| $k eq '・' \|\| $k eq 'ッ') {
			0
152
153							# A chouon, nakaguro, or sokuon was inserted into
154							# $kana to get $n.
155
156	0					0	$gotcha = 1;
157							}
158	0					0	$j++;
159							}
160							elsif ($edit eq 'k') {
161
162							# The two strings are the same at this point, so do not do
163							# any checking but just increment the offsets.
164
165	51					29	$i++;
166	51					41	$j++;
167							}
168							}
169
170							# Check we did not make a mistake scanning the two strings.
171
172	10	50				15	if ($i != scalar @kana) {
173	0					0	warn "Bug: Mismatch $i";
174							}
175	10	50				12	if ($j != scalar @nkana) {
176	0					0	warn "Bug: Mismatch $j";
177							}
178	10					24	return $gotcha;
179							}
180
181							# Work out whether the kana before the one at $i ends in "e".
182
183							sub ends_in_e
184							{
185	5			5	0	17	my ($kana_ref, $i) = @_;
186	5					4	my $prev;
187	5	50				8	if ($i >= 1) {
188	5					6	$prev = $kana_ref->[$i - 1];
189	5					12	$prev = kana2romaji ($prev);
190	5	100				1966	if ($prev =~ /e$/) {
191	2					5	return 1;
192							}
193							}
194	3					8	return undef;
195							}
196
197							# Work out whether the kana before the one at $i ends in "ou".
198
199							sub ends_in_ou
200							{
201	2			2	0	3	my ($kana_ref, $i) = @_;
202	2					1	my $prev;
203	2	50				6	if ($i >= 1) {
204	2					3	$prev = $kana_ref->[$i - 1];
205	2					4	$prev = kana2romaji ($prev);
206	2	50				415	if ($prev =~ /[ou]$/) {
207	2					6	return 1;
208							}
209							}
210	0					0	return undef;
211							}
212							# Work out whether the kana before the one at $i ends in "e".
213
214							sub ends_in_i
215							{
216	3			3	0	3	my ($kana_ref, $i) = @_;
217	3					3	my $prev;
218	3	50				6	if ($i >= 1) {
219	3					4	$prev = $kana_ref->[$i - 1];
220	3					5	$prev = kana2romaji ($prev);
221	3	100				590	if ($prev =~ /i$/) {
222	1					3	return 1;
223							}
224							}
225	2					5	return undef;
226							}
227							# Work out whether $x and $y differ in the ways we expect.
228
229							# The name "chouon" is a misnomer.
230
231							sub chouon
232							{
233	12			12	0	11	my ($x, $y) = @_;
234	12					15	my %xa = alph ($x);
235	12					15	my %ya = alph ($y);
236	12					10	my $found;
237	12					19	my $mismatch = check (\%xa, \%ya, \$found);
238	12	100				16	if ($mismatch) {
239	1					3	return undef;
240							}
241	11					14	$mismatch = check (\%ya, \%xa, \$found);
242	11	50				16	if ($mismatch) {
243	0					0	return undef;
244							}
245	11	50				13	if ($found) {
246	11					32	return 1;
247							}
248	0					0	return undef;
249							}
250
251							# Given a word $x, make an alphabet of its consituent letters.
252
253							sub alph
254							{
255	24			24	0	18	my ($x) = @_;
256	24					15	my %xa;
257	24					49	my @xl = split //, $x;
258	24					78	@xa{@xl} = @xl;
259	24					98	return %xa;
260							}
261
262							# Go through the keys of $ya, and check whether the keys which are not
263							# in $xa are the right kind of keys.
264
265							sub check
266							{
267	23			23	0	18	my ($xa, $ya, $found) = @_;
268	23					15	my $ok;
269	23					37	for my $k (keys %$ya) {
270	128	100				153	next if $xa->{$k};
271	18	100	100			98	if ($k eq 'ー' \|\|
			66
			100
			100
			100
272							$k eq 'イ' \|\|
273							$k eq 'ィ' \|\|
274							$k eq '・' \|\|
275							$k eq 'ッ' \|\|
276							$k eq 'ウ') {
277	17					11	$ok = 1;
278	17					16	next;
279							}
280	1					3	return $k;
281							}
282	22	100				32	if ($ok) {
283	16					12	$$found = $ok;
284							}
285	22					22	return;
286							}
287
288							1;