File Coverage

blib/lib/Data/Kanji/Kanjidic.pm

Criterion	Covered	Total	%
statement	107	177	60.4
branch	37	76	48.6
condition	3	6	50.0
subroutine	12	19	63.1
pod	7	11	63.6
total	166	289	57.4

line	stmt	bran	cond	sub	pod	time	code
1							# See Kanjidic.pod for documentation
2
3							package Data::Kanji::Kanjidic;
4							require Exporter;
5	3			3		177621	use warnings;
	3					26
	3					87
6	3			3		15	use strict;
	3					4
	3					237
7							our @ISA = qw(Exporter);
8							our @EXPORT_OK = qw/parse_kanjidic
9							parse_entry
10							kanji_dictionary_order
11							grade_stroke_order
12							kanjidic_order
13							stroke_radical_jis_order
14							%codes
15							%has_dupes
16							grade
17							/;
18
19							our %EXPORT_TAGS = (
20							all => \@EXPORT_OK,
21							);
22							our $VERSION = '0.16';
23	3			3		17	use strict;
	3					5
	3					63
24	3			3		14	use warnings;
	3					5
	3					82
25	3			3		1471	use Encode;
	3					26595
	3					186
26	3			3		991	use utf8;
	3					28
	3					17
27	3			3		77	use Carp;
	3					5
	3					5803
28
29							our $AUTHOR;
30
31							# Parse one string from kanjidic and return it in an associative array.
32
33							our %codes = (
34							'W' => 'Korean pronunciation',
35							'Y' => 'Pinyin pronunciation',
36							'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
37							'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
38							'U' => 'Unicode code point as a hexadecimal number',
39							'G' => 'Year of elementary school this kanji is taught',
40							'Q' => 'Four-corner code',
41							'S' => 'Stroke count',
42							'P' => 'SKIP code',
43							'J' => 'Japanese proficiency test level',
44							'N' => 'Nelson code from original Nelson dictionary',
45							'V' => 'Nelson code from the "New Nelson" dictionary',
46							'L' => 'Code from "Remembering the Kanji" by James Heisig',
47							'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
48							'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
49							'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
50							'I' => 'The Spahn-Hadamitzky book number',
51							'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
52
53							'MP' => 'Morohashi volume/page',
54							'MN' => 'Morohashi index number',
55							'H' => 'Number in Jack Halpern dictionary',
56							'F' => 'Frequency of kanji',
57
58							'X' => 'Cross reference',
59							'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
60							'DB' => 'Japanese for Busy People textbook numbers',
61							'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
62							'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
63							'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
64							'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
65							'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
66							'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
67							'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
68							'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
69							'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
70							'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
71							'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
72							'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
73							'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
74							'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
75							'XJ' => 'Cross-reference',
76							'XO' => 'Cross-reference',
77							'XH' => 'Cross-reference',
78							'XI' => 'Cross-reference',
79							'XN' => 'Nelson cross-reference',
80							'XDR' => 'De Roo cross-reference',
81							'T' => 'SPECIAL',
82							'ZPP' => 'SKIP misclassification by position',
83							'ZRP' => 'SKIP classification disagreement',
84							'ZSP' => 'SKIP misclassification by stroke count',
85							'ZBP' => 'SKIP misclassification by both stroke count and position',
86							);
87
88							# Fields which are allowed to have duplicates.
89
90							our @dupes = qw/
91							DA
92							O
93							Q
94							S
95							V
96							W
97							XDR
98							XH
99							XJ
100							XN
101							Y
102							ZBP
103							ZPP
104							ZRP
105							ZSP
106							/;
107
108							our %has_dupes;
109
110							@has_dupes{@dupes} = @dupes;
111
112							sub parse_entry
113							{
114	57			57	1	106	my ($input) = @_;
115
116							# Remove the English entries first.
117
118	57					184	my @english;
119							my @onyomi;
120	57					0	my @kunyomi;
121	57					0	my @nanori;
122
123							# Return value
124
125	57					0	my %values;
126
127							# The English-language "meanings" are between { and }.
128
129	57					433	while ($input =~ s/\{([^\}]+)\}//) {
130	165					350	my $meaning = $1;
131
132							# Mark as a "kokuji".
133
134	165	50				281	if ($meaning =~ m/$kokuji$/) {
135	0					0	$values{"kokuji"} = 1;
136							}
137							else {
138	165					880	push (@english, $meaning);
139							}
140							}
141
142	57					489	(my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
143	57					113	$values{kanji} = $kanji;
144							# Flag to detect the start of nanori readings.
145	57					73	my $in_nanori;
146	57					84	foreach my $entry (@entries) {
147	1719					1824	my $found;
148	1719	100				3832	if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
149	1431	100				2220	if ($entry eq 'T1') {
150	30					42	$in_nanori = 1;
151	30					48	next;
152							}
153	1401					1994	my $field = $1;
154	1401					1723	my $value = $2;
155	1401	50				2346	if ($codes{$field}) {
156	1401	100				1986	if ($has_dupes{$field}) {
157	429					487	push @{$values{$field}}, $value;
	429					1216
158							}
159							else {
160	972	50				1356	if (!$values{$field}) {
161	972					1788	$values{$field} = $2;
162							}
163							else {
164	0					0	die "duplicate values for key $field.\n";
165							}
166							}
167	1401					1793	$found = 1;
168							}
169							else {
170							# Unknown field is ignored.
171							}
172
173							# Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
174							# "chouon") characters.
175							}
176							else {
177	288	100				379	if ($in_nanori) {
178	90					132	push @nanori, $entry;
179	90					111	$found = 1;
180							}
181							else {
182	198	100				633	if ($entry =~ m/^([あ-ん\.-]+)$/) {
		50
183	123					193	push @kunyomi, $entry;
184	123					151	$found = 1;
185							}
186							elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
187	75					138	push @onyomi, $entry;
188	75					97	$found = 1;
189							}
190							}
191							}
192	1689	50	33			2865	if ($AUTHOR && ! $found) {
193	0					0	die "kanjidic:$.: Mystery entry \"$entry\"\n";
194							}
195							}
196	57					67	my %morohashi;
197	57	100				105	if ($values{MP}) {
198	54					249	@morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
199							}
200	57	100				158	if ($values{MN}) {
201	54					100	$morohashi{index} = $values{MN};
202							}
203	57	100	66			134	if ($values{MN} \|\| $values{MP}) {
204	54					139	$values{morohashi} = \%morohashi;
205							}
206	57	50				121	if (@english) {
207	57					96	$values{"english"} = \@english;
208							}
209	57	50				100	if (@onyomi) {
210	57					89	$values{"onyomi"} = \@onyomi;
211							}
212	57	50				99	if (@kunyomi) {
213	57					70	$values{"kunyomi"} = \@kunyomi;
214							}
215	57	100				101	if (@nanori) {
216	30					42	$values{"nanori"} = \@nanori;
217							}
218
219							# Kanjidic uses the bogus radical numbers of Nelson rather than
220							# the correct ones.
221
222	57					112	$values{radical} = $values{B};
223	57	100				101	$values{radical} = $values{C} if $values{C};
224
225							# Just in case there is a problem in kanjidic, this will tell us
226							# the line where the problem was:
227
228	57					107	$values{"line_number"} = $.;
229	57					1498	return %values;
230							}
231
232							# Order of kanji in a kanji dictionary.
233
234							sub kanji_dictionary_order
235							{
236	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
237							# print "$a, $b,\n";
238	0					0	my $valuea = $kanjidic_ref->{$a};
239	0					0	my $valueb = $kanjidic_ref->{$b};
240	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
241	0	0				0	return $radval if $radval;
242	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
243	0	0				0	return $strokeval if $strokeval;
244	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
245	0	0				0	return $jisval if $jisval;
246	0					0	return 0;
247							}
248
249							# Order of kanji in a kanji dictionary.
250
251							sub stroke_radical_jis_order
252							{
253	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
254							# print "$a, $b,\n";
255	0					0	my $valuea = $kanjidic_ref->{$a};
256	0					0	my $valueb = $kanjidic_ref->{$b};
257	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
258	0	0				0	return $strokeval if $strokeval;
259	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
260	0	0				0	return $radval if $radval;
261	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
262	0	0				0	return $jisval if $jisval;
263							# They must be the same kanji.
264	0					0	return 0;
265							}
266
267							# Comparison function to sort by grade and then stroke order, then JIS
268							# code value if those are both the same.
269
270							sub grade_stroke_order
271							{
272	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
273							# print "$a, $b,\n";
274	0					0	my $valuea = $kanjidic_ref->{$a};
275	0					0	my $valueb = $kanjidic_ref->{$b};
276	0	0				0	if ($valuea->{G}) {
		0
277	0	0				0	if ($valueb->{G}) {
278	0					0	my $gradeval = $$valuea{G} - $$valueb{G};
279	0	0				0	return $gradeval if $gradeval;
280							}
281							else {
282	0					0	return -1;
283							}
284							}
285							elsif ($valueb->{G}) {
286	0					0	return 1;
287							}
288	0					0	my $strokeval = $$valuea{S} - $$valueb{S};
289	0	0				0	return $strokeval if $strokeval;
290	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
291	0	0				0	return $jisval if $jisval;
292	0					0	return 0;
293							}
294
295							sub parse_kanjidic
296							{
297	3			3	1	706	my ($file_name) = @_;
298	3	50				13	if (! $file_name) {
299	0					0	croak "Please supply a file name";
300							}
301	3					9	my $KANJIDIC;
302
303							my %kanjidic;
304
305	3	50				62	if (! -f $file_name) {
306	0					0	croak "No such file '$file_name'";
307							}
308
309	3	50		3		42	open $KANJIDIC, "<:encoding(euc-jp)", $file_name
	3					6
	3					22
	3					92
310							or die "Could not open '$file_name': $!";
311	3					22250	while (<$KANJIDIC>) {
312							# Skip the comment line.
313	60	100				162	next if ( m/^\#/ );
314	57					114	my %values = parse_entry ($_);
315	57					337	my @skip = split ("-", $values{P});
316	57					119	$values{skip} = \@skip;
317	57					509	$kanjidic{$values{kanji}} = \%values;
318							}
319	3					37	close $KANJIDIC;
320	3					28	return \%kanjidic;
321							}
322
323							sub kanjidic_order
324							{
325	1			1	1	6	my ($kanjidic_ref) = @_;
326							my @kanjidic_order =
327							sort {
328	1					11	hex ($kanjidic_ref->{$a}->{jiscode}) <=>
329							hex ($kanjidic_ref->{$b}->{jiscode})
330	59					128	}
331							keys %$kanjidic_ref;
332	1					4	my $count = 0;
333	1					3	for my $kanji (@kanjidic_order) {
334	19					26	$kanjidic_ref->{$kanji}->{kanji_id} = $count;
335	19					20	$count++;
336							}
337	1					6	return @kanjidic_order;
338							}
339
340							sub new
341							{
342	0			0	0	0	my ($package, $file) = @_;
343	0					0	my $kanjidic = {};
344	0					0	$kanjidic->{file} = $file;
345	0					0	undef $file;
346	0					0	$kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
347	0					0	bless $kanjidic;
348	0					0	return $kanjidic;
349							}
350
351							# Make indices going from each type of key back to the data.
352
353							sub make_indices
354							{
355	0			0	0	0	my ($kanjidic) = @_;
356	0					0	my %indices;
357	0					0	my $data = $kanjidic->{data};
358	0					0	for my $kanji (keys %$data) {
359	0					0	my $kdata = $data->{$kanji};
360	0					0	for my $key (keys %$kdata) {
361	0					0	$indices{$key}{$kdata->{$key}} = $kdata;
362							}
363							}
364	0					0	$kanjidic->{indices} = \%indices;
365							}
366
367							sub find_key
368							{
369	0			0	0	0	my ($kanjidic, $key, $value) = @_;
370	0	0				0	if (! $kanjidic->{indices}) {
371	0					0	make_indices ($kanjidic);
372							}
373	0					0	my $index = $kanjidic->{indices}{$key};
374	0					0	return $index->{$value};
375							}
376
377							sub kanji_to_order
378							{
379	0			0	0	0	my ($kanjidic, $kanji) = @_;
380	0	0				0	if (! $kanjidic->{order}) {
381	0					0	my @order = kanjidic_order ($kanjidic->{data});
382	0					0	my %index;
383	0					0	my $count = 0;
384	0					0	for my $k (@order) {
385	0					0	$index{$k} = $count;
386	0					0	$count++;
387							}
388	0					0	$kanjidic->{order} = \@order;
389	0					0	$kanjidic->{index} = \%index;
390							}
391	0					0	return $kanjidic->{index}->{$kanji};
392							}
393
394							sub grade
395							{
396	1			1	1	7	my ($kanjidic, $grade) = @_;
397	1					2	my @grade_kanjis;
398	1					6	for my $k (keys %$kanjidic) {
399	19					23	my $kgrade = $kanjidic->{$k}->{G};
400	19	100				25	next unless $kgrade;
401	15	100				28	push @grade_kanjis, $k if $kgrade == $grade;
402							}
403	1					4	return \@grade_kanjis;
404							}
405
406							1;
407