File Coverage

blib/lib/Data/Kanji/Kanjidic.pm

Criterion	Covered	Total	%
statement	104	178	58.4
branch	37	76	48.6
condition	3	6	50.0
subroutine	12	19	63.1
pod	7	11	63.6
total	163	290	56.2

line	stmt	bran	cond	sub	pod	time	code
1							# See Kanjidic.pod for documentation
2
3							package Data::Kanji::Kanjidic;
4							require Exporter;
5	3			3		45407	use warnings;
	3					6
	3					86
6	3			3		9	use strict;
	3					3
	3					209
7							our @ISA = qw(Exporter);
8							our @EXPORT_OK = qw/parse_kanjidic
9							parse_entry
10							kanji_dictionary_order
11							grade_stroke_order
12							kanjidic_order
13							stroke_radical_jis_order
14							%codes
15							%has_dupes
16							grade
17							/;
18
19							our %EXPORT_TAGS = (
20							all => \@EXPORT_OK,
21							);
22							our $VERSION = '0.15';
23	3			3		9	use strict;
	3					7
	3					46
24	3			3		10	use warnings;
	3					3
	3					78
25	3			3		1426	use Encode;
	3					21464
	3					185
26	3			3		923	use utf8;
	3					18
	3					11
27	3			3		65	use Carp;
	3					3
	3					5658
28
29							our $AUTHOR;
30
31							# Parse one string from kanjidic and return it in an associative array.
32
33							our %codes = (
34							'W' => 'Korean pronunciation',
35							'Y' => 'Pinyin pronunciation',
36							'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
37							'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
38							'U' => 'Unicode code point as a hexadecimal number',
39							'G' => 'Year of elementary school this kanji is taught',
40							'Q' => 'Four-corner code',
41							'S' => 'Stroke count',
42							'P' => 'SKIP code',
43							'J' => 'Japanese proficiency test level',
44							'N' => 'Nelson code from original Nelson dictionary',
45							'V' => 'Nelson code from the "New Nelson" dictionary',
46							'L' => 'Code from "Remembering the Kanji" by James Heisig',
47							'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
48							'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
49							'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
50							'I' => 'The Spahn-Hadamitzky book number',
51							'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
52
53							'MP' => 'Morohashi volume/page',
54							'MN' => 'Morohashi index number',
55							'H' => 'Number in Jack Halpern dictionary',
56							'F' => 'Frequency of kanji',
57
58							'X' => 'Cross reference',
59							'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
60							'DB' => 'Japanese for Busy People textbook numbers',
61							'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
62							'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
63							'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
64							'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
65							'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
66							'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
67							'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
68							'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
69							'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
70							'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
71							'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
72							'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
73							'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
74							'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
75							'XJ' => 'Cross-reference',
76							'XO' => 'Cross-reference',
77							'XH' => 'Cross-reference',
78							'XI' => 'Cross-reference',
79							'XN' => 'Nelson cross-reference',
80							'XDR' => 'De Roo cross-reference',
81							'T' => 'SPECIAL',
82							'ZPP' => 'SKIP misclassification by position',
83							'ZRP' => 'SKIP classification disagreement',
84							'ZSP' => 'SKIP misclassification by stroke count',
85							'ZBP' => 'SKIP misclassification by both stroke count and position',
86							);
87
88							# Fields which are allowed to have duplicates.
89
90							our @dupes = qw/
91							DA
92							O
93							Q
94							S
95							V
96							W
97							XDR
98							XH
99							XJ
100							XN
101							Y
102							ZBP
103							ZPP
104							ZRP
105							ZSP
106							/;
107
108							our %has_dupes;
109
110							@has_dupes{@dupes} = @dupes;
111
112							sub parse_entry
113							{
114	57			57	1	58	my ($input) = @_;
115
116							# Remove the English entries first.
117
118	57					46	my $counter;
119							my @english;
120	0					0	my @onyomi;
121	0					0	my @kunyomi;
122	0					0	my @nanori;
123
124							# Return value
125
126	0					0	my %values;
127
128							# The English-language "meanings" are between { and }.
129
130	57					361	while ($input =~ s/\{([^\}]+)\}//) {
131	165					194	my $meaning = $1;
132
133							# Mark as a "kokuji".
134
135	165	50				208	if ($meaning =~ m/$kokuji$/) {
136	0					0	$values{"kokuji"} = 1;
137							}
138							else {
139	165					719	push (@english, $meaning);
140							}
141							}
142
143	57					618	(my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
144	57					171	$values{kanji} = $kanji;
145							# Flag to detect the start of nanori readings.
146	57					28	my $in_nanori;
147	57					69	foreach my $entry (@entries) {
148	1719					943	my $found;
149	1719	100				2600	if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
150	1431	100				1619	if ($entry eq 'T1') {
151	30					22	$in_nanori = 1;
152	30					31	next;
153							}
154	1401					1133	my $field = $1;
155	1401					941	my $value = $2;
156	1401	50				1535	if ($codes{$field}) {
157	1401	100				1378	if ($has_dupes{$field}) {
158	429					242	push @{$values{$field}}, $value;
	429					705
159							}
160							else {
161	972	50				921	if (!$values{$field}) {
162	972					1162	$values{$field} = $2;
163							}
164							else {
165	0					0	die "duplicate values for key $field.\n";
166							}
167							}
168	1401					999	$found = 1;
169							}
170							else {
171							# Unknown field is ignored.
172							}
173
174							# Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
175							# "chouon") characters.
176							}
177							else {
178	288	100				280	if ($in_nanori) {
179	90					85	push @nanori, $entry;
180	90					66	$found = 1;
181							}
182							else {
183	198	100				514	if ($entry =~ m/^([あ-ん\.-]+)$/) {
		50
184	123					123	push @kunyomi, $entry;
185	123					94	$found = 1;
186							}
187							elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
188	75					83	push @onyomi, $entry;
189	75					66	$found = 1;
190							}
191							}
192							}
193	1689	50	33			2378	if ($AUTHOR && ! $found) {
194	0					0	die "kanjidic:$.: Mystery entry \"$entry\"\n";
195							}
196							}
197	57					38	my %morohashi;
198	57	100				84	if ($values{MP}) {
199	54					213	@morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
200							}
201	57	100				93	if ($values{MN}) {
202	54					62	$morohashi{index} = $values{MN};
203							}
204	57	100	66			104	if ($values{MN} \|\| $values{MP}) {
205	54					62	$values{morohashi} = \%morohashi;
206							}
207	57	50				75	if (@english) {
208	57					53	$values{"english"} = \@english;
209							}
210	57	50				79	if (@onyomi) {
211	57					48	$values{"onyomi"} = \@onyomi;
212							}
213	57	50				66	if (@kunyomi) {
214	57					52	$values{"kunyomi"} = \@kunyomi;
215							}
216	57	100				68	if (@nanori) {
217	30					28	$values{"nanori"} = \@nanori;
218							}
219
220							# Kanjidic uses the bogus radical numbers of Nelson rather than
221							# the correct ones.
222
223	57					75	$values{radical} = $values{B};
224	57	100				78	$values{radical} = $values{C} if $values{C};
225
226							# Just in case there is a problem in kanjidic, this will tell us
227							# the line where the problem was:
228
229	57					71	$values{"line_number"} = $.;
230	57					1264	return %values;
231							}
232
233							# Order of kanji in a kanji dictionary.
234
235							sub kanji_dictionary_order
236							{
237	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
238							# print "$a, $b,\n";
239	0					0	my $valuea = $kanjidic_ref->{$a};
240	0					0	my $valueb = $kanjidic_ref->{$b};
241	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
242	0	0				0	return $radval if $radval;
243	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
244	0	0				0	return $strokeval if $strokeval;
245	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
246	0	0				0	return $jisval if $jisval;
247	0					0	return 0;
248							}
249
250							# Order of kanji in a kanji dictionary.
251
252							sub stroke_radical_jis_order
253							{
254	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
255							# print "$a, $b,\n";
256	0					0	my $valuea = $kanjidic_ref->{$a};
257	0					0	my $valueb = $kanjidic_ref->{$b};
258	0					0	my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
259	0	0				0	return $strokeval if $strokeval;
260	0					0	my $radval = $$valuea{radical} - $$valueb{radical};
261	0	0				0	return $radval if $radval;
262	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
263	0	0				0	return $jisval if $jisval;
264							# They must be the same kanji.
265	0					0	return 0;
266							}
267
268							# Comparison function to sort by grade and then stroke order, then JIS
269							# code value if those are both the same.
270
271							sub grade_stroke_order
272							{
273	0			0	1	0	my ($kanjidic_ref, $a, $b) = @_;
274							# print "$a, $b,\n";
275	0					0	my $valuea = $kanjidic_ref->{$a};
276	0					0	my $valueb = $kanjidic_ref->{$b};
277	0	0				0	if ($valuea->{G}) {
		0
278	0	0				0	if ($valueb->{G}) {
279	0					0	my $gradeval = $$valuea{G} - $$valueb{G};
280	0	0				0	return $gradeval if $gradeval;
281							}
282							else {
283	0					0	return -1;
284							}
285							}
286							elsif ($valueb->{G}) {
287	0					0	return 1;
288							}
289	0					0	my $strokeval = $$valuea{S} - $$valueb{S};
290	0	0				0	return $strokeval if $strokeval;
291	0					0	my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
292	0	0				0	return $jisval if $jisval;
293	0					0	return 0;
294							}
295
296							sub parse_kanjidic
297							{
298	3			3	1	71	my ($file_name) = @_;
299	3	50				8	if (! $file_name) {
300	0					0	croak "Please supply a file name";
301							}
302	3					4	my $KANJIDIC;
303
304							my %kanjidic;
305
306	3	50				96	if (! -f $file_name) {
307	0					0	croak "No such file '$file_name'";
308							}
309
310	3	50		3		23	open $KANJIDIC, "<:encoding(euc-jp)", $file_name
	3					3
	3					21
	3					120
311							or die "Could not open '$file_name': $!";
312	3					21210	while (<$KANJIDIC>) {
313							# Skip the comment line.
314	60	100				114	next if ( m/^\#/ );
315	57					80	my %values = parse_entry ($_);
316	57					256	my @skip = split ("-", $values{P});
317	57					68	$values{skip} = \@skip;
318	57					472	$kanjidic{$values{kanji}} = \%values;
319							}
320	3					97	close $KANJIDIC;
321	3					28	return \%kanjidic;
322							}
323
324							sub kanjidic_order
325							{
326	1			1	1	5	my ($kanjidic_ref) = @_;
327							my @kanjidic_order =
328							sort {
329	1					9	hex ($kanjidic_ref->{$a}->{jiscode}) <=>
330							hex ($kanjidic_ref->{$b}->{jiscode})
331	63					81	}
332							keys %$kanjidic_ref;
333	1					3	my $count = 0;
334	1					2	for my $kanji (@kanjidic_order) {
335	19					17	$kanjidic_ref->{$kanji}->{kanji_id} = $count;
336	19					11	$count++;
337							}
338	1					6	return @kanjidic_order;
339							}
340
341							sub new
342							{
343	0			0	0	0	my ($package, $file) = @_;
344	0					0	my $kanjidic = {};
345	0					0	$kanjidic->{file} = $file;
346	0					0	undef $file;
347	0					0	$kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
348	0					0	bless $kanjidic;
349	0					0	return $kanjidic;
350							}
351
352							# Make indices going from each type of key back to the data.
353
354							sub make_indices
355							{
356	0			0	0	0	my ($kanjidic) = @_;
357	0					0	my %indices;
358	0					0	my $data = $kanjidic->{data};
359	0					0	for my $kanji (keys %$data) {
360	0					0	my $kdata = $data->{$kanji};
361	0					0	for my $key (keys %$kdata) {
362	0					0	$indices{$key}{$kdata->{$key}} = $kdata;
363							}
364							}
365	0					0	$kanjidic->{indices} = \%indices;
366							}
367
368							sub find_key
369							{
370	0			0	0	0	my ($kanjidic, $key, $value) = @_;
371	0	0				0	if (! $kanjidic->{indices}) {
372	0					0	make_indices ($kanjidic);
373							}
374	0					0	my $index = $kanjidic->{indices}{$key};
375	0					0	return $index->{$value};
376							}
377
378							sub kanji_to_order
379							{
380	0			0	0	0	my ($kanjidic, $kanji) = @_;
381	0	0				0	if (! $kanjidic->{order}) {
382	0					0	my @order = kanjidic_order ($kanjidic->{data});
383	0					0	my %index;
384	0					0	my $count = 0;
385	0					0	for my $k (@order) {
386	0					0	$index{$k} = $count;
387	0					0	$count++;
388							}
389	0					0	$kanjidic->{order} = \@order;
390	0					0	$kanjidic->{index} = \%index;
391							}
392	0					0	return $kanjidic->{index}->{$kanji};
393							}
394
395							sub grade
396							{
397	1			1	1	5	my ($kanjidic, $grade) = @_;
398	1					1	my @grade_kanjis;
399	1					5	for my $k (keys %$kanjidic) {
400	19					16	my $kgrade = $kanjidic->{$k}->{G};
401	19	100				23	next unless $kgrade;
402	15	100				23	push @grade_kanjis, $k if $kgrade == $grade;
403							}
404	1					3	return \@grade_kanjis;
405							}
406
407							1;
408