File Coverage

blib/lib/Data/Kanji/Kanjidic.pm
Criterion Covered Total %
statement 107 177 60.4
branch 37 76 48.6
condition 3 6 50.0
subroutine 12 19 63.1
pod 7 11 63.6
total 166 289 57.4


line stmt bran cond sub pod time code
1             # See Kanjidic.pod for documentation
2              
3             package Data::Kanji::Kanjidic;
4             require Exporter;
5 3     3   177621 use warnings;
  3         26  
  3         87  
6 3     3   15 use strict;
  3         4  
  3         237  
7             our @ISA = qw(Exporter);
8             our @EXPORT_OK = qw/parse_kanjidic
9             parse_entry
10             kanji_dictionary_order
11             grade_stroke_order
12             kanjidic_order
13             stroke_radical_jis_order
14             %codes
15             %has_dupes
16             grade
17             /;
18              
19             our %EXPORT_TAGS = (
20             all => \@EXPORT_OK,
21             );
22             our $VERSION = '0.16';
23 3     3   17 use strict;
  3         5  
  3         63  
24 3     3   14 use warnings;
  3         5  
  3         82  
25 3     3   1471 use Encode;
  3         26595  
  3         186  
26 3     3   991 use utf8;
  3         28  
  3         17  
27 3     3   77 use Carp;
  3         5  
  3         5803  
28              
29             our $AUTHOR;
30              
31             # Parse one string from kanjidic and return it in an associative array.
32              
33             our %codes = (
34             'W' => 'Korean pronunciation',
35             'Y' => 'Pinyin pronunciation',
36             'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
37             'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
38             'U' => 'Unicode code point as a hexadecimal number',
39             'G' => 'Year of elementary school this kanji is taught',
40             'Q' => 'Four-corner code',
41             'S' => 'Stroke count',
42             'P' => 'SKIP code',
43             'J' => 'Japanese proficiency test level',
44             'N' => 'Nelson code from original Nelson dictionary',
45             'V' => 'Nelson code from the "New Nelson" dictionary',
46             'L' => 'Code from "Remembering the Kanji" by James Heisig',
47             'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
48             'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
49             'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
50             'I' => 'The Spahn-Hadamitzky book number',
51             'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
52              
53             'MP' => 'Morohashi volume/page',
54             'MN' => 'Morohashi index number',
55             'H' => 'Number in Jack Halpern dictionary',
56             'F' => 'Frequency of kanji',
57              
58             'X' => 'Cross reference',
59             'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
60             'DB' => 'Japanese for Busy People textbook numbers',
61             'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
62             'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
63             'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
64             'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
65             'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
66             'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
67             'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
68             'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
69             'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
70             'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
71             'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
72             'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
73             'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
74             'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
75             'XJ' => 'Cross-reference',
76             'XO' => 'Cross-reference',
77             'XH' => 'Cross-reference',
78             'XI' => 'Cross-reference',
79             'XN' => 'Nelson cross-reference',
80             'XDR' => 'De Roo cross-reference',
81             'T' => 'SPECIAL',
82             'ZPP' => 'SKIP misclassification by position',
83             'ZRP' => 'SKIP classification disagreement',
84             'ZSP' => 'SKIP misclassification by stroke count',
85             'ZBP' => 'SKIP misclassification by both stroke count and position',
86             );
87              
88             # Fields which are allowed to have duplicates.
89              
90             our @dupes = qw/
91             DA
92             O
93             Q
94             S
95             V
96             W
97             XDR
98             XH
99             XJ
100             XN
101             Y
102             ZBP
103             ZPP
104             ZRP
105             ZSP
106             /;
107              
108             our %has_dupes;
109              
110             @has_dupes{@dupes} = @dupes;
111              
112             sub parse_entry
113             {
114 57     57 1 106 my ($input) = @_;
115              
116             # Remove the English entries first.
117              
118 57         184 my @english;
119             my @onyomi;
120 57         0 my @kunyomi;
121 57         0 my @nanori;
122              
123             # Return value
124              
125 57         0 my %values;
126              
127             # The English-language "meanings" are between { and }.
128              
129 57         433 while ($input =~ s/\{([^\}]+)\}//) {
130 165         350 my $meaning = $1;
131              
132             # Mark as a "kokuji".
133              
134 165 50       281 if ($meaning =~ m/\(kokuji\)/) {
135 0         0 $values{"kokuji"} = 1;
136             }
137             else {
138 165         880 push (@english, $meaning);
139             }
140             }
141              
142 57         489 (my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
143 57         113 $values{kanji} = $kanji;
144             # Flag to detect the start of nanori readings.
145 57         73 my $in_nanori;
146 57         84 foreach my $entry (@entries) {
147 1719         1824 my $found;
148 1719 100       3832 if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
149 1431 100       2220 if ($entry eq 'T1') {
150 30         42 $in_nanori = 1;
151 30         48 next;
152             }
153 1401         1994 my $field = $1;
154 1401         1723 my $value = $2;
155 1401 50       2346 if ($codes{$field}) {
156 1401 100       1986 if ($has_dupes{$field}) {
157 429         487 push @{$values{$field}}, $value;
  429         1216  
158             }
159             else {
160 972 50       1356 if (!$values{$field}) {
161 972         1788 $values{$field} = $2;
162             }
163             else {
164 0         0 die "duplicate values for key $field.\n";
165             }
166             }
167 1401         1793 $found = 1;
168             }
169             else {
170             # Unknown field is ignored.
171             }
172              
173             # Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
174             # "chouon") characters.
175             }
176             else {
177 288 100       379 if ($in_nanori) {
178 90         132 push @nanori, $entry;
179 90         111 $found = 1;
180             }
181             else {
182 198 100       633 if ($entry =~ m/^([あ-ん\.-]+)$/) {
    50          
183 123         193 push @kunyomi, $entry;
184 123         151 $found = 1;
185             }
186             elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
187 75         138 push @onyomi, $entry;
188 75         97 $found = 1;
189             }
190             }
191             }
192 1689 50 33     2865 if ($AUTHOR && ! $found) {
193 0         0 die "kanjidic:$.: Mystery entry \"$entry\"\n";
194             }
195             }
196 57         67 my %morohashi;
197 57 100       105 if ($values{MP}) {
198 54         249 @morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
199             }
200 57 100       158 if ($values{MN}) {
201 54         100 $morohashi{index} = $values{MN};
202             }
203 57 100 66     134 if ($values{MN} || $values{MP}) {
204 54         139 $values{morohashi} = \%morohashi;
205             }
206 57 50       121 if (@english) {
207 57         96 $values{"english"} = \@english;
208             }
209 57 50       100 if (@onyomi) {
210 57         89 $values{"onyomi"} = \@onyomi;
211             }
212 57 50       99 if (@kunyomi) {
213 57         70 $values{"kunyomi"} = \@kunyomi;
214             }
215 57 100       101 if (@nanori) {
216 30         42 $values{"nanori"} = \@nanori;
217             }
218              
219             # Kanjidic uses the bogus radical numbers of Nelson rather than
220             # the correct ones.
221              
222 57         112 $values{radical} = $values{B};
223 57 100       101 $values{radical} = $values{C} if $values{C};
224              
225             # Just in case there is a problem in kanjidic, this will tell us
226             # the line where the problem was:
227              
228 57         107 $values{"line_number"} = $.;
229 57         1498 return %values;
230             }
231              
232             # Order of kanji in a kanji dictionary.
233              
234             sub kanji_dictionary_order
235             {
236 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
237             # print "$a, $b,\n";
238 0         0 my $valuea = $kanjidic_ref->{$a};
239 0         0 my $valueb = $kanjidic_ref->{$b};
240 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
241 0 0       0 return $radval if $radval;
242 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
243 0 0       0 return $strokeval if $strokeval;
244 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
245 0 0       0 return $jisval if $jisval;
246 0         0 return 0;
247             }
248              
249             # Order of kanji in a kanji dictionary.
250              
251             sub stroke_radical_jis_order
252             {
253 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
254             # print "$a, $b,\n";
255 0         0 my $valuea = $kanjidic_ref->{$a};
256 0         0 my $valueb = $kanjidic_ref->{$b};
257 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
258 0 0       0 return $strokeval if $strokeval;
259 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
260 0 0       0 return $radval if $radval;
261 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
262 0 0       0 return $jisval if $jisval;
263             # They must be the same kanji.
264 0         0 return 0;
265             }
266              
267             # Comparison function to sort by grade and then stroke order, then JIS
268             # code value if those are both the same.
269              
270             sub grade_stroke_order
271             {
272 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
273             # print "$a, $b,\n";
274 0         0 my $valuea = $kanjidic_ref->{$a};
275 0         0 my $valueb = $kanjidic_ref->{$b};
276 0 0       0 if ($valuea->{G}) {
    0          
277 0 0       0 if ($valueb->{G}) {
278 0         0 my $gradeval = $$valuea{G} - $$valueb{G};
279 0 0       0 return $gradeval if $gradeval;
280             }
281             else {
282 0         0 return -1;
283             }
284             }
285             elsif ($valueb->{G}) {
286 0         0 return 1;
287             }
288 0         0 my $strokeval = $$valuea{S} - $$valueb{S};
289 0 0       0 return $strokeval if $strokeval;
290 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
291 0 0       0 return $jisval if $jisval;
292 0         0 return 0;
293             }
294              
295             sub parse_kanjidic
296             {
297 3     3 1 706 my ($file_name) = @_;
298 3 50       13 if (! $file_name) {
299 0         0 croak "Please supply a file name";
300             }
301 3         9 my $KANJIDIC;
302              
303             my %kanjidic;
304              
305 3 50       62 if (! -f $file_name) {
306 0         0 croak "No such file '$file_name'";
307             }
308              
309 3 50   3   42 open $KANJIDIC, "<:encoding(euc-jp)", $file_name
  3         6  
  3         22  
  3         92  
310             or die "Could not open '$file_name': $!";
311 3         22250 while (<$KANJIDIC>) {
312             # Skip the comment line.
313 60 100       162 next if ( m/^\#/ );
314 57         114 my %values = parse_entry ($_);
315 57         337 my @skip = split ("-", $values{P});
316 57         119 $values{skip} = \@skip;
317 57         509 $kanjidic{$values{kanji}} = \%values;
318             }
319 3         37 close $KANJIDIC;
320 3         28 return \%kanjidic;
321             }
322              
323             sub kanjidic_order
324             {
325 1     1 1 6 my ($kanjidic_ref) = @_;
326             my @kanjidic_order =
327             sort {
328 1         11 hex ($kanjidic_ref->{$a}->{jiscode}) <=>
329             hex ($kanjidic_ref->{$b}->{jiscode})
330 59         128 }
331             keys %$kanjidic_ref;
332 1         4 my $count = 0;
333 1         3 for my $kanji (@kanjidic_order) {
334 19         26 $kanjidic_ref->{$kanji}->{kanji_id} = $count;
335 19         20 $count++;
336             }
337 1         6 return @kanjidic_order;
338             }
339              
340             sub new
341             {
342 0     0 0 0 my ($package, $file) = @_;
343 0         0 my $kanjidic = {};
344 0         0 $kanjidic->{file} = $file;
345 0         0 undef $file;
346 0         0 $kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
347 0         0 bless $kanjidic;
348 0         0 return $kanjidic;
349             }
350              
351             # Make indices going from each type of key back to the data.
352              
353             sub make_indices
354             {
355 0     0 0 0 my ($kanjidic) = @_;
356 0         0 my %indices;
357 0         0 my $data = $kanjidic->{data};
358 0         0 for my $kanji (keys %$data) {
359 0         0 my $kdata = $data->{$kanji};
360 0         0 for my $key (keys %$kdata) {
361 0         0 $indices{$key}{$kdata->{$key}} = $kdata;
362             }
363             }
364 0         0 $kanjidic->{indices} = \%indices;
365             }
366              
367             sub find_key
368             {
369 0     0 0 0 my ($kanjidic, $key, $value) = @_;
370 0 0       0 if (! $kanjidic->{indices}) {
371 0         0 make_indices ($kanjidic);
372             }
373 0         0 my $index = $kanjidic->{indices}{$key};
374 0         0 return $index->{$value};
375             }
376              
377             sub kanji_to_order
378             {
379 0     0 0 0 my ($kanjidic, $kanji) = @_;
380 0 0       0 if (! $kanjidic->{order}) {
381 0         0 my @order = kanjidic_order ($kanjidic->{data});
382 0         0 my %index;
383 0         0 my $count = 0;
384 0         0 for my $k (@order) {
385 0         0 $index{$k} = $count;
386 0         0 $count++;
387             }
388 0         0 $kanjidic->{order} = \@order;
389 0         0 $kanjidic->{index} = \%index;
390             }
391 0         0 return $kanjidic->{index}->{$kanji};
392             }
393              
394             sub grade
395             {
396 1     1 1 7 my ($kanjidic, $grade) = @_;
397 1         2 my @grade_kanjis;
398 1         6 for my $k (keys %$kanjidic) {
399 19         23 my $kgrade = $kanjidic->{$k}->{G};
400 19 100       25 next unless $kgrade;
401 15 100       28 push @grade_kanjis, $k if $kgrade == $grade;
402             }
403 1         4 return \@grade_kanjis;
404             }
405              
406             1;
407