File Coverage

blib/lib/Data/Kanji/Kanjidic.pm
Criterion Covered Total %
statement 104 178 58.4
branch 37 76 48.6
condition 3 6 50.0
subroutine 12 19 63.1
pod 7 11 63.6
total 163 290 56.2


line stmt bran cond sub pod time code
1             # See Kanjidic.pod for documentation
2              
3             package Data::Kanji::Kanjidic;
4             require Exporter;
5 3     3   45407 use warnings;
  3         6  
  3         86  
6 3     3   9 use strict;
  3         3  
  3         209  
7             our @ISA = qw(Exporter);
8             our @EXPORT_OK = qw/parse_kanjidic
9             parse_entry
10             kanji_dictionary_order
11             grade_stroke_order
12             kanjidic_order
13             stroke_radical_jis_order
14             %codes
15             %has_dupes
16             grade
17             /;
18              
19             our %EXPORT_TAGS = (
20             all => \@EXPORT_OK,
21             );
22             our $VERSION = '0.15';
23 3     3   9 use strict;
  3         7  
  3         46  
24 3     3   10 use warnings;
  3         3  
  3         78  
25 3     3   1426 use Encode;
  3         21464  
  3         185  
26 3     3   923 use utf8;
  3         18  
  3         11  
27 3     3   65 use Carp;
  3         3  
  3         5658  
28              
29             our $AUTHOR;
30              
31             # Parse one string from kanjidic and return it in an associative array.
32              
33             our %codes = (
34             'W' => 'Korean pronunciation',
35             'Y' => 'Pinyin pronunciation',
36             'B' => 'Bushu (radical as defined by the Nelson kanji dictionary)',
37             'C' => 'Classic radical (the usual radical, where this is different from the Nelson radical)',
38             'U' => 'Unicode code point as a hexadecimal number',
39             'G' => 'Year of elementary school this kanji is taught',
40             'Q' => 'Four-corner code',
41             'S' => 'Stroke count',
42             'P' => 'SKIP code',
43             'J' => 'Japanese proficiency test level',
44             'N' => 'Nelson code from original Nelson dictionary',
45             'V' => 'Nelson code from the "New Nelson" dictionary',
46             'L' => 'Code from "Remembering the Kanji" by James Heisig',
47             'O' => 'The numbers used in P.G. O\'Neill\'s "Japanese Names"',
48             'K' => 'The index in the Gakken Kanji Dictionary (A New Dictionary of Kanji Usage)',
49             'E' => 'The numbers used in Kenneth Henshall\'s kanji book',
50             'I' => 'The Spahn-Hadamitzky book number',
51             'IN' => 'The Spahn-Hadamitzky kanji-kana book number',
52              
53             'MP' => 'Morohashi volume/page',
54             'MN' => 'Morohashi index number',
55             'H' => 'Number in Jack Halpern dictionary',
56             'F' => 'Frequency of kanji',
57              
58             'X' => 'Cross reference',
59             'DA' => 'The index numbers used in the 2011 edition of the Kanji & Kana book, by Spahn & Hadamitzky',
60             'DB' => 'Japanese for Busy People textbook numbers',
61             'DC' => 'The index numbers used in "The Kanji Way to Japanese Language Power" by Dale Crowley',
62             'DF' => '"Japanese Kanji Flashcards", by Max Hodges and Tomoko Okazaki',
63             'DG' => 'The index numbers used in the "Kodansha Compact Kanji Guide"',
64             'DH' => 'The index numbers used in the 3rd edition of "A Guide To Reading and Writing Japanese" edited by Kenneth Hensall et al',
65             'DJ' => 'The index numbers used in the "Kanji in Context" by Nishiguchi and Kono',
66             'DK' => 'The index numbers used by Jack Halpern in his Kanji Learners Dictionary',
67             'DL' => 'The index numbers used in the 2013 edition of Halpern\'s Kanji Learners Dictionary',
68             'DM' => 'The index numbers from the French-language version of "Remembering the kanji"',
69             'DN' => 'The index number used in "Remembering The Kanji, 6th Edition" by James Heisig',
70             'DP' => 'The index numbers used by Jack Halpern in his Kodansha Kanji Dictionary (2013), which is the revised version of the "New Japanese-English Kanji Dictionary" of 1990',
71             'DO' => 'The index numbers used in P.G. O\'Neill\'s Essential Kanji',
72             'DR' => 'The codes developed by Father Joseph De Roo, and published in his book "2001 Kanji" (Bonjinsha)',
73             'DS' => 'The index numbers used in the early editions of "A Guide To Reading and Writing Japanese" edited by Florence Sakade',
74             'DT' => 'The index numbers used in the Tuttle Kanji Cards, compiled by Alexander Kask',
75             'XJ' => 'Cross-reference',
76             'XO' => 'Cross-reference',
77             'XH' => 'Cross-reference',
78             'XI' => 'Cross-reference',
79             'XN' => 'Nelson cross-reference',
80             'XDR' => 'De Roo cross-reference',
81             'T' => 'SPECIAL',
82             'ZPP' => 'SKIP misclassification by position',
83             'ZRP' => 'SKIP classification disagreement',
84             'ZSP' => 'SKIP misclassification by stroke count',
85             'ZBP' => 'SKIP misclassification by both stroke count and position',
86             );
87              
88             # Fields which are allowed to have duplicates.
89              
90             our @dupes = qw/
91             DA
92             O
93             Q
94             S
95             V
96             W
97             XDR
98             XH
99             XJ
100             XN
101             Y
102             ZBP
103             ZPP
104             ZRP
105             ZSP
106             /;
107              
108             our %has_dupes;
109              
110             @has_dupes{@dupes} = @dupes;
111              
112             sub parse_entry
113             {
114 57     57 1 58 my ($input) = @_;
115              
116             # Remove the English entries first.
117              
118 57         46 my $counter;
119             my @english;
120 0         0 my @onyomi;
121 0         0 my @kunyomi;
122 0         0 my @nanori;
123              
124             # Return value
125              
126 0         0 my %values;
127              
128             # The English-language "meanings" are between { and }.
129              
130 57         361 while ($input =~ s/\{([^\}]+)\}//) {
131 165         194 my $meaning = $1;
132              
133             # Mark as a "kokuji".
134              
135 165 50       208 if ($meaning =~ m/\(kokuji\)/) {
136 0         0 $values{"kokuji"} = 1;
137             }
138             else {
139 165         719 push (@english, $meaning);
140             }
141             }
142              
143 57         618 (my $kanji, $values{"jiscode"}, my @entries) = split (" ", $input);
144 57         171 $values{kanji} = $kanji;
145             # Flag to detect the start of nanori readings.
146 57         28 my $in_nanori;
147 57         69 foreach my $entry (@entries) {
148 1719         943 my $found;
149 1719 100       2600 if ($entry =~ m/(^[A-Z]+)(.*)/ ) {
150 1431 100       1619 if ($entry eq 'T1') {
151 30         22 $in_nanori = 1;
152 30         31 next;
153             }
154 1401         1133 my $field = $1;
155 1401         941 my $value = $2;
156 1401 50       1535 if ($codes{$field}) {
157 1401 100       1378 if ($has_dupes{$field}) {
158 429         242 push @{$values{$field}}, $value;
  429         705  
159             }
160             else {
161 972 50       921 if (!$values{$field}) {
162 972         1162 $values{$field} = $2;
163             }
164             else {
165 0         0 die "duplicate values for key $field.\n";
166             }
167             }
168 1401         999 $found = 1;
169             }
170             else {
171             # Unknown field is ignored.
172             }
173              
174             # Kanjidic contains hiragana, katakana, ".", "-" and "ー" (Japanese
175             # "chouon") characters.
176             }
177             else {
178 288 100       280 if ($in_nanori) {
179 90         85 push @nanori, $entry;
180 90         66 $found = 1;
181             }
182             else {
183 198 100       514 if ($entry =~ m/^([あ-ん\.-]+)$/) {
    50          
184 123         123 push @kunyomi, $entry;
185 123         94 $found = 1;
186             }
187             elsif ($entry =~ m/^([ア-ンー\.-]+)$/) {
188 75         83 push @onyomi, $entry;
189 75         66 $found = 1;
190             }
191             }
192             }
193 1689 50 33     2378 if ($AUTHOR && ! $found) {
194 0         0 die "kanjidic:$.: Mystery entry \"$entry\"\n";
195             }
196             }
197 57         38 my %morohashi;
198 57 100       84 if ($values{MP}) {
199 54         213 @morohashi{qw/volume page/} = ($values{MP} =~ /(\d+)\.(\d+)/);
200             }
201 57 100       93 if ($values{MN}) {
202 54         62 $morohashi{index} = $values{MN};
203             }
204 57 100 66     104 if ($values{MN} || $values{MP}) {
205 54         62 $values{morohashi} = \%morohashi;
206             }
207 57 50       75 if (@english) {
208 57         53 $values{"english"} = \@english;
209             }
210 57 50       79 if (@onyomi) {
211 57         48 $values{"onyomi"} = \@onyomi;
212             }
213 57 50       66 if (@kunyomi) {
214 57         52 $values{"kunyomi"} = \@kunyomi;
215             }
216 57 100       68 if (@nanori) {
217 30         28 $values{"nanori"} = \@nanori;
218             }
219              
220             # Kanjidic uses the bogus radical numbers of Nelson rather than
221             # the correct ones.
222              
223 57         75 $values{radical} = $values{B};
224 57 100       78 $values{radical} = $values{C} if $values{C};
225              
226             # Just in case there is a problem in kanjidic, this will tell us
227             # the line where the problem was:
228              
229 57         71 $values{"line_number"} = $.;
230 57         1264 return %values;
231             }
232              
233             # Order of kanji in a kanji dictionary.
234              
235             sub kanji_dictionary_order
236             {
237 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
238             # print "$a, $b,\n";
239 0         0 my $valuea = $kanjidic_ref->{$a};
240 0         0 my $valueb = $kanjidic_ref->{$b};
241 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
242 0 0       0 return $radval if $radval;
243 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
244 0 0       0 return $strokeval if $strokeval;
245 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
246 0 0       0 return $jisval if $jisval;
247 0         0 return 0;
248             }
249              
250             # Order of kanji in a kanji dictionary.
251              
252             sub stroke_radical_jis_order
253             {
254 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
255             # print "$a, $b,\n";
256 0         0 my $valuea = $kanjidic_ref->{$a};
257 0         0 my $valueb = $kanjidic_ref->{$b};
258 0         0 my $strokeval = $valuea->{S}[0] - $valueb->{S}[0];
259 0 0       0 return $strokeval if $strokeval;
260 0         0 my $radval = $$valuea{radical} - $$valueb{radical};
261 0 0       0 return $radval if $radval;
262 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
263 0 0       0 return $jisval if $jisval;
264             # They must be the same kanji.
265 0         0 return 0;
266             }
267              
268             # Comparison function to sort by grade and then stroke order, then JIS
269             # code value if those are both the same.
270              
271             sub grade_stroke_order
272             {
273 0     0 1 0 my ($kanjidic_ref, $a, $b) = @_;
274             # print "$a, $b,\n";
275 0         0 my $valuea = $kanjidic_ref->{$a};
276 0         0 my $valueb = $kanjidic_ref->{$b};
277 0 0       0 if ($valuea->{G}) {
    0          
278 0 0       0 if ($valueb->{G}) {
279 0         0 my $gradeval = $$valuea{G} - $$valueb{G};
280 0 0       0 return $gradeval if $gradeval;
281             }
282             else {
283 0         0 return -1;
284             }
285             }
286             elsif ($valueb->{G}) {
287 0         0 return 1;
288             }
289 0         0 my $strokeval = $$valuea{S} - $$valueb{S};
290 0 0       0 return $strokeval if $strokeval;
291 0         0 my $jisval = hex ($$valuea{jiscode}) - hex ($$valueb{jiscode});
292 0 0       0 return $jisval if $jisval;
293 0         0 return 0;
294             }
295              
296             sub parse_kanjidic
297             {
298 3     3 1 71 my ($file_name) = @_;
299 3 50       8 if (! $file_name) {
300 0         0 croak "Please supply a file name";
301             }
302 3         4 my $KANJIDIC;
303              
304             my %kanjidic;
305              
306 3 50       96 if (! -f $file_name) {
307 0         0 croak "No such file '$file_name'";
308             }
309              
310 3 50   3   23 open $KANJIDIC, "<:encoding(euc-jp)", $file_name
  3         3  
  3         21  
  3         120  
311             or die "Could not open '$file_name': $!";
312 3         21210 while (<$KANJIDIC>) {
313             # Skip the comment line.
314 60 100       114 next if ( m/^\#/ );
315 57         80 my %values = parse_entry ($_);
316 57         256 my @skip = split ("-", $values{P});
317 57         68 $values{skip} = \@skip;
318 57         472 $kanjidic{$values{kanji}} = \%values;
319             }
320 3         97 close $KANJIDIC;
321 3         28 return \%kanjidic;
322             }
323              
324             sub kanjidic_order
325             {
326 1     1 1 5 my ($kanjidic_ref) = @_;
327             my @kanjidic_order =
328             sort {
329 1         9 hex ($kanjidic_ref->{$a}->{jiscode}) <=>
330             hex ($kanjidic_ref->{$b}->{jiscode})
331 63         81 }
332             keys %$kanjidic_ref;
333 1         3 my $count = 0;
334 1         2 for my $kanji (@kanjidic_order) {
335 19         17 $kanjidic_ref->{$kanji}->{kanji_id} = $count;
336 19         11 $count++;
337             }
338 1         6 return @kanjidic_order;
339             }
340              
341             sub new
342             {
343 0     0 0 0 my ($package, $file) = @_;
344 0         0 my $kanjidic = {};
345 0         0 $kanjidic->{file} = $file;
346 0         0 undef $file;
347 0         0 $kanjidic->{data} = parse_kanjidic ($kanjidic->{file});
348 0         0 bless $kanjidic;
349 0         0 return $kanjidic;
350             }
351              
352             # Make indices going from each type of key back to the data.
353              
354             sub make_indices
355             {
356 0     0 0 0 my ($kanjidic) = @_;
357 0         0 my %indices;
358 0         0 my $data = $kanjidic->{data};
359 0         0 for my $kanji (keys %$data) {
360 0         0 my $kdata = $data->{$kanji};
361 0         0 for my $key (keys %$kdata) {
362 0         0 $indices{$key}{$kdata->{$key}} = $kdata;
363             }
364             }
365 0         0 $kanjidic->{indices} = \%indices;
366             }
367              
368             sub find_key
369             {
370 0     0 0 0 my ($kanjidic, $key, $value) = @_;
371 0 0       0 if (! $kanjidic->{indices}) {
372 0         0 make_indices ($kanjidic);
373             }
374 0         0 my $index = $kanjidic->{indices}{$key};
375 0         0 return $index->{$value};
376             }
377              
378             sub kanji_to_order
379             {
380 0     0 0 0 my ($kanjidic, $kanji) = @_;
381 0 0       0 if (! $kanjidic->{order}) {
382 0         0 my @order = kanjidic_order ($kanjidic->{data});
383 0         0 my %index;
384 0         0 my $count = 0;
385 0         0 for my $k (@order) {
386 0         0 $index{$k} = $count;
387 0         0 $count++;
388             }
389 0         0 $kanjidic->{order} = \@order;
390 0         0 $kanjidic->{index} = \%index;
391             }
392 0         0 return $kanjidic->{index}->{$kanji};
393             }
394              
395             sub grade
396             {
397 1     1 1 5 my ($kanjidic, $grade) = @_;
398 1         1 my @grade_kanjis;
399 1         5 for my $k (keys %$kanjidic) {
400 19         16 my $kgrade = $kanjidic->{$k}->{G};
401 19 100       23 next unless $kgrade;
402 15 100       23 push @grade_kanjis, $k if $kgrade == $grade;
403             }
404 1         3 return \@grade_kanjis;
405             }
406              
407             1;
408