File Coverage

blib/lib/Lingua/JA/Sort/ReadableKey.pm
Criterion Covered Total %
statement 18 61 29.5
branch 0 8 0.0
condition 0 2 0.0
subroutine 6 10 60.0
pod 3 3 100.0
total 27 84 32.1


line stmt bran cond sub pod time code
1             package Lingua::JA::Sort::ReadableKey;
2              
3 1     1   1033 use 5.006;
  1         4  
  1         44  
4 1     1   6 use strict;
  1         2  
  1         39  
5 1     1   24 use warnings;
  1         2  
  1         114  
6              
7             require Exporter;
8             our @ISA = qw(Exporter);
9             our @EXPORT = qw( kanji_to_kana japanese_pronunciation japanese_sort_order );
10             our $VERSION = '0.01';
11 1     1   1091 use Encode;
  1         13867  
  1         162  
12 1     1   1285 use Encode::JP;
  1         11685  
  1         718  
13              
14             my $repeat = qr/[\x{3033}\x{309d}\x{30fd}]/;
15             my $voiced_repeat = qr/[\x{309e}\x{30fe}]/;
16              
17             my %romanize;
18             my %rank;
19             while () {
20             Encode::_utf8_on($_);
21             /^(\w)\s+(\w+\d)/ or die "Bad line $_";
22             $romanize{$1} = $2;
23             $rank{$2} = $.;
24             }
25              
26             sub japanese_sort_order {
27 0     0 1   my $string = _key(shift);
28 0 0         $string =~ s#([A-Za-z]+\d)#exists $rank{$1} ? chr(33+$rank{$1}) : $1#eg;
  0            
29 0           return $string;
30             }
31              
32             my %mutations = (qw(
33             K G
34             T D
35             S Z
36             H B
37             ));
38             sub japanese_pronunciation {
39 0     0 1   my $string = _key(shift);
40 0           $string =~ s/\/.*?$//;
41             # First deal with tenten
42 0           $string =~ s/0//g;
43 0           while (my ($k,$v) = each %mutations) {
44 0           $string =~ s/$k(.)1/$v$1/g;
45             }
46 0           $string =~ s/H(.)2/p$1/g;
47 0           $string =~ s/ZI/JI/g;
48             # Now geminate consonants
49 0           $string =~ s/tu(.)/$1$1/g;
50             # Now smallchars
51 0           $string =~ s/(.)(y?[aeiou])/$2/g;
52              
53             # Finally, Kunreishiki->Hepburn
54 0           $string = lc $string;
55 0           $string =~ s/si/shi/g;
56 0           $string =~ s/sy/sh/g;
57 0           $string =~ s/jy/j/g;
58 0           $string =~ s/ti/chi/g;
59 0           $string =~ s/tu/tsu/g;
60 0           $string =~ s/n([bp])/m$1/g;
61 0           return $string;
62             }
63              
64             sub kanji_to_kana {
65 0     0 1   my $string = shift;
66 0 0         return $string unless /[\x{4e00}-\x{9fff}]/;
67 0           require Text::ChaSen;
68 0           my $string_euc = encode("euc-jp", $string);
69 0           Text::ChaSen::getopt_argv('chasen-perl', '-F', '%a0');
70 0   0       $string = decode("euc-jp", Text::ChaSen::sparse_tostr($string_euc))
71             || return $string;
72 0           chomp $string;
73 1     1   995 $string =~ tr/\x{30fc}/\x{30a6}/; # Hack
  1         10  
  1         106  
  0            
74 0           $string =~ tr/\x{30a1}-\x{30ff}/\x{3041}-\x{309f}/;
75 0           $string;
76             }
77              
78             sub _key {
79 0     0     my $string = shift;
80 0           my $type = "hiragana";
81 0           my $last = "";
82 0           my @code;
83 0           $string = kanji_to_kana($string);
84 0           $string =~ s/(.)$repeat/$1$1/g;
85 0           $string =~ s/(.)$voiced_repeat/$1.chr(1+ord($1))/eg;
  0            
86 0 0         $type ="katakana" if $string =~ tr/\x{30a1}-\x{30ff}/\x{3041}-\x{309f}/;
87 0 0         $string =~ s/(.)/exists $romanize{$1} ? $romanize{$1} : $1 /eg;
  0            
88 0           return $string."/$type";
89             }
90             1;
91              
92             =head1 NAME
93              
94             Lingua::JA::Sort::ReadableKey - Sorting and Romanizing Japanese
95              
96             =head1 SYNOPSIS
97              
98             use Lingua::JA::Sort::ReadableKey;
99             for ( map { $_->[0] }
100             sort { $a->[1] cmp $b->[1] }
101             map { [ $_, japanese_sortorder($_) ] } @utf8 ) {
102              
103             =head1 DESCRIPTION
104              
105             First, does L do what you want? Look at that
106             first.
107              
108             It may not do what you want if you want
109              
110             =over 3
111              
112             =item *
113              
114             Kanji phrases sorted in their reading order, rather than as a separate block.
115              
116             =item *
117              
118             A machine-readable or storable key so that comparisons and sorting can
119             be done by a non-Japanese-aware system later.
120              
121             =back
122              
123             This module uses C to do kanji-kana conversion, and then
124             produces a comparable ASCII key for sorting.
125              
126             All text should be in "real" UTF-8 - that is, strings in Perl's internal
127             format with the UTF-8 flag on.
128              
129             =head2 EXPORT
130              
131             The following methods are exported:
132              
133             =head3 kanji_to_kana
134              
135             Use ChaSen to convert a kanji sequence into hiragana. You obviously need
136             to install ChaSen, and its Perl interface C to make this
137             work. You can get ChaSen from http://chasen.aist-nara.ac.jp/ and
138             C is bundled with it. If you have Debian, install the
139             packages "chasen" and "libtext-chasen-perl". This code will work with
140             both ChaSen1 and ChaSen2.
141              
142             =head3 japanese_pronunciation
143              
144             This turns a Japanese string into an ASCII representation of its
145             reading. You can't sort on this, because Japanese don't sort according
146             to the Latin alphabet, but you can use to label Japanese things for
147             people who can't read Japanese. This will automatically call
148             C if necessary to get the reading of kanji strings.
149              
150             =head3 japanese_sort_order
151              
152             This returns an ASCII string which represents, in some bizarre magic
153             encoding, the sort order of the Japanese input string, such that
154             comparing the C of two UTF-8 strings will tell you
155             how they should be sorted in a Japanese dictionary.
156              
157             By "bizarre" and "magic", I mean that for each character, we find its
158             order in the Japanese alphabet, and then replace that with
159             C so that it can be compared with C.
160              
161             This also calls C if there are any kanji strings.
162              
163             =head1 SEE ALSO
164              
165             L, L.
166              
167             http://chasen.aist-nara.ac.jp/
168              
169             =head1 AUTHOR
170              
171             Simon Cozens, Esimon@cpan.orgE
172              
173             =head1 COPYRIGHT AND LICENSE
174              
175             Copyright (C) 2004 by Simon Cozens
176              
177             This library is free software; you can redistribute it and/or modify
178             it under the same terms as Perl itself, either Perl version 5.8.4 or,
179             at your option, any later version of Perl 5 you may have available.
180              
181              
182             =cut
183              
184             __DATA__