| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package eGuideDog::Dict::Cantonese; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
64923
|
use strict; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
86
|
|
|
4
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
61
|
|
|
5
|
2
|
|
|
2
|
|
1564
|
use utf8; |
|
|
2
|
|
|
|
|
17
|
|
|
|
2
|
|
|
|
|
10
|
|
|
6
|
2
|
|
|
2
|
|
1974
|
use Encode::CNMap; |
|
|
2
|
|
|
|
|
34520
|
|
|
|
2
|
|
|
|
|
318
|
|
|
7
|
2
|
|
|
2
|
|
2517
|
use Storable; |
|
|
2
|
|
|
|
|
7910
|
|
|
|
2
|
|
|
|
|
4107
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require Exporter; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
|
14
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
|
15
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# This allows declaration use eGuideDog::Dict::Cantonese ':all'; |
|
18
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
|
19
|
|
|
|
|
|
|
# will save memory. |
|
20
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
) ] ); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @EXPORT = qw( |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
); |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
our $VERSION = '0.41'; |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Preloaded methods go here. |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub new() { |
|
36
|
1
|
|
|
1
|
1
|
15
|
my $self = {}; |
|
37
|
1
|
|
|
|
|
4
|
$self->{jyutping} = {}; # The most probably phonetic symbol |
|
38
|
1
|
|
|
|
|
5
|
$self->{chars} = {}; # all phonetic symbols (array ref) |
|
39
|
1
|
|
|
|
|
3
|
$self->{words} = {}; # word phonetic symbols (array ref) |
|
40
|
1
|
|
|
|
|
3
|
$self->{word_index} = {}; # the first char to words (array ref) |
|
41
|
1
|
|
|
|
|
6
|
bless $self, __PACKAGE__; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# load zhy_list |
|
44
|
1
|
|
|
|
|
4
|
my $dir = __FILE__; |
|
45
|
1
|
|
|
|
|
10
|
$dir =~ s/[.]pm$//; |
|
46
|
|
|
|
|
|
|
|
|
47
|
1
|
50
|
|
|
|
56
|
if(-e "$dir/Cantonese.dict") { |
|
48
|
1
|
|
|
|
|
10
|
my $dict = retrieve("$dir/Cantonese.dict"); |
|
49
|
1
|
|
|
|
|
104962
|
$self->{jyutping} = $dict->{jyutping}; |
|
50
|
1
|
|
|
|
|
5
|
$self->{chars} = $dict->{chars}; |
|
51
|
1
|
|
|
|
|
5
|
$self->{words} = $dict->{words}; |
|
52
|
1
|
|
|
|
|
5
|
$self->{word_index} = $dict->{word_index}; |
|
53
|
|
|
|
|
|
|
} |
|
54
|
|
|
|
|
|
|
|
|
55
|
1
|
|
|
|
|
11
|
return $self; |
|
56
|
|
|
|
|
|
|
} |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub update_dict { |
|
59
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
|
60
|
|
|
|
|
|
|
|
|
61
|
0
|
|
|
|
|
0
|
$self->{jyutping} = {}; |
|
62
|
0
|
|
|
|
|
0
|
$self->{chars} = {}; |
|
63
|
0
|
|
|
|
|
0
|
$self->{words} = {}; |
|
64
|
0
|
|
|
|
|
0
|
$self->{word_index} = {}; |
|
65
|
|
|
|
|
|
|
|
|
66
|
0
|
|
|
|
|
0
|
$self->import_unihan("Cantonese.txt"); |
|
67
|
0
|
|
|
|
|
0
|
$self->import_zhy_list("zhy_list"); |
|
68
|
|
|
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
my $dict = {jyutping => $self->{jyutping}, |
|
70
|
|
|
|
|
|
|
chars => $self->{chars}, |
|
71
|
|
|
|
|
|
|
words => $self->{words}, |
|
72
|
|
|
|
|
|
|
word_index => $self->{word_index}, |
|
73
|
|
|
|
|
|
|
}; |
|
74
|
0
|
|
|
|
|
0
|
store($dict, "Cantonese.dict"); |
|
75
|
|
|
|
|
|
|
} |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub import_unihan { |
|
78
|
0
|
|
|
0
|
0
|
0
|
my ($self, $cantonese_txt) = @_; |
|
79
|
0
|
|
|
|
|
0
|
open(DATA_FILE, '<', $cantonese_txt); |
|
80
|
0
|
|
|
|
|
0
|
while() { |
|
81
|
0
|
|
|
|
|
0
|
chomp; |
|
82
|
0
|
|
|
|
|
0
|
my @line = split(/\s+/, $_); |
|
83
|
0
|
|
|
|
|
0
|
my $char = chr(hex($line[0])); |
|
84
|
0
|
|
|
|
|
0
|
my @phons = @line[1 .. $#line]; |
|
85
|
0
|
0
|
|
|
|
0
|
if (not defined $self->{chars}->{$char}) { |
|
86
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = \@phons; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
0
|
|
|
|
|
0
|
my $char_simp = utf8_to_simputf8($char); |
|
89
|
0
|
0
|
|
|
|
0
|
if ($char_simp !~ /[?]/) { |
|
90
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_simp}) { |
|
91
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_simp} = \@phons; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
} |
|
94
|
0
|
|
|
|
|
0
|
my $char_trad = utf8_to_tradutf8($char); |
|
95
|
0
|
0
|
|
|
|
0
|
if ($char_trad !~ /[?]/) { |
|
96
|
0
|
0
|
|
|
|
0
|
if (!defined $self->{chars}->{$char_trad}) { |
|
97
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char_trad} = \@phons; |
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
} |
|
101
|
0
|
|
|
|
|
0
|
close(DATA_FILE); |
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub add_symbol { |
|
105
|
0
|
|
|
0
|
0
|
0
|
my ($self, $char, $symbol) = @_; |
|
106
|
|
|
|
|
|
|
|
|
107
|
0
|
0
|
|
|
|
0
|
if (not $self->{chars}->{$char}) { |
|
108
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [$symbol]; |
|
109
|
0
|
|
|
|
|
0
|
return 1; |
|
110
|
|
|
|
|
|
|
} else { |
|
111
|
0
|
|
|
|
|
0
|
foreach (@{$self->{chars}->{$char}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
112
|
0
|
0
|
|
|
|
0
|
if ($symbol eq $_) { |
|
113
|
0
|
|
|
|
|
0
|
return 0; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
} |
|
116
|
0
|
|
|
|
|
0
|
$self->{chars}->{$char} = [@{$self->{chars}->{$char}}, $symbol]; |
|
|
0
|
|
|
|
|
0
|
|
|
117
|
0
|
|
|
|
|
0
|
return 1; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub import_zhy_list { |
|
122
|
0
|
|
|
0
|
0
|
0
|
my ($self, $zhy_list) = @_; |
|
123
|
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
0
|
open(ZHY_LIST, '<:utf8', $zhy_list); |
|
125
|
0
|
|
|
|
|
0
|
while (my $line = ) { |
|
126
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^(.)\s([^\s]*)\s$/) { |
|
|
|
0
|
|
|
|
|
|
|
127
|
0
|
0
|
0
|
|
|
0
|
if ($1 && $2) { |
|
128
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{$1} = $2; |
|
129
|
0
|
|
|
|
|
0
|
$self->add_symbol($1, $2); |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
} elsif ($line =~ /^[(]([^)]*)[)]\s([^\s]*)\s$/) { |
|
132
|
0
|
|
|
|
|
0
|
my @chars = split(/ /, $1); |
|
133
|
0
|
|
|
|
|
0
|
my @symbols = split(/[|]/, $2); |
|
134
|
0
|
0
|
|
|
|
0
|
if ($#chars != $#symbols) { |
|
135
|
0
|
|
|
|
|
0
|
warn "Dictionary error:" . "@chars" . "-" . "@symbols"; |
|
136
|
0
|
|
|
|
|
0
|
next; |
|
137
|
|
|
|
|
|
|
} |
|
138
|
0
|
|
|
|
|
0
|
my $word = join("", @chars); |
|
139
|
0
|
0
|
|
|
|
0
|
if ($self->{word_index}->{$chars[0]}) { |
|
140
|
0
|
|
|
|
|
0
|
push(@{$self->{word_index}->{$chars[0]}}, $word); |
|
|
0
|
|
|
|
|
0
|
|
|
141
|
|
|
|
|
|
|
} else { |
|
142
|
0
|
|
|
|
|
0
|
$self->{word_index}->{$chars[0]} = [$word]; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
0
|
|
|
|
|
0
|
$self->{words}->{$word} = \@symbols; |
|
145
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i <= $#chars; $i++) { |
|
146
|
0
|
|
|
|
|
0
|
$self->add_symbol($chars[$i], $symbols[$i]); |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
0
|
|
|
|
|
0
|
close(ZHY_LIST); |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# add numbers |
|
153
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"0"} = "ling4"; |
|
154
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"1"} = "jat1"; |
|
155
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"2"} = "ji6"; |
|
156
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"3"} = "saam1"; |
|
157
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"4"} = "sei3"; |
|
158
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"5"} = "ng5"; |
|
159
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"6"} = "luk6"; |
|
160
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"7"} = "cat1"; |
|
161
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"8"} = "baat3"; |
|
162
|
0
|
|
|
|
|
0
|
$self->{jyutping}->{"9"} = "gau2"; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub get_jyutping { |
|
166
|
3
|
|
|
3
|
1
|
8
|
my ($self, $str) = @_; |
|
167
|
|
|
|
|
|
|
|
|
168
|
3
|
50
|
|
|
|
21
|
if (not utf8::is_utf8($str)) { |
|
|
|
50
|
|
|
|
|
|
|
169
|
0
|
0
|
|
|
|
0
|
if (not utf8::decode($str)) { |
|
170
|
0
|
|
|
|
|
0
|
warn "$str is not in utf8 encoding."; |
|
171
|
0
|
|
|
|
|
0
|
return undef; |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
} elsif (not $str) { |
|
174
|
0
|
|
|
|
|
0
|
return undef; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
|
|
177
|
3
|
100
|
|
|
|
9
|
if (wantarray) { |
|
178
|
1
|
|
|
|
|
2
|
my @jyutping; |
|
179
|
1
|
|
|
|
|
6
|
for (my $i = 0; $i < length($str); $i++) { |
|
180
|
2
|
|
|
|
|
8
|
my $char = substr($str, $i, 1); |
|
181
|
2
|
|
|
|
|
6
|
my @words = $self->get_words($char); |
|
182
|
2
|
|
|
|
|
3
|
my $longest_word = ''; |
|
183
|
2
|
|
|
|
|
5
|
foreach my $word (@words) { |
|
184
|
0
|
0
|
|
|
|
0
|
if (index($str, $word) == 0) { |
|
185
|
0
|
0
|
|
|
|
0
|
if (length($word) > length($longest_word)) { |
|
186
|
0
|
|
|
|
|
0
|
$longest_word = $word; |
|
187
|
|
|
|
|
|
|
} |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
} |
|
190
|
2
|
50
|
|
|
|
6
|
if ($longest_word) { |
|
191
|
0
|
|
|
|
|
0
|
push(@jyutping, @{$self->{words}->{$longest_word}}); |
|
|
0
|
|
|
|
|
0
|
|
|
192
|
0
|
|
|
|
|
0
|
$i += $#{$self->{words}->{$longest_word}}; |
|
|
0
|
|
|
|
|
0
|
|
|
193
|
|
|
|
|
|
|
} else { |
|
194
|
2
|
|
|
|
|
12
|
push(@jyutping, $self->{jyutping}->{$char}); |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
} |
|
197
|
1
|
|
|
|
|
17
|
return @jyutping; |
|
198
|
|
|
|
|
|
|
} else { |
|
199
|
2
|
|
|
|
|
11
|
my $char = substr($str, 0, 1); |
|
200
|
2
|
|
|
|
|
7
|
my @words = $self->get_words($char); |
|
201
|
2
|
|
|
|
|
6
|
my $longest_word = ''; |
|
202
|
2
|
|
|
|
|
4
|
foreach my $word (@words) { |
|
203
|
52
|
100
|
|
|
|
101
|
if (index($str, $word) == 0) { |
|
204
|
1
|
50
|
|
|
|
6
|
if (length($word) > length($longest_word)) { |
|
205
|
1
|
|
|
|
|
4
|
$longest_word = $word; |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
} |
|
209
|
2
|
100
|
|
|
|
7
|
if ($longest_word) { |
|
210
|
1
|
|
|
|
|
9
|
return $self->{words}->{$longest_word}->[0]; |
|
211
|
|
|
|
|
|
|
} else { |
|
212
|
1
|
|
|
|
|
10
|
return $self->{jyutping}->{$char}; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub get_words { |
|
218
|
5
|
|
|
5
|
1
|
13
|
my ($self, $char) = @_; |
|
219
|
|
|
|
|
|
|
|
|
220
|
5
|
100
|
|
|
|
18
|
if ($self->{word_index}->{$char}) { |
|
221
|
3
|
|
|
|
|
6
|
return @{$self->{word_index}->{$char}}; |
|
|
3
|
|
|
|
|
30
|
|
|
222
|
|
|
|
|
|
|
} else { |
|
223
|
2
|
|
|
|
|
6
|
return (); |
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub is_multi_phon { |
|
228
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
|
229
|
0
|
|
|
|
|
|
return $#{$self->{chars}->{$char}}; |
|
|
0
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub get_multi_phon { |
|
233
|
0
|
|
|
0
|
1
|
|
my ($self, $char) = @_; |
|
234
|
0
|
0
|
|
|
|
|
if ($self->{chars}->{$char}) { |
|
235
|
0
|
|
|
|
|
|
return @{$self->{chars}->{$char}}; |
|
|
0
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
} else { |
|
237
|
0
|
|
|
|
|
|
return undef; |
|
238
|
|
|
|
|
|
|
} |
|
239
|
|
|
|
|
|
|
} |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
1; |
|
242
|
|
|
|
|
|
|
__END__ |