blib/lib/Encode/Korean/TransliteratorGenerator.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 89 | 267 | 33.3 |
branch | 18 | 84 | 21.4 |
condition | 2 | 21 | 9.5 |
subroutine | 16 | 35 | 45.7 |
pod | 0 | 32 | 0.0 |
total | 125 | 439 | 28.4 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # Korean Transliterator Generator | ||||||
2 | |||||||
3 | # $Id: TransliteratorGenerator.pm,v 1.7 2007/11/29 14:25:31 you Exp $ | ||||||
4 | |||||||
5 | package Encode::Korean::TransliteratorGenerator; | ||||||
6 | |||||||
7 | our $VERSION = do { q$Revision: 1.7 $ =~ /\d+\.(\d+)/; sprintf "%.2f", $1 / 100 }; | ||||||
8 | |||||||
9 | 11 | 11 | 22151 | use 5.008008; | |||
11 | 44 | ||||||
11 | 493 | ||||||
10 | 11 | 11 | 57 | use strict; | |||
11 | 22 | ||||||
11 | 364 | ||||||
11 | 11 | 11 | 53 | use warnings; | |||
11 | 21 | ||||||
11 | 40286 | ||||||
12 | |||||||
13 | # == CONSTANTS == | ||||||
14 | my $NotFound = '-1'; | ||||||
15 | |||||||
16 | my $CamelCase = 0; | ||||||
17 | my $GREEDY_SEP =1; | ||||||
18 | my $SMART_SEP = 2; | ||||||
19 | |||||||
20 | my %MODE = ( | ||||||
21 | 'CamelCase' => $CamelCase, | ||||||
22 | 'camel' => $CamelCase, | ||||||
23 | |||||||
24 | 'greedy_sep' => $GREEDY_SEP, | ||||||
25 | 'greedy' => $GREEDY_SEP, | ||||||
26 | |||||||
27 | 'smart_sep' => $SMART_SEP, | ||||||
28 | 'smart' => $SMART_SEP | ||||||
29 | ); | ||||||
30 | |||||||
31 | # == CONSTRUCTOR == | ||||||
32 | sub new { | ||||||
33 | 11 | 11 | 0 | 35 | my ($class) = @_; | ||
34 | 11 | 110 | my $self = { | ||||
35 | CONSONANTS => [], | ||||||
36 | VOWELS => [], | ||||||
37 | EL => undef, | ||||||
38 | ELL => undef, | ||||||
39 | NAUGHT => undef, | ||||||
40 | SEP => undef, | ||||||
41 | ENMODE => [], | ||||||
42 | DEMODE => [], | ||||||
43 | HEAD => [], | ||||||
44 | BODY => [], | ||||||
45 | FOOT => [], | ||||||
46 | HEADMAP => {}, | ||||||
47 | BODYMAP => {}, | ||||||
48 | FOOTMAP => {} | ||||||
49 | }; | ||||||
50 | |||||||
51 | 11 | 36 | bless $self, $class; | ||||
52 | 11 | 41 | return $self; | ||||
53 | } | ||||||
54 | |||||||
55 | # == METHODS == | ||||||
56 | # accessor | ||||||
57 | sub consonants { | ||||||
58 | 67 | 67 | 0 | 291 | my $self = shift; | ||
59 | 67 | 100 | 191 | if(@_) { | |||
60 | 11 | 19 | @{ $self->{CONSONANTS} } = @_; | ||||
11 | 90 | ||||||
61 | 11 | 41 | $self->head(@_); | ||||
62 | 11 | 264 | @ { $self->{FOOT} } = ( | ||||
11 | 73 | ||||||
63 | '', # NULL | ||||||
64 | $self->{CONSONANTS}->[0], # kiyeok (ㄱ) | ||||||
65 | $self->{CONSONANTS}->[1], # ssangkiyeok (ㄲ) | ||||||
66 | $self->{CONSONANTS}->[0] . $self->{CONSONANTS}->[9], # kiyeok sios (ㄳ) | ||||||
67 | $self->{CONSONANTS}->[2], # nieun (ㄴ) | ||||||
68 | $self->{CONSONANTS}->[2] . $self->{CONSONANTS}->[12], # nieun cieuc (ㄵ) | ||||||
69 | $self->{CONSONANTS}->[2] . $self->{CONSONANTS}->[18], # nieun hieuh (ㄶ) | ||||||
70 | $self->{CONSONANTS}->[3], # tikeut (ㄷ) | ||||||
71 | $self->{CONSONANTS}->[5], # rieul (ㄹ) | ||||||
72 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[0], # rieul kiyeok (ㄺ) | ||||||
73 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[6], # rieul mieum (ㄻ) | ||||||
74 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[7], # rieul pieup (ㄼ) | ||||||
75 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[9], # rieul sios (ㄽ) | ||||||
76 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[16], # rieul thieuth (ㄾ) | ||||||
77 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[17], # rieul phieuph (ㄿ) | ||||||
78 | $self->{CONSONANTS}->[5] . $self->{CONSONANTS}->[18], # rieul hieuh (ㅀ) | ||||||
79 | $self->{CONSONANTS}->[6], # mieum (ㅁ) | ||||||
80 | $self->{CONSONANTS}->[7], # pieup (ㅂ) | ||||||
81 | $self->{CONSONANTS}->[7] . $self->{CONSONANTS}->[9], # pieup sios (ㅄ) | ||||||
82 | $self->{CONSONANTS}->[9], # sios (ㅅ) | ||||||
83 | $self->{CONSONANTS}->[10], # ssangsios (ㅆ) | ||||||
84 | $self->{CONSONANTS}->[11], # ieung (ㅇ) | ||||||
85 | $self->{CONSONANTS}->[12], # cieuc (ㅈ) | ||||||
86 | $self->{CONSONANTS}->[14], # chieuch (ㅊ) | ||||||
87 | $self->{CONSONANTS}->[15], # khieukh (ㅋ) | ||||||
88 | $self->{CONSONANTS}->[16], # thieuth (ㅌ) | ||||||
89 | $self->{CONSONANTS}->[17], # phieuph (ㅍ) | ||||||
90 | $self->{CONSONANTS}->[18] # hieuh (ㅎ) | ||||||
91 | ); | ||||||
92 | } | ||||||
93 | 67 | 277 | return $self->{CONSONANTS}; | ||||
94 | } | ||||||
95 | |||||||
96 | sub head { | ||||||
97 | 858 | 858 | 0 | 888 | my $self = shift; | ||
98 | 858 | 100 | 1388 | if (@_) { @{ $self->{HEAD} } = @_; } | |||
11 | 23 | ||||||
11 | 49 | ||||||
99 | 858 | 2407 | return $self->{HEAD}; | ||||
100 | } | ||||||
101 | |||||||
102 | sub foot { | ||||||
103 | 691 | 691 | 0 | 750 | my $self = shift; | ||
104 | 691 | 50 | 1133 | if (@_) { @{ $self->{FOOT} } = @_; } | |||
0 | 0 | ||||||
0 | 0 | ||||||
105 | 691 | 1907 | return $self->{FOOT}; | ||||
106 | } | ||||||
107 | |||||||
108 | # accessor | ||||||
109 | sub vowels { | ||||||
110 | 11 | 11 | 0 | 22 | my $self = shift; | ||
111 | 11 | 50 | 52 | if(@_) { | |||
112 | 11 | 23 | @{ $self->{VOWELS} } = @_; | ||||
11 | 62 | ||||||
113 | 11 | 46 | $self->body(@_); | ||||
114 | } | ||||||
115 | 11 | 32 | return $self->{VOWELS}; | ||||
116 | } | ||||||
117 | |||||||
118 | sub body { | ||||||
119 | 1177 | 1177 | 0 | 1160 | my $self = shift; | ||
120 | 1177 | 100 | 2094 | if (@_) { @{ $self->{BODY} } = @_; } | |||
11 | 22 | ||||||
11 | 45 | ||||||
121 | 1177 | 2948 | return $self->{BODY}; | ||||
122 | } | ||||||
123 | |||||||
124 | |||||||
125 | |||||||
126 | # accessor | ||||||
127 | sub el { | ||||||
128 | 8 | 8 | 0 | 15 | my $self = shift; | ||
129 | 8 | 50 | 26 | if(@_) { | |||
130 | 8 | 16 | $self->{EL} = shift; | ||||
131 | |||||||
132 | # Sets jongseongs with rieul | ||||||
133 | 8 | 28 | $self->foot->[8] = $self->{EL}; # rieul (ㄹ) | ||||
134 | 8 | 36 | $self->foot->[9] = $self->{EL} . $self->consonants->[0]; # rieul kiyeok (ㄺ) | ||||
135 | 8 | 63 | $self->foot->[10] = $self->{EL} . $self->consonants->[6]; # rieul mieum (ㄻ) | ||||
136 | 8 | 33 | $self->foot->[11] = $self->{EL} . $self->consonants->[7]; # rieul pieup (ㄼ) | ||||
137 | 8 | 28 | $self->foot->[12] = $self->{EL} . $self->consonants->[9]; # rieul sios (ㄽ) | ||||
138 | 8 | 43 | $self->foot->[13] = $self->{EL} . $self->consonants->[16]; # rieul thieuth (ㄾ) | ||||
139 | 8 | 24 | $self->foot->[14] = $self->{EL} . $self->consonants->[17]; # rieul phieuph (ㄿ) | ||||
140 | 8 | 28 | $self->foot->[15] = $self->{EL} . $self->consonants->[18]; # rieul hieuh (ㅀ) | ||||
141 | } | ||||||
142 | 8 | 23 | return $self->{EL}; | ||||
143 | } | ||||||
144 | |||||||
145 | # accessor | ||||||
146 | sub ell { | ||||||
147 | 8 | 8 | 0 | 14 | my $self = shift; | ||
148 | 8 | 50 | 36 | if(@_) { | |||
149 | 8 | 143 | $self->{ELL} = shift; | ||||
150 | } | ||||||
151 | 8 | 25 | return $self->{ELL}; | ||||
152 | } | ||||||
153 | |||||||
154 | # accessor | ||||||
155 | sub naught { | ||||||
156 | 10 | 10 | 0 | 21 | my $self = shift; | ||
157 | 10 | 50 | 39 | if(@_) { | |||
158 | 10 | 22 | $self->{NAUGHT} = shift; | ||||
159 | 10 | 25 | $self->{HEAD}->[11] = $self->{NAUGHT}; | ||||
160 | } | ||||||
161 | 10 | 38 | return $self->{NAUGHT}; | ||||
162 | } | ||||||
163 | |||||||
164 | # accessor | ||||||
165 | sub sep { | ||||||
166 | 11 | 11 | 0 | 23 | my $self = shift; | ||
167 | 11 | 50 | 52 | if(@_) { | |||
168 | 11 | 33 | $self->{SEP} = shift; | ||||
169 | } | ||||||
170 | 11 | 45 | return $self->{SEP}; | ||||
171 | } | ||||||
172 | |||||||
173 | # accessor | ||||||
174 | sub enmode { | ||||||
175 | 12 | 12 | 0 | 25 | my $self = shift; | ||
176 | 12 | 50 | 56 | if(@_) { | |||
177 | 12 | 35 | $self->{ENMODE} = shift; | ||||
178 | } | ||||||
179 | 12 | 46 | return $self->{ENMODE}; | ||||
180 | } | ||||||
181 | |||||||
182 | sub demode { | ||||||
183 | 10 | 10 | 0 | 18 | my $self = shift; | ||
184 | 10 | 50 | 37 | if(@_) { | |||
185 | 10 | 45 | $self->{DEMODE} = shift; | ||||
186 | } | ||||||
187 | 10 | 29 | return $self->{DEMODE}; | ||||
188 | } | ||||||
189 | |||||||
190 | |||||||
191 | |||||||
192 | sub make { | ||||||
193 | 11 | 11 | 0 | 52 | my $self = shift; | ||
194 | |||||||
195 | 11 | 29 | for ( my $i=0; $i <= $#{$self->head}; ++$i ) { | ||||
220 | 360 | ||||||
196 | 209 | 50 | 66 | 368 | if ($self->head->[$i] eq "" && $i != 11) { | ||
197 | #printf "error: empty slot. fill the transliteration for /%s/! ", |
||||||
198 | # encode::encode("utf8", $han_consonant[$i]); exit(1); | ||||||
199 | } | ||||||
200 | 209 | 50 | 414 | if (exists $self->{HEADMAP}->{$self->head->[$i]}) { | |||
201 | #print_mapping_error($self::head[$i], $self::head{$self::head[$i]}, $i); | ||||||
202 | 0 | 0 | exit(1); | ||||
203 | } else { | ||||||
204 | 209 | 394 | $self->{HEADMAP}->{$self->head->[$i]} = $i; | ||||
205 | }; | ||||||
206 | } | ||||||
207 | |||||||
208 | 11 | 25 | for ( my $i=0; $i <= $#{$self->body}; ++$i ) { | ||||
242 | 415 | ||||||
209 | 231 | 50 | 361 | if ($self->body->[$i] eq "") { | |||
210 | #printf "error: empty slot. fill the transliteration for /%s/! ", |
||||||
211 | # Encode::encode("utf8", $HAN_VOWEL[$i]); | ||||||
212 | 0 | 0 | exit(1); | ||||
213 | } | ||||||
214 | 231 | 50 | 477 | if (exists $self->{BODYMAP}->{$self->body->[$i]}) { | |||
215 | #print_mapping_error($self::BODY[$i], $self::BODY{$self::BODY[$i]}, $i); | ||||||
216 | 0 | 0 | exit(1); | ||||
217 | } else { | ||||||
218 | 231 | 508 | $self->{BODYMAP}->{$self->body->[$i]} = $i; | ||||
219 | }; | ||||||
220 | 231 | 458 | $self->{BODYMAP}->{$self->body->[$i]} = $i; | ||||
221 | } | ||||||
222 | 11 | 26 | for ( my $i=0; $i <= $#{$self->foot}; ++$i ) { | ||||
319 | 674 | ||||||
223 | 308 | 565 | $self->{FOOTMAP}->{$self->foot->[$i]} = $i; | ||||
224 | } | ||||||
225 | |||||||
226 | 11 | 43 | return $self; | ||||
227 | } | ||||||
228 | |||||||
229 | |||||||
230 | # encode($string [,$check]) | ||||||
231 | # = transliteration (romanization) | ||||||
232 | sub encode($$;$) { | ||||||
233 | 0 | 0 | 0 | my ($obj, $str, $chk) = @_; | |||
234 | 0 | my $tr = $obj->transliterate($str); | |||||
235 | 0 | 0 | $_[1] = '' if $chk; | ||||
236 | 0 | return $tr; | |||||
237 | } | ||||||
238 | |||||||
239 | # decode($octets [,$check]) | ||||||
240 | sub decode ($$;$) { | ||||||
241 | 0 | 0 | 0 | my ($obj, $str, $chk) = @_; | |||
242 | 0 | my $han = $obj->hangulize($str); | |||||
243 | 0 | 0 | $_[1] = '' if $chk; | ||||
244 | 0 | return $han; | |||||
245 | } | ||||||
246 | |||||||
247 | # to work with encoding pragma | ||||||
248 | # cat_decode($destination, $octets, $offset, $terminator [,$check]) | ||||||
249 | |||||||
250 | |||||||
251 | |||||||
252 | |||||||
253 | |||||||
254 | |||||||
255 | # = HAN TRANSLITERATOR = | ||||||
256 | # romanizer and hangulizer | ||||||
257 | |||||||
258 | # == hangul composer and decomposer == | ||||||
259 | # | ||||||
260 | # Unicode : 0xAC00 (가) -- 0xD7A3 (힣) | ||||||
261 | # | ||||||
262 | # foot (28 types) : 가각갂갃간갅갆갇갈갉갊갋갌갍갎갏감갑값갓갔강갖갗갘같갚갛 | ||||||
263 | # body (21 types) : 가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기 | ||||||
264 | # head (19 types) : 가까나다따라마바빠사싸아자짜차카타파하 | ||||||
265 | # | ||||||
266 | |||||||
267 | |||||||
268 | # === decompose === | ||||||
269 | # decomposes an unicode hangul chr into a hancode ($head, $body, $foot) | ||||||
270 | # for example, decompose('한') returns (18, 0, 4) | ||||||
271 | sub decompose { | ||||||
272 | 0 | 0 | 0 | my $self = shift; | |||
273 | |||||||
274 | 0 | my($chr) = @_; | |||||
275 | 0 | my $unicode = ord($chr); | |||||
276 | 0 | my $head = int(($unicode - 0xAC00) / (28*21)); | |||||
277 | 0 | my $body = int(($unicode - 0xAC00 - $head*28*21) /28); | |||||
278 | 0 | my $foot = $unicode - 0xAC00 - $head*28*21 - $body*28; | |||||
279 | 0 | return ($head, $body, $foot); | |||||
280 | } | ||||||
281 | |||||||
282 | # === compose === | ||||||
283 | # composes an unicode hangul chr from a hancode ($head, $body, $foot) | ||||||
284 | # for example, compose((18,0,4)) returns '한' | ||||||
285 | sub compose { | ||||||
286 | 0 | 0 | 0 | my $self = shift; | |||
287 | |||||||
288 | 0 | my($head, $body, $foot) = @_; | |||||
289 | 0 | my $unicode = 0xAC00 + $head*28*21 + $body*28 + $foot; | |||||
290 | 0 | return chr($unicode); | |||||
291 | } | ||||||
292 | |||||||
293 | |||||||
294 | |||||||
295 | # == ROMANIZE (TRANSLITERATE) == | ||||||
296 | |||||||
297 | # === transliterates a hangul chr (unicode hangul syllable) === | ||||||
298 | # for example, transliterate('한') returns ('h', 'a', 'n') | ||||||
299 | sub transliterate_chr { | ||||||
300 | 0 | 0 | 0 | my $self = shift; | |||
301 | 0 | my($chr) = @_; | |||||
302 | 0 | my($head,$body,$foot) = $self->decompose($chr); | |||||
303 | #return ($self->head->[$head], $self->body->[$body], $self->foot->[$foot]); | ||||||
304 | 0 | 0 | 0 | if ($self->enmode eq 'greedy' && $head == 11) { | |||
305 | 0 | return $self->body->[$body] . $self->foot->[$foot]; | |||||
306 | } else { | ||||||
307 | 0 | return $self->head->[$head] . $self->body->[$body] . $self->foot->[$foot]; | |||||
308 | } | ||||||
309 | } | ||||||
310 | sub transliterate_first_chr_of_word { | ||||||
311 | 0 | 0 | 0 | my $self = shift; | |||
312 | 0 | my($chr) = @_; | |||||
313 | 0 | my($head, $body, $foot) = $self->decompose ($chr); | |||||
314 | 0 | 0 | if ($head == 11) { | ||||
315 | 0 | return $self->body->[$body] . $self->foot->[$foot]; | |||||
316 | } else { | ||||||
317 | 0 | return $self->head->[$head] . $self->body->[$body] . $self->foot->[$foot]; | |||||
318 | } | ||||||
319 | |||||||
320 | } | ||||||
321 | |||||||
322 | # === transliterate a hangul word === | ||||||
323 | # Transliterates a hangul word (a string containing | ||||||
324 | # only hangul syllables) | ||||||
325 | sub transliterate_hangul_word { | ||||||
326 | 0 | 0 | 0 | my $self = shift; | |||
327 | 0 | my($word) = @_; | |||||
328 | 0 | my(@char) = split //, $word; | |||||
329 | 0 | my $tr = $self->transliterate_first_chr_of_word($char[0]); | |||||
330 | 0 | for (my $i=1; $i <= $#char; ++$i) { | |||||
331 | 0 | 0 | if ($MODE{$self->enmode} == $GREEDY_SEP) { | ||||
332 | 0 | $tr = $tr . $self->sep . $self->transliterate_chr($char[$i]); | |||||
333 | } else { | ||||||
334 | 0 | $tr = $tr . $self->transliterate_chr($char[$i]); | |||||
335 | } | ||||||
336 | } | ||||||
337 | 0 | return $tr; | |||||
338 | } | ||||||
339 | |||||||
340 | # === transliterate a string === | ||||||
341 | # The input string may contain any character. | ||||||
342 | # Transliterates only unicode hangul syllables (AC00-D7A3), | ||||||
343 | # returns other characters including hangul jamo (1100-11F9) | ||||||
344 | # and hangul compatibility jamo. | ||||||
345 | sub transliterate_line { | ||||||
346 | 0 | 0 | 0 | my($str) = @_; | |||
347 | 0 | my $tr; | |||||
348 | 0 | my(@char) = split(//,$str); | |||||
349 | 0 | foreach my $c (@char) { | |||||
350 | 0 | 0 | 0 | if (ord($c)>=0xAC00 && ord($c)<=0xD7A3){ | |||
351 | 0 | $tr = $tr . transliterate_chr($c); | |||||
352 | } else { | ||||||
353 | 0 | $tr = $tr . $c; | |||||
354 | } | ||||||
355 | } | ||||||
356 | 0 | return $tr; | |||||
357 | } | ||||||
358 | |||||||
359 | # === transliterate === | ||||||
360 | # Transliterates word by word | ||||||
361 | sub transliterate { | ||||||
362 | 0 | 0 | 0 | my $self = shift; | |||
363 | |||||||
364 | #my($str) = @_; | ||||||
365 | 0 | my $str = shift; | |||||
366 | 0 | my $tr; | |||||
367 | 0 | my(@word) = split /([^\x{AC00}-\x{D7A3}]+)/, $str; | |||||
368 | 0 | foreach my $w (@word) { | |||||
369 | 0 | 0 | if ($w =~ m/^[\x{AC00}-\x{D7A3}]+$/) { | ||||
370 | 0 | $tr = $tr . $self->transliterate_hangul_word($w); | |||||
371 | } else { | ||||||
372 | 0 | $tr = $tr . $w; | |||||
373 | } | ||||||
374 | } | ||||||
375 | |||||||
376 | 0 | return $tr; | |||||
377 | } | ||||||
378 | |||||||
379 | |||||||
380 | # | ||||||
381 | # == HANGULIZE (REVERSE TRANSLITERATION) == | ||||||
382 | # | ||||||
383 | # H: head, B: body, F: foot | ||||||
384 | # H?BF?(HBF?)* | ||||||
385 | |||||||
386 | # === hangulize === | ||||||
387 | # reverse transliteration : hangulizes a transliterated strings | ||||||
388 | # for example: hangulize('hangugmal') returns '한국말' | ||||||
389 | sub hangulize { | ||||||
390 | 0 | 0 | 0 | my $self = shift; | |||
391 | 0 | my $sep = $self->sep; | |||||
392 | |||||||
393 | 0 | my($str) = @_; | |||||
394 | 0 | my $h; | |||||
395 | |||||||
396 | 0 | 0 | if ($sep ne '') { | ||||
397 | 0 | my @word = split(/\Q$sep\E/, $str); | |||||
398 | 0 | foreach(@word) { $h = $h . $self->get_han($_); } | |||||
0 | |||||||
399 | } else { | ||||||
400 | 0 | $h = $h . $self->get_han($str); | |||||
401 | } | ||||||
402 | 0 | return $h; | |||||
403 | } | ||||||
404 | |||||||
405 | #------------------------------ | ||||||
406 | # hangulizes an array of alphabets into one hangul chr | ||||||
407 | # for example, hangulize_code(('h', 'a', 'n')) returns '한' | ||||||
408 | sub hangulize_code { | ||||||
409 | 0 | 0 | 0 | my $self = shift; | |||
410 | |||||||
411 | 0 | my($head, $body, $foot) = @_; | |||||
412 | 0 | my @hancode = ($self->{HEADMAP}->{$head}, $self->{BODYMAP}->{$body}, $self->{FOOTMAP}->{$foot}); | |||||
413 | 0 | return $self->compose(@hancode); | |||||
414 | } | ||||||
415 | |||||||
416 | |||||||
417 | #------------------------------- | ||||||
418 | # lookup $str, @list_of_jamo_transliteration | ||||||
419 | # eg. lookup('ssan', @CONSONANT) returns ('ss', 'an') | ||||||
420 | # where @CONSONANT has an item 'ss' | ||||||
421 | sub lookup { | ||||||
422 | 0 | 0 | 0 | my $self = shift; | |||
423 | |||||||
424 | 0 | my($str, @where) = @_; | |||||
425 | 0 | my $found = $NotFound; | |||||
426 | 0 | my $rest = $str; | |||||
427 | 0 | foreach(@where) { | |||||
428 | 0 | 0 | if ($_ eq substr($str, 0, length($_))) { | ||||
429 | 0 | 0 | if ($found eq $NotFound) { | ||||
0 | |||||||
430 | 0 | $found = $_; | |||||
431 | 0 | $rest = substr($str, length($_)); | |||||
432 | } elsif (length($found) < length($_)) { | ||||||
433 | 0 | $found = $_; | |||||
434 | 0 | $rest = substr($str, length($_)); | |||||
435 | } | ||||||
436 | } | ||||||
437 | } | ||||||
438 | # if($found eq $NotFound) { | ||||||
439 | # if(@where == @HEAD) {$found = $HEAD[11];} | ||||||
440 | # elsif (@where == @BODY) {$found = $NotFound;} | ||||||
441 | # elsif (@where == @FOOT) {$found = $FOOT[0];} | ||||||
442 | # $rest = $str; | ||||||
443 | # } | ||||||
444 | 0 | return ($found, $rest); | |||||
445 | } | ||||||
446 | |||||||
447 | #------------------------------- | ||||||
448 | # $SEP = "/"; $NAUGHT = "'"; | ||||||
449 | # isse = 이써 | ||||||
450 | # iss'e = 있어 : is/se = 잇서 | ||||||
451 | # ibsi = 입시 | ||||||
452 | # ibs'i = 잆이 | ||||||
453 | # ibsse = 입써 : ibs/se = 잆서 | ||||||
454 | # ibssse = 잆써 | ||||||
455 | |||||||
456 | #------------------------------- | ||||||
457 | # get_head($str) | ||||||
458 | # eg. get_head("ssan") retunrs ("ss", "an") | ||||||
459 | sub get_head { | ||||||
460 | 0 | 0 | 0 | my $self = shift; | |||
461 | |||||||
462 | 0 | my($str) = @_; | |||||
463 | 0 | my($head, $rest) = $self->lookup($str, @{$self->head}); | |||||
0 | |||||||
464 | 0 | return ($head, $rest); | |||||
465 | } | ||||||
466 | |||||||
467 | #------------------------------- | ||||||
468 | # get_body($str) | ||||||
469 | # eg. get_body("wan") returns ("wa", "n") | ||||||
470 | sub get_body { | ||||||
471 | 0 | 0 | 0 | my $self = shift; | |||
472 | |||||||
473 | 0 | my($str) = @_; | |||||
474 | 0 | my($body, $rest) = $self->lookup($str, @{$self->body}); | |||||
0 | |||||||
475 | 0 | return ($body, $rest); | |||||
476 | } | ||||||
477 | #------------------------------- | ||||||
478 | # get_foot($str) | ||||||
479 | # eg. get_foot("bssan") returns ("bs", "san") | ||||||
480 | sub get_foot { | ||||||
481 | 0 | 0 | 0 | my $self = shift; | |||
482 | |||||||
483 | 0 | my($str) = @_; | |||||
484 | 0 | my($foot, $rest) = $self->lookup($str, @{$self->foot}); | |||||
0 | |||||||
485 | 0 | return ($foot, $rest); | |||||
486 | } | ||||||
487 | |||||||
488 | #------------------------------- | ||||||
489 | # look_ahead for the next head - body sequence | ||||||
490 | # case : | ||||||
491 | # normal : look_ahead("mal") == "m"; | ||||||
492 | # no_head: look_ahead("an") == ""; | ||||||
493 | # no_body: look_ahead("kkkkk") eq $NotFound; | ||||||
494 | sub look_ahead { | ||||||
495 | 0 | 0 | 0 | my $self = shift; | |||
496 | |||||||
497 | 0 | my ($right) = @_; | |||||
498 | 0 | my $head; | |||||
499 | my $body; | ||||||
500 | 0 | ($head, $right) = $self->get_head($right); | |||||
501 | 0 | ($body, $right) = $self->get_body($right); | |||||
502 | |||||||
503 | 0 | 0 | if ($body eq $NotFound) { return $NotFound;} | ||||
0 | 0 | ||||||
0 | |||||||
504 | elsif($head eq $NotFound) {return "";} | ||||||
505 | 0 | else { return $head;} | |||||
506 | } | ||||||
507 | |||||||
508 | #------------------------------- | ||||||
509 | # get a hangul string from a transliteration : | ||||||
510 | # Makes the first hangul syllable from a transliterated string | ||||||
511 | # and recursively processes the rest. | ||||||
512 | # for example: get_han('hangugmal') returns unicode string '한국말' | ||||||
513 | sub get_han { | ||||||
514 | 0 | 0 | 0 | my $self = shift; | |||
515 | 0 | my $NAUGHT = $self->naught; | |||||
516 | 0 | my $FILL = ""; # jongseong filler | |||||
517 | |||||||
518 | 0 | my ($right) = @_; | |||||
519 | 0 | my $head; | |||||
520 | my $body; | ||||||
521 | 0 | my $foot; | |||||
522 | 0 | my $look_ahead_token; | |||||
523 | 0 | my $h; | |||||
524 | |||||||
525 | |||||||
526 | 0 | show_process(0, "begin", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
527 | |||||||
528 | 0 | ($head, $right) = $self->get_head($right); | |||||
529 | 0 | show_process(1, "get_head", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
530 | |||||||
531 | 0 | ($body, $right) = $self->get_body($right); | |||||
532 | 0 | show_process(2, "get_body", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
533 | |||||||
534 | 0 | 0 | 0 | if ($head eq $NotFound && $body eq $NotFound ) { | |||
0 | 0 | ||||||
535 | 0 | $h = $h . substr($right,0,1); | |||||
536 | 0 | show_process(21, "no head", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
537 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han(substr($right,1));} | ||||
0 | |||||||
538 | } elsif ($head ne $NotFound && $body eq $NotFound) { | ||||||
539 | 0 | $h = $h . $head; | |||||
540 | 0 | show_process(22, "no body", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
541 | 0 | 0 | if($right ne "") {$h = $h . substr($right, 0, 1) . $self->get_han(substr($right,1));} | ||||
0 | |||||||
542 | } else { | ||||||
543 | 0 | 0 | if($head eq $NotFound) { $head = $NAUGHT; } | ||||
0 | |||||||
544 | 0 | ($foot, $right) = $self->get_foot($right); | |||||
545 | 0 | show_process(3, "get_foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
546 | 0 | 0 | 0 | if ($foot eq $NotFound || $foot eq $FILL) { | |||
0 | |||||||
547 | 0 | $h = $h . $self->hangulize_code($head, $body, $FILL); | |||||
548 | 0 | show_process(31, "no foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
549 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
0 | |||||||
550 | } elsif($right eq "") { | ||||||
551 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
552 | 0 | show_process(32, "eof", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
553 | } else { | ||||||
554 | 0 | $look_ahead_token = $self->look_ahead($right); | |||||
555 | 0 | show_process(4, "look_ahead", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
556 | 0 | 0 | 0 | if ($look_ahead_token eq $NotFound || $look_ahead_token eq $NAUGHT) { | |||
557 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
558 | 0 | show_process(41, "no look", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
559 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
0 | |||||||
560 | } else { | ||||||
561 | 0 | ($foot, $right) = $self->get_correct_foot($foot, $look_ahead_token, $right); | |||||
562 | 0 | $h = $h . $self->hangulize_code($head, $body, $foot); | |||||
563 | 0 | show_process(42, "get_correct_foot", $h, $head, $body, $foot, $look_ahead_token, $right); | |||||
564 | 0 | 0 | if($right ne "") {$h = $h . $self->get_han($right);} | ||||
0 | |||||||
565 | } | ||||||
566 | |||||||
567 | } | ||||||
568 | } | ||||||
569 | 0 | return $h; | |||||
570 | } | ||||||
571 | |||||||
572 | |||||||
573 | |||||||
574 | $, = "\t"; | ||||||
575 | sub show_process { | ||||||
576 | 0 | 0 | 0 | if(0) { | |||
577 | my($id, $desc, $h, $head, $body, $foot, $look_ahead_token, $right) = @_; | ||||||
578 | print $id , $desc, $h, $head, $body, $foot, $look_ahead_token, $right, "\n"; | ||||||
579 | } | ||||||
580 | } | ||||||
581 | |||||||
582 | #------------------------------- | ||||||
583 | # correct foot | ||||||
584 | # |
||||||
585 | # |
||||||
586 | # |
||||||
587 | #my $foot_p, my $look_ahead_token, my $right_p; | ||||||
588 | #my $foot, my $right; | ||||||
589 | sub get_correct_foot { | ||||||
590 | 0 | 0 | 0 | my $self = shift; | |||
591 | |||||||
592 | 0 | my ($foot_p, $look_ahead_token, $right_p) = @_; | |||||
593 | 0 | my $foot, my $right; | |||||
594 | 0 | $foot_p = $foot_p . $look_ahead_token;; | |||||
595 | 0 | $right_p = substr($right_p, length($look_ahead_token)); | |||||
596 | 0 | $foot = $foot_p; | |||||
597 | 0 | $right = $right_p; | |||||
598 | 0 | my $found = $NotFound; | |||||
599 | |||||||
600 | 0 | foreach(@{$self->head}) { | |||||
0 | |||||||
601 | 0 | 0 | if ($_ eq substr($foot_p, length($foot_p) - length($_))) { | ||||
602 | 0 | 0 | if ($found eq $NotFound) { | ||||
0 | |||||||
603 | 0 | $found = $_; | |||||
604 | 0 | $foot = substr($foot_p, 0, length($foot)-length($found)); | |||||
605 | 0 | $right = $found . $right_p; | |||||
606 | } elsif (length($found) < length($_)) { | ||||||
607 | 0 | $found = $_; | |||||
608 | 0 | $foot = substr($foot_p, 0, length($foot_p)-length($found)); | |||||
609 | 0 | $right = $found . $right_p; | |||||
610 | } | ||||||
611 | } | ||||||
612 | } | ||||||
613 | |||||||
614 | 0 | return ($foot, $right); | |||||
615 | } | ||||||
616 | |||||||
617 | 1; | ||||||
618 | __END__ |