line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::LO::NLP::Analyze; |
2
|
6
|
|
|
6
|
|
86363
|
use strict; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
139
|
|
3
|
6
|
|
|
6
|
|
24
|
use warnings; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
119
|
|
4
|
6
|
|
|
6
|
|
84
|
use 5.012000; |
|
6
|
|
|
|
|
17
|
|
5
|
6
|
|
|
6
|
|
26
|
use utf8; |
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
27
|
|
6
|
6
|
|
|
6
|
|
148
|
use feature qw/ unicode_strings say /; |
|
6
|
|
|
|
|
7
|
|
|
6
|
|
|
|
|
406
|
|
7
|
6
|
|
|
6
|
|
30
|
use charnames qw/ :full lao /; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
26
|
|
8
|
6
|
|
|
6
|
|
2821
|
use version 0.77; our $VERSION = version->declare('v1.0.1'); |
|
6
|
|
|
|
|
1328
|
|
|
6
|
|
|
|
|
30
|
|
9
|
6
|
|
|
6
|
|
790
|
use Unicode::Normalize 'NFC'; |
|
6
|
|
|
|
|
1639
|
|
|
6
|
|
|
|
|
261
|
|
10
|
6
|
|
|
6
|
|
29
|
use Carp; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
255
|
|
11
|
6
|
|
|
6
|
|
262
|
use Class::Accessor::Fast 'antlers'; |
|
6
|
|
|
|
|
1749
|
|
|
6
|
|
|
|
|
45
|
|
12
|
6
|
|
|
6
|
|
772
|
use Lingua::LO::NLP::Data ':all'; |
|
6
|
|
|
|
|
13
|
|
|
6
|
|
|
|
|
637
|
|
13
|
|
|
|
|
|
|
|
14
|
6
|
|
|
6
|
|
44
|
use constant SUNG => 0; # "high class" |
|
6
|
|
|
|
|
19
|
|
|
6
|
|
|
|
|
371
|
|
15
|
6
|
|
|
6
|
|
32
|
use constant KANG => 1; # "middle class" |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
217
|
|
16
|
6
|
|
|
6
|
|
27
|
use constant TAM => 2; # "low class" |
|
6
|
|
|
|
|
9
|
|
|
6
|
|
|
|
|
549
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=encoding utf8 |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 NAME |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Lingua::LO::NLP::Analyze - Analyze a Lao syllable and provide accessors to its constituents |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 FUNCTION |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Objects of this class represent a Lao syllable with an analysis of its |
27
|
|
|
|
|
|
|
constituents. After passing a valid syllable to the constructor, the parts are |
28
|
|
|
|
|
|
|
available via accessor methods as outlined below. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=cut |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
for my $attribute (qw/ syllable parse vowel consonant end_consonant vowel_length tone tone_mark h semivowel live /) { |
33
|
|
|
|
|
|
|
has $attribute => (is => 'ro'); |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# This is a 2-level lookup table. The first level is the tone mark, the second |
37
|
|
|
|
|
|
|
# is the consonant class (SUNG/KANG/TAM, see constant definitions) |
38
|
|
|
|
|
|
|
my %TONE_MARKS = ( |
39
|
|
|
|
|
|
|
"\N{LAO TONE MAI EK}" => [ qw/ MID MID MID / ], |
40
|
|
|
|
|
|
|
"\N{LAO TONE MAI THO}" => [ qw/ MID_FALLING HIGH_FALLING HIGH_FALLING / ], |
41
|
|
|
|
|
|
|
# TODO: is this HIGH or HIGH_FALLING? Opinions seem to differ |
42
|
|
|
|
|
|
|
# and I haven't found a definitive source yet |
43
|
|
|
|
|
|
|
"\N{LAO TONE MAI TI}" => [ qw/ HIGH HIGH HIGH / ], |
44
|
|
|
|
|
|
|
"\N{LAO TONE MAI CATAWA}" => [ qw/ RISING RISING RISING /], |
45
|
|
|
|
|
|
|
); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# This is a 2-level lookup table. The first level is the consonant class |
48
|
|
|
|
|
|
|
# (SUNG/KANG/TAM, see constant definitions), the second is an index as |
49
|
|
|
|
|
|
|
# calculated in classify(): 0 for live, 1 for dead+short, 2 for dead+long |
50
|
|
|
|
|
|
|
my @TONE_NOMARK = ( |
51
|
|
|
|
|
|
|
[qw/ RISING HIGH MID_FALLING /], # SUNG/high |
52
|
|
|
|
|
|
|
[qw/ LOW HIGH MID_FALLING /], # KANG/mid |
53
|
|
|
|
|
|
|
[qw/ HIGH MID HIGH_FALLING /], # TAM/low |
54
|
|
|
|
|
|
|
); |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
my %CONSONANTS = ( |
57
|
|
|
|
|
|
|
'ກ' => KANG, |
58
|
|
|
|
|
|
|
'ຂ' => SUNG, |
59
|
|
|
|
|
|
|
'ຄ' => TAM, |
60
|
|
|
|
|
|
|
'ງ' => TAM, |
61
|
|
|
|
|
|
|
'ຈ' => KANG, |
62
|
|
|
|
|
|
|
'ສ' => SUNG, |
63
|
|
|
|
|
|
|
'ຊ' => TAM, |
64
|
|
|
|
|
|
|
'ຍ' => TAM, |
65
|
|
|
|
|
|
|
'ດ' => KANG, |
66
|
|
|
|
|
|
|
'ຕ' => KANG, |
67
|
|
|
|
|
|
|
'ຖ' => SUNG, |
68
|
|
|
|
|
|
|
'ທ' => TAM, |
69
|
|
|
|
|
|
|
'ນ' => TAM, |
70
|
|
|
|
|
|
|
'ບ' => KANG, |
71
|
|
|
|
|
|
|
'ປ' => KANG, |
72
|
|
|
|
|
|
|
'ຜ' => SUNG, |
73
|
|
|
|
|
|
|
'ຝ' => SUNG, |
74
|
|
|
|
|
|
|
'ພ' => TAM, |
75
|
|
|
|
|
|
|
'ຟ' => TAM, |
76
|
|
|
|
|
|
|
'ມ' => TAM, |
77
|
|
|
|
|
|
|
'ຢ' => KANG, |
78
|
|
|
|
|
|
|
'ລ' => TAM, |
79
|
|
|
|
|
|
|
'ວ' => TAM, |
80
|
|
|
|
|
|
|
'ຫ' => SUNG, |
81
|
|
|
|
|
|
|
'ອ' => KANG, |
82
|
|
|
|
|
|
|
'ຮ' => TAM, |
83
|
|
|
|
|
|
|
'ຣ' => TAM, |
84
|
|
|
|
|
|
|
'ຫງ' => SUNG, |
85
|
|
|
|
|
|
|
'ຫຍ' => SUNG, |
86
|
|
|
|
|
|
|
'ຫນ' => SUNG, |
87
|
|
|
|
|
|
|
'ໜ' => SUNG, |
88
|
|
|
|
|
|
|
'ຫມ' => SUNG, |
89
|
|
|
|
|
|
|
'ໝ' => SUNG, |
90
|
|
|
|
|
|
|
'ຫລ' => SUNG, |
91
|
|
|
|
|
|
|
'ຫຼ' => SUNG, |
92
|
|
|
|
|
|
|
'ຫວ' => SUNG, |
93
|
|
|
|
|
|
|
); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
my %CONS_H_MNL = ( 'ມ' => 'ໝ', 'ນ' => 'ໜ', 'ລ' => "\N{LAO SEMIVOWEL SIGN LO}" ); |
96
|
|
|
|
|
|
|
my %ENDCONS_STOP = ( 'ກ' => 1, 'ດ' => 1, 'ບ' => 1 ); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head1 METHODS |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head2 new |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
C |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
The constructor takes a syllable and any number of options as hash-style |
105
|
|
|
|
|
|
|
arguments. The only option specified so far is C, a boolean value |
106
|
|
|
|
|
|
|
indicating whether to run the syllable through |
107
|
|
|
|
|
|
|
L and tone mark normalization |
108
|
|
|
|
|
|
|
(see L). Set this if you are unsure |
109
|
|
|
|
|
|
|
that your text is well-formed according to Unicode rules. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub new { |
114
|
290
|
|
|
290
|
1
|
108862
|
my $class = shift; |
115
|
290
|
|
|
|
|
439
|
my $syllable = shift; |
116
|
290
|
|
|
|
|
487
|
my %opts = @_; |
117
|
290
|
100
|
|
|
|
606
|
if($opts{normalize}) { |
118
|
86
|
|
|
|
|
427
|
$syllable = NFC($syllable); |
119
|
86
|
|
|
|
|
202
|
normalize_tone_marks($syllable); |
120
|
|
|
|
|
|
|
} |
121
|
290
|
|
|
|
|
527
|
return bless _classify($syllable), $class; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
{ |
125
|
|
|
|
|
|
|
my $regexp = get_sylre_named(); |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub _classify { |
128
|
290
|
|
66
|
290
|
|
711
|
my $s = shift // croak("`syllable' argument missing or undefined"); |
129
|
|
|
|
|
|
|
|
130
|
289
|
50
|
|
|
|
6945
|
$s =~ /^$regexp/ or croak("`$s' does not start with a valid syllable"); |
131
|
|
|
|
|
|
|
|
132
|
289
|
|
|
|
|
6338
|
my %class = ( |
133
|
|
|
|
|
|
|
syllable => $s, |
134
|
4
|
|
|
4
|
|
2814
|
parse => { %+ } |
|
4
|
|
|
|
|
1107
|
|
|
4
|
|
|
|
|
204
|
|
135
|
|
|
|
|
|
|
); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
(my $consonant, my $end_consonant, @class{qw/ h semivowel tone_mark /}) = |
138
|
289
|
|
|
|
|
3093
|
@+{qw/ consonant end_consonant h semivowel tone_mark /}; |
139
|
|
|
|
|
|
|
|
140
|
289
|
|
100
|
|
|
1628
|
my @vowels = $+{vowel0} // (); |
141
|
289
|
|
|
|
|
633
|
push @vowels, "\N{DOTTED CIRCLE}"; |
142
|
289
|
|
|
|
|
878
|
push @vowels, grep { defined } @+{qw/ vowel1 vowel2 vowel3 /}; |
|
867
|
|
|
|
|
2288
|
|
143
|
289
|
|
|
|
|
952
|
$class{vowel} = join('', @vowels); |
144
|
|
|
|
|
|
|
|
145
|
289
|
|
|
|
|
560
|
my $cc = $CONSONANTS{ $consonant }; # consonant category |
146
|
289
|
100
|
|
|
|
558
|
if( $class{h} ) { |
147
|
10
|
|
|
|
|
19
|
$cc = SUNG; # $CONSONANTS{'ຫ'} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# If consonant is one of ມ, ນ or ລ *and* no vowel precedes the ຫ, |
150
|
|
|
|
|
|
|
# pretend we saw the combined form |
151
|
10
|
100
|
100
|
|
|
61
|
if(exists $CONS_H_MNL{ $consonant } and not $+{vowel0}) { |
152
|
1
|
|
|
|
|
3
|
$class{consonant} = $CONS_H_MNL{ $consonant }; |
153
|
1
|
|
|
|
|
3
|
delete $class{h}; |
154
|
|
|
|
|
|
|
} else { |
155
|
|
|
|
|
|
|
# If there is a preceding vowel, it uses the ຫ as a consonant and the |
156
|
|
|
|
|
|
|
# one parsed as core consonant is actually an end consonant |
157
|
9
|
100
|
100
|
|
|
50
|
unless($consonant eq 'ວ' or $consonant eq 'ຍ') { |
158
|
6
|
|
|
|
|
9
|
$end_consonant = $consonant; |
159
|
6
|
|
|
|
|
8
|
$consonant = 'ຫ'; |
160
|
6
|
|
|
|
|
14
|
delete $class{h}; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Set both $class{vowel_length} and a quick flag that we'll need later |
166
|
289
|
|
|
|
|
380
|
my $long_vowel = 1; |
167
|
289
|
100
|
|
|
|
815
|
if(is_long_vowel( $class{vowel} )) { |
168
|
177
|
|
|
|
|
317
|
$class{vowel_length} = 'long'; |
169
|
|
|
|
|
|
|
} else { |
170
|
112
|
|
|
|
|
189
|
$class{vowel_length} = 'short'; |
171
|
112
|
|
|
|
|
161
|
$long_vowel = 0; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Determine syllable liveness. |
175
|
289
|
|
|
|
|
396
|
my $live; |
176
|
289
|
100
|
|
|
|
468
|
if( defined $end_consonant ) { |
177
|
|
|
|
|
|
|
# If we have an end consonant, a syllable is considered live if the |
178
|
|
|
|
|
|
|
# former is not a stopped consonant |
179
|
116
|
100
|
|
|
|
310
|
$live = exists $ENDCONS_STOP{ $end_consonant } ? 0 : 1; |
180
|
|
|
|
|
|
|
} else { |
181
|
|
|
|
|
|
|
# Syllables without an end consonant are live iff the vowel is long |
182
|
173
|
|
|
|
|
225
|
$live = $long_vowel; |
183
|
|
|
|
|
|
|
} |
184
|
289
|
|
|
|
|
680
|
$class{live} = $live; |
185
|
|
|
|
|
|
|
|
186
|
289
|
100
|
|
|
|
512
|
if(defined $class{tone_mark}) { |
187
|
|
|
|
|
|
|
# If a tone mark exists, it and the consonant's class |
188
|
|
|
|
|
|
|
# determine the tone |
189
|
66
|
|
|
|
|
166
|
$class{tone} = $TONE_MARKS{ $class{tone_mark} }[$cc]; |
190
|
|
|
|
|
|
|
} else { |
191
|
|
|
|
|
|
|
# No tone mark, so calculate the index |
192
|
223
|
100
|
|
|
|
559
|
$class{tone} = $TONE_NOMARK[$cc][ $live ? 0 : $long_vowel + 1 ]; |
193
|
|
|
|
|
|
|
} |
194
|
289
|
|
|
|
|
508
|
$class{consonant} = $consonant; |
195
|
289
|
100
|
|
|
|
557
|
$class{end_consonant} = $end_consonant if defined $end_consonant; |
196
|
|
|
|
|
|
|
#say Dumper(\%class); |
197
|
289
|
|
|
|
|
1650
|
return \%class; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head2 ACCESSORS |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head3 syllable |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
The original syllable as used by the parser. This may be subtly different from |
207
|
|
|
|
|
|
|
the one passed to the constructor: |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=over 4 |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=item |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
If the C option was set, tone marks and vowels may have been reordered |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=item |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
If the decomposed form of LAO VOWEL SIGN AM (◌າ) is used, it will have been |
218
|
|
|
|
|
|
|
converted to the composed form |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Combinations of ຫ with ລ, ມ or ນ will have been converted to the combined characters. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=back |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=head3 parse |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
A hash of raw constituents as returned by the parsing regexp. Although the |
229
|
|
|
|
|
|
|
other accessors present constituents in a more accessible way and take care of |
230
|
|
|
|
|
|
|
morphological special cases like the treatment of ຫ, this may come in handy to |
231
|
|
|
|
|
|
|
quickly check e.g. if there was a vowel component before the core consonant. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=head3 vowel |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
The syllable's vowel or diphthong. As the majority of vowels have more than one |
236
|
|
|
|
|
|
|
code point, the consonant position is represented by the Unicode character |
237
|
|
|
|
|
|
|
designated for this function, DOTTED CIRCLE or U+25CC. |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=head3 consonant |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
The syllable's core consonant. |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=head3 end_consonant |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
The end consonant if present, C otherwise. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head3 tone_mark |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
The tone mark if present, C otherwise. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head3 semivowel |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
The semivowel following the core consonant if present, C otherwise. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head3 h |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
"ຫ" if the syllable contained a combining ຫ, i.e. one that isn't the core consonant. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head3 vowel_length |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
The string 'long' or 'short'. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head3 live |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Boolean indicating whether this is a "live" or a "dead" syllable. Dead |
266
|
|
|
|
|
|
|
syllables end in a short vowel or stopped consonant (ກ, ດ or ບ), lives ones end |
267
|
|
|
|
|
|
|
in a long vowel, diphthong, semivowel or nasal consonant. This is used for tone |
268
|
|
|
|
|
|
|
determination but also available as an attribute, just in case it might be |
269
|
|
|
|
|
|
|
useful. C indicates a live syllable. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=head3 tone |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
One of the following strings, depending on core consonant class, vowel length and tone mark: |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=over 4 |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item LOW_RISING |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=item LOW |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item MID |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=item HIGH |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=item MID_FALLING |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=item HIGH_FALLING |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=back |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
The latter two occur with short vowels, the other ones with long vowels. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=cut |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
1; |
296
|
|
|
|
|
|
|
|