File Coverage

blib/lib/Lingua/LO/NLP/Analyze.pm
Criterion Covered Total %
statement 86 86 100.0
branch 21 22 95.4
condition 11 12 91.6
subroutine 17 17 100.0
pod 1 1 100.0
total 136 138 98.5


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Analyze;
2 6     6   150408 use strict;
  6         17  
  6         190  
3 6     6   34 use warnings;
  6         12  
  6         172  
4 6     6   116 use 5.012000;
  6         38  
5 6     6   65 use utf8;
  6         16  
  6         38  
6 6     6   193 use feature qw/ unicode_strings say /;
  6         16  
  6         557  
7 6     6   35 use charnames qw/ :full lao /;
  6         14  
  6         47  
8 6     6   3607 use version 0.77; our $VERSION = version->declare('v1.0.1');
  6         1668  
  6         49  
9 6     6   1000 use Unicode::Normalize 'NFC';
  6         1587  
  6         414  
10 6     6   43 use Carp;
  6         15  
  6         374  
11 6     6   276 use Class::Accessor::Fast 'antlers';
  6         2248  
  6         49  
12 6     6   1025 use Lingua::LO::NLP::Data ':all';
  6         22  
  6         988  
13              
14 6     6   81 use constant SUNG => 0; # "high class"
  6         28  
  6         604  
15 6     6   46 use constant KANG => 1; # "middle class"
  6         15  
  6         356  
16 6     6   45 use constant TAM => 2; # "low class"
  6         15  
  6         838  
17              
18             =encoding utf8
19              
20             =head1 NAME
21              
22             Lingua::LO::NLP::Analyze - Analyze a Lao syllable and provide accessors to its constituents
23              
24             =head1 FUNCTION
25              
26             Objects of this class represent a Lao syllable with an analysis of its
27             constituents. After passing a valid syllable to the constructor, the parts are
28             available via accessor methods as outlined below.
29              
30             =cut
31              
32             for my $attribute (qw/ syllable parse vowel consonant end_consonant vowel_length tone tone_mark h semivowel live /) {
33             has $attribute => (is => 'ro');
34             }
35              
36             # This is a 2-level lookup table. The first level is the tone mark, the second
37             # is the consonant class (SUNG/KANG/TAM, see constant definitions)
38             my %TONE_MARKS = (
39             "\N{LAO TONE MAI EK}" => [ qw/ MID MID MID / ],
40             "\N{LAO TONE MAI THO}" => [ qw/ MID_FALLING HIGH_FALLING HIGH_FALLING / ],
41             # TODO: is this HIGH or HIGH_FALLING? Opinions seem to differ
42             # and I haven't found a definitive source yet
43             "\N{LAO TONE MAI TI}" => [ qw/ HIGH HIGH HIGH / ],
44             "\N{LAO TONE MAI CATAWA}" => [ qw/ RISING RISING RISING /],
45             );
46              
47             # This is a 2-level lookup table. The first level is the consonant class
48             # (SUNG/KANG/TAM, see constant definitions), the second is an index as
49             # calculated in classify(): 0 for live, 1 for dead+short, 2 for dead+long
50             my @TONE_NOMARK = (
51             [qw/ RISING HIGH MID_FALLING /], # SUNG/high
52             [qw/ LOW HIGH MID_FALLING /], # KANG/mid
53             [qw/ HIGH MID HIGH_FALLING /], # TAM/low
54             );
55              
56             my %CONSONANTS = (
57             'ກ' => KANG,
58             'ຂ' => SUNG,
59             'ຄ' => TAM,
60             'ງ' => TAM,
61             'ຈ' => KANG,
62             'ສ' => SUNG,
63             'ຊ' => TAM,
64             'ຍ' => TAM,
65             'ດ' => KANG,
66             'ຕ' => KANG,
67             'ຖ' => SUNG,
68             'ທ' => TAM,
69             'ນ' => TAM,
70             'ບ' => KANG,
71             'ປ' => KANG,
72             'ຜ' => SUNG,
73             'ຝ' => SUNG,
74             'ພ' => TAM,
75             'ຟ' => TAM,
76             'ມ' => TAM,
77             'ຢ' => KANG,
78             'ລ' => TAM,
79             'ວ' => TAM,
80             'ຫ' => SUNG,
81             'ອ' => KANG,
82             'ຮ' => TAM,
83             'ຣ' => TAM,
84             'ຫງ' => SUNG,
85             'ຫຍ' => SUNG,
86             'ຫນ' => SUNG,
87             'ໜ' => SUNG,
88             'ຫມ' => SUNG,
89             'ໝ' => SUNG,
90             'ຫລ' => SUNG,
91             'ຫຼ' => SUNG,
92             'ຫວ' => SUNG,
93             );
94              
95             my %CONS_H_MNL = ( 'ມ' => 'ໝ', 'ນ' => 'ໜ', 'ລ' => "\N{LAO SEMIVOWEL SIGN LO}" );
96             my %ENDCONS_STOP = ( 'ກ' => 1, 'ດ' => 1, 'ບ' => 1 );
97              
98             =head1 METHODS
99              
100             =head2 new
101              
102             C
103              
104             The constructor takes a syllable and any number of options as hash-style
105             arguments. The only option specified so far is C, a boolean value
106             indicating whether to run the syllable through
107             L and tone mark normalization
108             (see L). Set this if you are unsure
109             that your text is well-formed according to Unicode rules.
110              
111             =cut
112              
113             sub new {
114 290     290 1 139164 my $class = shift;
115 290         445 my $syllable = shift;
116 290         536 my %opts = @_;
117 290 100       646 if($opts{normalize}) {
118 86         466 $syllable = NFC($syllable);
119 86         252 normalize_tone_marks($syllable);
120             }
121 290         534 return bless _classify($syllable), $class;
122             }
123              
124             {
125             my $regexp = get_sylre_named();
126              
127             sub _classify {
128 290   66 290   756 my $s = shift // croak("`syllable' argument missing or undefined");
129              
130 289 50       7628 $s =~ /^$regexp/ or croak("`$s' does not start with a valid syllable");
131              
132 289         6855 my %class = (
133             syllable => $s,
134 4     4   6000 parse => { %+ }
  4         1743  
  4         322  
135             );
136              
137             (my $consonant, my $end_consonant, @class{qw/ h semivowel tone_mark /}) =
138 289         3407 @+{qw/ consonant end_consonant h semivowel tone_mark /};
139              
140 289   100     1988 my @vowels = $+{vowel0} // ();
141 289         735 push @vowels, "\N{DOTTED CIRCLE}";
142 289         997 push @vowels, grep { defined } @+{qw/ vowel1 vowel2 vowel3 /};
  867         2769  
143 289         1045 $class{vowel} = join('', @vowels);
144              
145 289         577 my $cc = $CONSONANTS{ $consonant }; # consonant category
146 289 100       605 if( $class{h} ) {
147 10         20 $cc = SUNG; # $CONSONANTS{'ຫ'}
148              
149             # If consonant is one of ມ, ນ or ລ *and* no vowel precedes the ຫ,
150             # pretend we saw the combined form
151 10 100 100     52 if(exists $CONS_H_MNL{ $consonant } and not $+{vowel0}) {
152 1         2 $class{consonant} = $CONS_H_MNL{ $consonant };
153 1         3 delete $class{h};
154             } else {
155             # If there is a preceding vowel, it uses the ຫ as a consonant and the
156             # one parsed as core consonant is actually an end consonant
157 9 100 100     46 unless($consonant eq 'ວ' or $consonant eq 'ຍ') {
158 6         12 $end_consonant = $consonant;
159 6         11 $consonant = 'ຫ';
160 6         16 delete $class{h};
161             }
162             }
163             }
164              
165             # Set both $class{vowel_length} and a quick flag that we'll need later
166 289         422 my $long_vowel = 1;
167 289 100       808 if(is_long_vowel( $class{vowel} )) {
168 177         328 $class{vowel_length} = 'long';
169             } else {
170 112         220 $class{vowel_length} = 'short';
171 112         174 $long_vowel = 0;
172             }
173              
174             # Determine syllable liveness.
175 289         452 my $live;
176 289 100       510 if( defined $end_consonant ) {
177             # If we have an end consonant, a syllable is considered live if the
178             # former is not a stopped consonant
179 116 100       252 $live = exists $ENDCONS_STOP{ $end_consonant } ? 0 : 1;
180             } else {
181             # Syllables without an end consonant are live iff the vowel is long
182 173         241 $live = $long_vowel;
183             }
184 289         724 $class{live} = $live;
185              
186 289 100       548 if(defined $class{tone_mark}) {
187             # If a tone mark exists, it and the consonant's class
188             # determine the tone
189 66         166 $class{tone} = $TONE_MARKS{ $class{tone_mark} }[$cc];
190             } else {
191             # No tone mark, so calculate the index
192 223 100       581 $class{tone} = $TONE_NOMARK[$cc][ $live ? 0 : $long_vowel + 1 ];
193             }
194 289         504 $class{consonant} = $consonant;
195 289 100       609 $class{end_consonant} = $end_consonant if defined $end_consonant;
196             #say Dumper(\%class);
197 289         1439 return \%class;
198             }
199             }
200              
201             =head2 ACCESSORS
202              
203              
204             =head3 syllable
205              
206             The original syllable as used by the parser. This may be subtly different from
207             the one passed to the constructor:
208              
209             =over 4
210              
211             =item
212              
213             If the C option was set, tone marks and vowels may have been reordered
214              
215             =item
216              
217             If the decomposed form of LAO VOWEL SIGN AM (◌າ) is used, it will have been
218             converted to the composed form
219              
220             =item
221              
222             Combinations of ຫ with ລ, ມ or ນ will have been converted to the combined characters.
223              
224             =back
225              
226             =head3 parse
227              
228             A hash of raw constituents as returned by the parsing regexp. Although the
229             other accessors present constituents in a more accessible way and take care of
230             morphological special cases like the treatment of ຫ, this may come in handy to
231             quickly check e.g. if there was a vowel component before the core consonant.
232              
233             =head3 vowel
234              
235             The syllable's vowel or diphthong. As the majority of vowels have more than one
236             code point, the consonant position is represented by the Unicode character
237             designated for this function, DOTTED CIRCLE or U+25CC.
238              
239             =head3 consonant
240              
241             The syllable's core consonant.
242              
243             =head3 end_consonant
244              
245             The end consonant if present, C otherwise.
246              
247             =head3 tone_mark
248              
249             The tone mark if present, C otherwise.
250              
251             =head3 semivowel
252              
253             The semivowel following the core consonant if present, C otherwise.
254              
255             =head3 h
256              
257             "ຫ" if the syllable contained a combining ຫ, i.e. one that isn't the core consonant.
258              
259             =head3 vowel_length
260              
261             The string 'long' or 'short'.
262              
263             =head3 live
264              
265             Boolean indicating whether this is a "live" or a "dead" syllable. Dead
266             syllables end in a short vowel or stopped consonant (ກ, ດ or ບ), lives ones end
267             in a long vowel, diphthong, semivowel or nasal consonant. This is used for tone
268             determination but also available as an attribute, just in case it might be
269             useful. C indicates a live syllable.
270              
271             =head3 tone
272              
273             One of the following strings, depending on core consonant class, vowel length and tone mark:
274              
275             =over 4
276              
277             =item LOW_RISING
278              
279             =item LOW
280              
281             =item MID
282              
283             =item HIGH
284              
285             =item MID_FALLING
286              
287             =item HIGH_FALLING
288              
289             =back
290              
291             The latter two occur with short vowels, the other ones with long vowels.
292              
293             =cut
294              
295             1;
296