File Coverage

blib/lib/Lingua/LO/NLP/Analyze.pm
Criterion Covered Total %
statement 86 86 100.0
branch 21 22 95.4
condition 11 12 91.6
subroutine 17 17 100.0
pod 1 1 100.0
total 136 138 98.5


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Analyze;
2 6     6   107679 use strict;
  6         13  
  6         156  
3 6     6   29 use warnings;
  6         8  
  6         136  
4 6     6   89 use 5.012000;
  6         18  
5 6     6   27 use utf8;
  6         15  
  6         31  
6 6     6   149 use feature qw/ unicode_strings say /;
  6         11  
  6         458  
7 6     6   29 use charnames qw/ :full lao /;
  6         11  
  6         37  
8 6     6   3130 use version 0.77; our $VERSION = version->declare('v1.0.1');
  6         1659  
  6         39  
9 6     6   882 use Unicode::Normalize 'NFC';
  6         1503  
  6         379  
10 6     6   46 use Carp;
  6         13  
  6         349  
11 6     6   291 use Class::Accessor::Fast 'antlers';
  6         2090  
  6         57  
12 6     6   1032 use Lingua::LO::NLP::Data ':all';
  6         18  
  6         932  
13              
14 6     6   68 use constant SUNG => 0; # "high class"
  6         24  
  6         467  
15 6     6   37 use constant KANG => 1; # "middle class"
  6         13  
  6         278  
16 6     6   48 use constant TAM => 2; # "low class"
  6         17  
  6         709  
17              
18             =encoding utf8
19              
20             =head1 NAME
21              
22             Lingua::LO::NLP::Analyze - Analyze a Lao syllable and provide accessors to its constituents
23              
24             =head1 FUNCTION
25              
26             Objects of this class represent a Lao syllable with an analysis of its
27             constituents. After passing a valid syllable to the constructor, the parts are
28             available via accessor methods as outlined below.
29              
30             =cut
31              
32             for my $attribute (qw/ syllable parse vowel consonant end_consonant vowel_length tone tone_mark h semivowel live /) {
33             has $attribute => (is => 'ro');
34             }
35              
36             # This is a 2-level lookup table. The first level is the tone mark, the second
37             # is the consonant class (SUNG/KANG/TAM, see constant definitions)
38             my %TONE_MARKS = (
39             "\N{LAO TONE MAI EK}" => [ qw/ MID MID MID / ],
40             "\N{LAO TONE MAI THO}" => [ qw/ MID_FALLING HIGH_FALLING HIGH_FALLING / ],
41             # TODO: is this HIGH or HIGH_FALLING? Opinions seem to differ
42             # and I haven't found a definitive source yet
43             "\N{LAO TONE MAI TI}" => [ qw/ HIGH HIGH HIGH / ],
44             "\N{LAO TONE MAI CATAWA}" => [ qw/ RISING RISING RISING /],
45             );
46              
47             # This is a 2-level lookup table. The first level is the consonant class
48             # (SUNG/KANG/TAM, see constant definitions), the second is an index as
49             # calculated in classify(): 0 for live, 1 for dead+short, 2 for dead+long
50             my @TONE_NOMARK = (
51             [qw/ RISING HIGH MID_FALLING /], # SUNG/high
52             [qw/ LOW HIGH MID_FALLING /], # KANG/mid
53             [qw/ HIGH MID HIGH_FALLING /], # TAM/low
54             );
55              
56             my %CONSONANTS = (
57             'ກ' => KANG,
58             'ຂ' => SUNG,
59             'ຄ' => TAM,
60             'ງ' => TAM,
61             'ຈ' => KANG,
62             'ສ' => SUNG,
63             'ຊ' => TAM,
64             'ຍ' => TAM,
65             'ດ' => KANG,
66             'ຕ' => KANG,
67             'ຖ' => SUNG,
68             'ທ' => TAM,
69             'ນ' => TAM,
70             'ບ' => KANG,
71             'ປ' => KANG,
72             'ຜ' => SUNG,
73             'ຝ' => SUNG,
74             'ພ' => TAM,
75             'ຟ' => TAM,
76             'ມ' => TAM,
77             'ຢ' => KANG,
78             'ລ' => TAM,
79             'ວ' => TAM,
80             'ຫ' => SUNG,
81             'ອ' => KANG,
82             'ຮ' => TAM,
83             'ຣ' => TAM,
84             'ຫງ' => SUNG,
85             'ຫຍ' => SUNG,
86             'ຫນ' => SUNG,
87             'ໜ' => SUNG,
88             'ຫມ' => SUNG,
89             'ໝ' => SUNG,
90             'ຫລ' => SUNG,
91             'ຫຼ' => SUNG,
92             'ຫວ' => SUNG,
93             );
94              
95             my %CONS_H_MNL = ( 'ມ' => 'ໝ', 'ນ' => 'ໜ', 'ລ' => "\N{LAO SEMIVOWEL SIGN LO}" );
96             my %ENDCONS_STOP = ( 'ກ' => 1, 'ດ' => 1, 'ບ' => 1 );
97              
98             =head1 METHODS
99              
100             =head2 new
101              
102             C
103              
104             The constructor takes a syllable and any number of options as hash-style
105             arguments. The only option specified so far is C, a boolean value
106             indicating whether to run the syllable through
107             L and tone mark normalization
108             (see L). Set this if you are unsure
109             that your text is well-formed according to Unicode rules.
110              
111             =cut
112              
113             sub new {
114 292     292 1 185046 my $class = shift;
115 292         590 my $syllable = shift;
116 292         662 my %opts = @_;
117 292 100       841 if($opts{normalize}) {
118 87         819 $syllable = NFC($syllable);
119 87         347 normalize_tone_marks($syllable);
120             }
121 292         734 return bless _classify($syllable), $class;
122             }
123              
124             {
125             my $regexp = get_sylre_named();
126              
127             sub _classify {
128 292   66 292   1090 my $s = shift // croak("`syllable' argument missing or undefined");
129              
130 291 50       9682 $s =~ /^$regexp/ or croak("`$s' does not start with a valid syllable");
131              
132 291         8917 my %class = (
133             syllable => $s,
134 4     4   4243 parse => { %+ }
  4         1650  
  4         307  
135             );
136              
137             (my $consonant, my $end_consonant, @class{qw/ h semivowel tone_mark /}) =
138 291         4147 @+{qw/ consonant end_consonant h semivowel tone_mark /};
139              
140 291   100     2389 my @vowels = $+{vowel0} // ();
141 291         867 push @vowels, "\N{DOTTED CIRCLE}";
142 291         1252 push @vowels, grep { defined } @+{qw/ vowel1 vowel2 vowel3 /};
  873         3501  
143 291         1325 $class{vowel} = join('', @vowels);
144              
145 291         769 my $cc = $CONSONANTS{ $consonant }; # consonant category
146 291 100       763 if( $class{h} ) {
147 10         18 $cc = SUNG; # $CONSONANTS{'ຫ'}
148              
149             # If consonant is one of ມ, ນ or ລ *and* no vowel precedes the ຫ,
150             # pretend we saw the combined form
151 10 100 100     57 if(exists $CONS_H_MNL{ $consonant } and not $+{vowel0}) {
152 1         5 $class{consonant} = $CONS_H_MNL{ $consonant };
153 1         4 delete $class{h};
154             } else {
155             # If there is a preceding vowel, it uses the ຫ as a consonant and the
156             # one parsed as core consonant is actually an end consonant
157 9 100 100     51 unless($consonant eq 'ວ' or $consonant eq 'ຍ') {
158 6         15 $end_consonant = $consonant;
159 6         45 $consonant = 'ຫ';
160 6         18 delete $class{h};
161             }
162             }
163             }
164              
165             # Set both $class{vowel_length} and a quick flag that we'll need later
166 291         606 my $long_vowel = 1;
167 291 100       1113 if(is_long_vowel( $class{vowel} )) {
168 179         511 $class{vowel_length} = 'long';
169             } else {
170 112         327 $class{vowel_length} = 'short';
171 112         197 $long_vowel = 0;
172             }
173              
174             # Determine syllable liveness.
175 291         539 my $live;
176 291 100       729 if( defined $end_consonant ) {
177             # If we have an end consonant, a syllable is considered live if the
178             # former is not a stopped consonant
179 118 100       461 $live = exists $ENDCONS_STOP{ $end_consonant } ? 0 : 1;
180             } else {
181             # Syllables without an end consonant are live iff the vowel is long
182 173         279 $live = $long_vowel;
183             }
184 291         933 $class{live} = $live;
185              
186 291 100       679 if(defined $class{tone_mark}) {
187             # If a tone mark exists, it and the consonant's class
188             # determine the tone
189 66         312 $class{tone} = $TONE_MARKS{ $class{tone_mark} }[$cc];
190             } else {
191             # No tone mark, so calculate the index
192 225 100       843 $class{tone} = $TONE_NOMARK[$cc][ $live ? 0 : $long_vowel + 1 ];
193             }
194 291         700 $class{consonant} = $consonant;
195 291 100       795 $class{end_consonant} = $end_consonant if defined $end_consonant;
196             #say Dumper(\%class);
197 291         2456 return \%class;
198             }
199             }
200              
201             =head2 ACCESSORS
202              
203              
204             =head3 syllable
205              
206             The original syllable as used by the parser. This may be subtly different from
207             the one passed to the constructor:
208              
209             =over 4
210              
211             =item
212              
213             If the C option was set, tone marks and vowels may have been reordered
214              
215             =item
216              
217             If the decomposed form of LAO VOWEL SIGN AM (◌າ) is used, it will have been
218             converted to the composed form
219              
220             =item
221              
222             Combinations of ຫ with ລ, ມ or ນ will have been converted to the combined characters.
223              
224             =back
225              
226             =head3 parse
227              
228             A hash of raw constituents as returned by the parsing regexp. Although the
229             other accessors present constituents in a more accessible way and take care of
230             morphological special cases like the treatment of ຫ, this may come in handy to
231             quickly check e.g. if there was a vowel component before the core consonant.
232              
233             =head3 vowel
234              
235             The syllable's vowel or diphthong. As the majority of vowels have more than one
236             code point, the consonant position is represented by the Unicode character
237             designated for this function, DOTTED CIRCLE or U+25CC.
238              
239             =head3 consonant
240              
241             The syllable's core consonant.
242              
243             =head3 end_consonant
244              
245             The end consonant if present, C otherwise.
246              
247             =head3 tone_mark
248              
249             The tone mark if present, C otherwise.
250              
251             =head3 semivowel
252              
253             The semivowel following the core consonant if present, C otherwise.
254              
255             =head3 h
256              
257             "ຫ" if the syllable contained a combining ຫ, i.e. one that isn't the core consonant.
258              
259             =head3 vowel_length
260              
261             The string 'long' or 'short'.
262              
263             =head3 live
264              
265             Boolean indicating whether this is a "live" or a "dead" syllable. Dead
266             syllables end in a short vowel or stopped consonant (ກ, ດ or ບ), lives ones end
267             in a long vowel, diphthong, semivowel or nasal consonant. This is used for tone
268             determination but also available as an attribute, just in case it might be
269             useful. C indicates a live syllable.
270              
271             =head3 tone
272              
273             One of the following strings, depending on core consonant class, vowel length and tone mark:
274              
275             =over 4
276              
277             =item LOW_RISING
278              
279             =item LOW
280              
281             =item MID
282              
283             =item HIGH
284              
285             =item MID_FALLING
286              
287             =item HIGH_FALLING
288              
289             =back
290              
291             The latter two occur with short vowels, the other ones with long vowels.
292              
293             =cut
294              
295             1;
296