File Coverage

blib/lib/Lingua/LO/NLP/Data.pm
Criterion Covered Total %
statement 47 47 100.0
branch 2 2 100.0
condition n/a
subroutine 17 17 100.0
pod 5 5 100.0
total 71 71 100.0


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Data;
2 8     8   170908 use strict;
  8         102  
  8         262  
3 8     8   53 use warnings;
  8         18  
  8         236  
4 8     8   158 use 5.012000;
  8         38  
5 8     8   66 use utf8;
  8         21  
  8         48  
6 8     8   260 use feature 'unicode_strings';
  8         18  
  8         852  
7 8     8   755 use version 0.77; our $VERSION = version->declare('v1.0.1');
  8         4313  
  8         58  
8 8     8   1875 use charnames qw/ :full lao /;
  8         64829  
  8         55  
9 8     8   29474 use parent 'Exporter';
  8         2272  
  8         46  
10              
11             =encoding utf8
12              
13             =head1 NAME
14              
15             Lingua::LO::NLP::Data - Helper module to keep common read-only data
16              
17             =head1 FUNCTION
18              
19             Provides a few functions that return regular expressions for matching and
20             extracting parts from Lao syllables. Instead of hardcoding these expressions as
21             strings, they are constructed from fragments at runtime, trading maintainability
22             for a small one-time initialization cost.
23              
24             Also holds common read-only data such as vowel classifications.
25              
26             You will probably not want to use this module on its own. If you do, see the
27             other L modules for examples.
28              
29             =cut
30              
31             our %EXPORT_TAGS = (
32             all => [ qw/
33             get_sylre_basic get_sylre_full get_sylre_named is_long_vowel
34             normalize_tone_marks
35             /
36             ]
37             );
38             our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
39              
40             # Character classes
41             my $TONE_MARKS = "\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}" .
42             "\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}";
43              
44             my $CONSONANTS = "\N{LAO LETTER KO}\N{LAO LETTER KHO SUNG}\N{LAO LETTER KHO TAM}" .
45             "\N{LAO LETTER NGO}\N{LAO LETTER CO}\N{LAO LETTER SO TAM}\N{LAO LETTER NYO}" .
46             "\N{LAO LETTER DO}\N{LAO LETTER TO}\N{LAO LETTER THO SUNG}\N{LAO LETTER THO TAM}" .
47             "\N{LAO LETTER NO}\N{LAO LETTER BO}\N{LAO LETTER PO}\N{LAO LETTER PHO SUNG}" .
48             "\N{LAO LETTER FO TAM}\N{LAO LETTER PHO TAM}\N{LAO LETTER FO SUNG}" .
49             "\N{LAO LETTER MO}\N{LAO LETTER YO}\N{LAO LETTER LO LING}\N{LAO LETTER LO LOOT}" .
50             "\N{LAO LETTER WO}\N{LAO LETTER SO SUNG}\N{LAO LETTER HO SUNG}\N{LAO LETTER O}" .
51             "\N{LAO LETTER HO TAM}";
52              
53             my $VOWELS_COMBINING = "\N{LAO VOWEL SIGN MAI KAN}" .
54             "\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
55             "\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
56             "\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO NIGGAHITA}";
57              
58             my $VOWELS = "\N{LAO VOWEL SIGN A}\N{LAO VOWEL SIGN MAI KAN}\N{LAO VOWEL SIGN AA}" .
59             "\N{LAO VOWEL SIGN AM}\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
60             "\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
61             "\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO SEMIVOWEL SIGN LO}" .
62             "\N{LAO SEMIVOWEL SIGN NYO}\N{LAO VOWEL SIGN E}\N{LAO VOWEL SIGN EI}" .
63             "\N{LAO VOWEL SIGN O}\N{LAO VOWEL SIGN AY}\N{LAO VOWEL SIGN AI}" .
64             "\N{LAO NIGGAHITA}";
65              
66             # Regular expression fragments. The cryptic names correspond to the naming
67             # in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
68             # Using what looks like interpolated variables in single-quoted strings is
69             # intentional; the interpolation is done manually later to be able to construct
70             # expressions with and without named captures.
71             my %regexp_fragments = (
72             x0_1 => 'ເ',
73             x0_2 => 'ແ',
74             x0_3 => 'ໂ',
75             x0_4 => 'ໄ',
76             x0_5 => 'ໃ',
77              
78             x1 => 'ຫ',
79              
80             x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
81              
82             x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
83              
84             x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
85              
86             x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
87             x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
88             x4_5 => "\N{LAO NIGGAHITA}",
89             x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
90             x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
91             x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
92              
93             x5 => "[$TONE_MARKS]",
94              
95             x6_1 => 'ວ',
96             x6_2 => 'ອ',
97             x6_3 => 'ຽ',
98             x6 => '[ວອຽ]',
99              
100             x7_1 => 'ະ',
101             x7_2 => 'າ',
102             x7_3 => "\N{LAO VOWEL SIGN AM}",
103              
104             x8_3t8 => '[ຍດນມຢບ]',
105             x8 => '[ກງຍດນມຢບວ]',
106              
107             x9 => '[ຈສຊພຟລ]',
108              
109             x10_12 => '[ຯໆ]',
110             x10_3 => "\N{LAO CANCELLATION MARK}",
111              
112             x9a10_3 => '(?: $x9 $x10_3)',
113             );
114             my $re1_all = '$x0_1 $x1? $x $x2?';
115             ## See naming explanation above
116             ## no critic(Bangs::ProhibitNumberedNames)
117             my $re1_1 = '$x5? $x8? $x9a10_3?';
118             my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
119             my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
120             my $re1_4 = '$x7_2? $x7_1';
121             my $re1_5 = '$x4_6 $x5? $x7_2';
122             my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
123             my $re1_8 = '$x4_7? $x5? $x6_3';
124              
125             my $re2_all = '$x0_2 $x1? $x $x2?';
126             my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
127             my $re2_2 = '$x7_1';
128             my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
129              
130             my $re3_all = '$x0_3 $x1? $x $x2?';
131             my $re3_1 = '$x5? $x8? $x9a10_3?';
132             my $re3_2 = '$x7_1';
133             my $re3_3 = '$x4_7 $x5? $x8_3t8?';
134              
135             my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
136              
137             my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
138              
139             my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
140              
141             my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
142              
143             my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
144              
145             my $re9 = '$x1? $x $x2? $x4_6 $x5? ( $x8 $x9a10_3? | $x6_1 $x7_1 )';
146              
147             my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
148              
149             my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
150              
151             my $re12 = '$x1? $x $x2? $x5? $x7_1';
152              
153             my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
154              
155             my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
156              
157             my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
158              
159             my $rex1012 = '$x10_12';
160              
161             # This is the basic regexp that matches a syllable, still with variables to be
162             # substituted
163             my $re_basic = <<"EOF";
164             (
165             (
166             ( $re1_all ( $re1_1 | $re1_2 | $re1_3 | $re1_4 | $re1_5 | $re1_6 | $re1_8 ) ) |
167             ( $re2_all ( $re2_1 | $re2_2 | $re2_3 ) ) |
168             ( $re3_all ( $re3_1 | $re3_2 | $re3_3 ) ) |
169             $re4 | $re5 | $re6 | $re7 | $re8 | $re9 |
170             $re10 | $re11 | $re12 | $re13 | $re14
171             ) $rex1012? |
172             $re_num+
173             )
174             EOF
175              
176             # A simplified lookahead expression that matches only the syllables not starting
177             # in x0/x1. For the latter, matching their first character is sufficient.
178             my $re_lookahead = <<"EOF";
179             (
180             $re6 | $re7 | $re8 | $re9 | $re10 | $re11 | $re12 | $re13 | $re14 | $re_num+
181             )
182             EOF
183              
184             # Fix up regexen
185             for( $re_basic, $re_lookahead ) {
186             s/\n//gs; # Remove newlines
187             s/\(/(?:/gs; # Make all groups non-capturing
188             s/\s+//g;
189             }
190              
191             # Functional names for all the x-something groups from the original paper
192             # Used for named captures.
193             my %CAPTURE_NAMES = (
194             'x' => 'consonant',
195             'x0_\d' => 'vowel0',
196             'x1' => 'h',
197             'x2' => 'semivowel',
198             'x3' => 'vowel1',
199             'x4_[1-9t]{1,3}'=> 'vowel1',
200             'x5' => 'tone_mark',
201             'x6' => 'vowel2',
202             'x6_\d' => 'vowel2',
203             'x7_2' => 'vowel2',
204             'x7_[13]' => 'vowel3',
205             'x8' => 'end_consonant',
206             'x8_3t8' => 'end_consonant',
207             'x9' => 'foreign_consonant',
208             'x10_12' => 'extra',
209             'x10_3' => 'cancel',
210             );
211              
212             # Substitute longer fragment names first so their matches don't get swallowed
213             # by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
214             # so we have to do it first.
215             my @SORTED_X_NAMES = ('x9a10_3', reverse sort { length $a <=> length $b } keys %CAPTURE_NAMES);
216              
217             our %VOWEL_LENGTH = (
218             ### Monophthongs
219             'Xະ' => 0, # /a/
220             'Xັ' => 0, # /a/ with end consonant
221             'Xາ' => 1, # /aː/
222              
223             'Xິ' => 0, # /i/
224             'Xີ' => 1, # /iː/
225              
226             'Xຶ' => 0, # /ɯ/
227             'Xື' => 1, # /ɯː/
228              
229             'Xຸ' => 0, # /u/
230             'Xູ' => 1, # /uː/
231              
232             'ເXະ' => 0, # /e/
233             'ເXັ' => 0, # /e/ with end consonant
234             'ເX' => 1, # /eː/
235              
236             'ແXະ' => 0, # /ɛ/
237             'ແXັ' => 0, # /ɛ/ with end consonant
238             'ແX' => 1, # /ɛː/
239              
240             'ໂXະ' => 0, # /o/
241             'Xົ' => 0, # /o/
242             'ໂX' => 1, # /oː/
243              
244             'ເXາະ' => 0, # /ɔ/
245             'Xັອ' => 0, # /ɔ/ with end consonant
246             'Xໍ' => 1, # /ɔː/
247             'Xອ' => 1, # /ɔː/ with end consonant
248              
249             'ເXິ' => 0, # /ɤ/
250             'ເXີ' => 1, # /ɤː/
251              
252             ###' Diphthongs
253             'ເXັຍ' => 0, # /iə/
254             'Xັຽ' => 0, # /iə/
255             'ເXຍ' => 1, # /iːə/
256             'Xຽ' => 1, # /iːə/
257              
258             'ເXຶອ' => 0, # /ɯə/
259             'ເXືອ' => 1, # /ɯːə/
260              
261             'Xົວະ' => 0, # /uə/
262             'Xັວ' => 0, # /uə/
263             'Xົວ' => 1, # /uːə/
264             'Xວ' => 1, # /uːə/ with end consonant
265              
266             'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
267             'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
268             'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
269             'Xັຍ' => 0, # /aj/
270              
271             'ເXົາ' => 0, # /aw/
272             'Xໍາ' => 0, # /am/
273             );
274             {
275             # Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
276             my %v;
277             foreach my $v (keys %VOWEL_LENGTH) {
278             (my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
279             $v{$w} = $VOWEL_LENGTH{$v};
280             }
281             %VOWEL_LENGTH = %v;
282             }
283              
284              
285             =head1 FUNCTIONS
286              
287             =head2 get_sylre_basic
288              
289             Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
290             alternations and will thus return the I possible match which is neither
291             guaranteed to be the longest nor the appropriate one in a longer sequence of
292             characters. It is useful as a building block and for verifying syllables
293             though.
294              
295             =cut
296              
297             sub get_sylre_basic {
298 7     7 1 98 my $syl_re = _subst_regexp_fragments($re_basic);
299 7         3201 return qr/ $syl_re /x;
300             }
301              
302             =head2 get_sylre_full
303              
304             In addition to the matching done by L, this one makes sure
305             matches are either followed by another complete syllable (or what can only be
306             the start of one), a space, the end of string/line or some non-Lao character.
307             This ensures correct matching of ambiguous syllable boundaries where the core
308             consonant of a following syllable could also be an end consonant of the current
309             one.
310              
311             =cut
312              
313             sub get_sylre_full {
314 6     6 1 26 return _format_syllable_regexp(
315             _subst_regexp_fragments($re_basic),
316             );
317             }
318              
319             =head2 get_sylre_named
320              
321             The expression returned is the same as for L but also includes
322             named captures that upon a successful match allow to get the syllable's parts
323             from C<%+>.
324              
325             =cut
326              
327             sub get_sylre_named {
328 6     6 1 27 return _format_syllable_regexp(
329             _subst_regexp_fragments_named($re_basic),
330             );
331             }
332              
333             =head2 is_long_vowel
334              
335             C
336              
337             Returns a boolean indicating whether the vowel passed in is long. Consonant
338             placeholders must be included in the form of DOTTED CIRCLE (U+25CC). Note that
339             for speed there is no check if the vowel actually exists in the data, so
340             passing many bogus values may lead to uncontrolled growth of the
341             C<%VOWEL_LENGTH> hash due to autovivification!
342              
343             =cut
344              
345 289     289 1 896 sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
346              
347             =head2 normalize_tone_marks
348              
349             C
350              
351             Normalize tone mark order in C<$text>. Usually when using a combining vowel
352             such as ◌ິ, ◌ຸ or ◌ໍ with a tone mark, they have to be typed in the order
353             I as renderers are supposed to stack above-consonant
354             signs in the order they appear in the text, and tone marks are supposed to go
355             on top. As some renderers will put them on top no matter what, these sequences
356             are sometimes incorrectly entered as I and would thus
357             not be parsed correctly.
358              
359             This function is just meant for internal use and modifies its argument in place
360             for speed!
361              
362             =cut
363              
364             ## Modify-in-place for speed reasons
365             ## no critic(Subroutines::RequireFinalReturn)
366             sub normalize_tone_marks {
367 87     87 1 146 my $t = $_[0];
368 87         489 $_[0] =~ s/([$CONSONANTS])([$TONE_MARKS])([$VOWELS_COMBINING])/$1$3$2/og;
369             }
370              
371             sub _named_capture {
372 966     966   2429 my ($fragments, $atom, $match) = @_;
373              
374             return sprintf(
375             '(?<%s> %s)',
376             $CAPTURE_NAMES{$atom}, $fragments->{$match}
377 966 100       6478 ) if defined $CAPTURE_NAMES{$atom};
378              
379 96         488 return $fragments->{$match};
380             }
381              
382             sub _subst_regexp_fragments_named {
383 6     6   34 my $re = shift;
384              
385 6         22 for my $atom (@SORTED_X_NAMES) {
386 102         2315 $re =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
  966         2110  
387             }
388              
389 6         73 return $re;
390             }
391              
392             sub _subst_regexp_fragments {
393 25     25   65 my $re = shift;
394              
395 25         64 for my $atom (@SORTED_X_NAMES) {
396 425         6813 $re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
  3041         13122  
397             }
398              
399 25         120 return $re;
400             }
401              
402             sub _format_syllable_regexp {
403 12     12   36 my $re = shift;
404 12         43 my $lookahead = _subst_regexp_fragments($re_lookahead);
405              
406             my $expr = sprintf(
407             '%s (?= [%s] | \s | \P{Lao} | $ | %s )',
408             $re,
409 12         238 join('', @regexp_fragments{qw/ x0_1 x0_2 x0_3 x0_4 x0_5 x1 /} ),
410             $lookahead
411             );
412              
413 12         5902 return qr/ $expr /x;
414             }
415              
416              
417             1;