File Coverage

blib/lib/Lingua/LO/Transform/Data.pm
Criterion Covered Total %
statement 40 43 93.0
branch 2 2 100.0
condition n/a
subroutine 13 16 81.2
pod 3 7 42.8
total 58 68 85.2


line stmt bran cond sub pod time code
1             package Lingua::LO::Transform::Data;
2 5     5   46946 use strict;
  5         6  
  5         113  
3 5     5   14 use warnings;
  5         5  
  5         95  
4 5     5   68 use 5.012000;
  5         16  
5 5     5   14 use utf8;
  5         5  
  5         23  
6 5     5   85 use feature 'unicode_strings';
  5         651  
  5         375  
7 5     5   819 use version 0.77; our $VERSION = version->declare('v0.0.1');
  5         2559  
  5         24  
8 5     5   1313 use charnames qw/ :full lao /;
  5         42919  
  5         23  
9 5     5   21169 use parent 'Exporter';
  5         1150  
  5         22  
10              
11             =encoding UTF-8
12              
13             =head1 NAME
14              
15             Lingua::LO::Transform::Data - Helper module to keep common read-only data
16              
17             =head1 FUNCTION
18              
19             Provides a few functions that return regular expressions for matching and
20             extracting parts from Lao syllables. Instead of hardcoding these expressions as
21             strings, they are constructed from ragments at runtime, trading maintainability
22             for a small one-time initialization cost.
23              
24             Also holds common read-only data such as vowel classifications.
25              
26             You will probably not want to use this module on its own. If you do, see the
27             other L modules for examples.
28              
29             =cut
30              
31             our %EXPORT_TAGS = (
32             all => [ qw/
33             get_sylre_basic get_sylre_full get_sylre_named is_long_vowel
34             get_consonants get_vowels get_tone_marks
35             /
36             ]
37             );
38             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
39              
40             # Character classes
41             my $TONE_MARKS = "\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}" .
42             "\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}";
43             my $CONSONANTS = "\N{LAO LETTER KO}\N{LAO LETTER KHO SUNG}\N{LAO LETTER KHO TAM}" .
44             "\N{LAO LETTER NGO}\N{LAO LETTER CO}\N{LAO LETTER SO TAM}\N{LAO LETTER NYO}" .
45             "\N{LAO LETTER DO}\N{LAO LETTER TO}\N{LAO LETTER THO SUNG}\N{LAO LETTER THO TAM}" .
46             "\N{LAO LETTER NO}\N{LAO LETTER BO}\N{LAO LETTER PO}\N{LAO LETTER PHO SUNG}" .
47             "\N{LAO LETTER FO TAM}\N{LAO LETTER PHO TAM}\N{LAO LETTER FO SUNG}" .
48             "\N{LAO LETTER MO}\N{LAO LETTER YO}\N{LAO LETTER LO LING}\N{LAO LETTER LO LOOT}" .
49             "\N{LAO LETTER WO}\N{LAO LETTER SO SUNG}\N{LAO LETTER HO SUNG}\N{LAO LETTER O}" .
50             "\N{LAO LETTER HO TAM}";
51             my $VOWELS = "\N{LAO VOWEL SIGN A}\N{LAO VOWEL SIGN MAI KAN}\N{LAO VOWEL SIGN AA}" .
52             "\N{LAO VOWEL SIGN AM}\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}" .
53             "\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}\N{LAO VOWEL SIGN U}" .
54             "\N{LAO VOWEL SIGN UU}\N{LAO VOWEL SIGN MAI KON}\N{LAO SEMIVOWEL SIGN LO}" .
55             "\N{LAO SEMIVOWEL SIGN NYO}\N{LAO VOWEL SIGN E}\N{LAO VOWEL SIGN EI}" .
56             "\N{LAO VOWEL SIGN O}\N{LAO VOWEL SIGN AY}\N{LAO VOWEL SIGN AI}" .
57             "\N{LAO NIGGAHITA}";
58              
59             # Regular expression fragments. The cryptic names correspond to the naming
60             # in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
61             # Using what looks like interpolated variables in single-quoted strings is
62             # intentional; the interpolation is done manually later to be able to construct
63             # expressions with and without named captures.
64             my %regexp_fragments = (
65             x0_1 => 'ເ',
66             x0_2 => 'ແ',
67             x0_3 => 'ໂ',
68             x0_4 => 'ໄ',
69             x0_5 => 'ໃ',
70              
71             x1 => 'ຫ',
72              
73             x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
74              
75             x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
76              
77             x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
78              
79             x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
80             x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
81             x4_5 => "\N{LAO NIGGAHITA}",
82             x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
83             x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
84             x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
85              
86             x5 => "[$TONE_MARKS]",
87              
88             x6_1 => 'ວ',
89             x6_2 => 'ອ',
90             x6_3 => 'ຽ',
91             x6 => '[ວອຽ]',
92              
93             x7_1 => 'ະ',
94             x7_2 => 'າ',
95             x7_3 => "\N{LAO VOWEL SIGN AM}",
96              
97             x8_3t8 => '[ຍດນມຢບ]',
98             x8 => '[ກງຍດນມຢບວ]',
99              
100             x9 => '[ຈສຊພຟລ]',
101              
102             x10_12 => '[ຯໆ]',
103             x10_3 => "\N{LAO CANCELLATION MARK}",
104              
105             x9a10_3 => '(?: $x9 $x10_3)',
106             );
107             my $re1_all = '$x0_1 $x1? $x $x2?';
108             my $re1_1 = '$x5? $x8? $x9a10_3?';
109             my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
110             my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
111             my $re1_4 = '$x7_2? $x7_1';
112             my $re1_5 = '$x4_6 $x5? $x7_2';
113             my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
114             my $re1_8 = '$x4_7? $x5? $x6_3';
115              
116             my $re2_all = '$x0_2 $x1? $x $x2?';
117             my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
118             my $re2_2 = '$x7_1';
119             my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
120              
121             my $re3_all = '$x0_3 $x1? $x $x2?';
122             my $re3_1 = '$x5? $x8? $x9a10_3?';
123             my $re3_2 = '$x7_1';
124             my $re3_3 = '$x4_7 $x5? $x8_3t8?';
125              
126             my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
127              
128             my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
129              
130             my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
131              
132             my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
133              
134             my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
135              
136             my $re9 = '$x1? $x $x2? $x4_6 $x5? (?: $x8 $x9a10_3? | $x6_1 $x7_1 )';
137              
138             my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
139              
140             my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
141              
142             my $re12 = '$x1? $x $x2? $x5? $x7_1';
143              
144             my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
145              
146             my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
147              
148             my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
149              
150             my $rex1012 = '$x10_12';
151              
152             # This is the basic regexp that matches a syllable, still with variables to be
153             # substituted
154             my $re_basic = <
155             (?:
156             (?:
157             (?: $re1_all (?: $re1_1 | $re1_2 | $re1_3 | $re1_4 | $re1_5 | $re1_6 | $re1_8 ) ) |
158             (?: $re2_all (?: $re2_1 | $re2_2 | $re2_3 ) ) |
159             (?: $re3_all (?: $re3_1 | $re3_2 | $re3_3 ) ) |
160             $re4 | $re5 | $re6 | $re7 | $re8 | $re9 |
161             $re10 | $re11 | $re12 | $re13 | $re14
162             ) $rex1012? |
163             $re_num+
164             )
165             EOF
166             $re_basic =~ s/\n//gs;
167             $re_basic =~ s/\s+/ /g; # keep it a bit more readable. could use s/\s+//g
168              
169             # Functional names for all the x-something groups from the original paper
170             # Used for named catures.
171             my %capture_names = (
172             'x' => 'consonant',
173             'x0_\d' => 'vowel0',
174             'x1' => 'h',
175             'x2' => 'semivowel',
176             'x3' => 'vowel1',
177             'x4_[1-9t]{1,3}'=> 'vowel1',
178             'x5' => 'tone_mark',
179             'x6' => 'vowel2',
180             'x6_\d' => 'vowel2',
181             'x7_\d' => 'vowel3',
182             'x8' => 'end_consonant',
183             'x8_3t8' => 'end_consonant',
184             'x9' => 'foreign_consonant',
185             'x10_12' => 'extra',
186             'x10_3' => 'cancel',
187             );
188              
189             # Substitute longer fragment names first so their matches don't get swallowed
190             # by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
191             # so we have to do it first.
192             my @sorted_x_names = ('x9a10_3', reverse sort { length $a <=> length $b } keys %capture_names);
193              
194             my %VOWEL_LENGTH = (
195             ### Monophthongs
196             'Xະ' => 0, # /a/
197             'Xັ' => 0, # /a/ with end consonant
198             'Xາ' => 1, # /aː/
199              
200             'Xິ' => 0, # /i/
201             'Xີ' => 1, # /iː/
202              
203             'Xຶ' => 0, # /ɯ/
204             'Xື' => 1, # /ɯː/
205              
206             'Xຸ' => 0, # /u/
207             'Xູ' => 1, # /uː/
208              
209             'ເXະ' => 0, # /e/
210             'ເXັ' => 0, # /e/ with end consonant
211             'ເX' => 1, # /eː/
212              
213             'ແXະ' => 0, # /ɛ/
214             'ແXັ' => 0, # /ɛ/ with end consonant
215             'ແX' => 1, # /ɛː/
216              
217             'ໂXະ' => 0, # /o/
218             'Xົ' => 0, # /o/
219             'ໂX' => 1, # /oː/
220              
221             'ເXາະ' => 0, # /ɔ/
222             'Xັອ' => 0, # /ɔ/ with end consonant
223             'Xໍ' => 1, # /ɔː/
224             'Xອ' => 1, # /ɔː/ with end consonant
225              
226             'ເXິ' => 0, # /ɤ/
227             'ເXີ' => 1, # /ɤː/
228              
229             ###' Diphthongs
230             'ເXັຍ' => 0, # /iə/
231             'Xັຽ' => 0, # /iə/
232             'ເXຍ' => 1, # /iːə/
233             'Xຽ' => 1, # /iːə/
234              
235             'ເXຶອ' => 0, # /ɯə/
236             'ເXືອ' => 1, # /ɯːə/
237              
238             'Xົວະ' => 0, # /uə/
239             'Xັວ' => 0, # /uə/
240             'Xົວ' => 1, # /uːə/
241             'Xວ' => 1, # /uːə/ with end consonant
242              
243             'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
244             'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
245             'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
246             'Xັຍ' => 0, # /aj/
247              
248             'ເXົາ' => 0, # /aw/
249             'Xໍາ' => 0, # /am/
250             );
251             {
252             # Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
253             my %v;
254             foreach my $v (keys %VOWEL_LENGTH) {
255             (my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
256             $v{$w} = $VOWEL_LENGTH{$v};
257             }
258             %VOWEL_LENGTH = %v;
259             }
260              
261              
262             =head1 FUNCTIONS
263              
264             =head2 get_sylre_basic
265              
266             Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
267             alternations and will thus return the I possible match which is neither
268             guaranteed to be the longest nor the appropriate one in a longer sequence of
269             characters. It is useful as a building block and for verifying syllables
270             though.
271              
272             =cut
273              
274             sub get_sylre_basic {
275 11     11 1 26 my $syl_re = $re_basic;
276 11         20 for my $atom (@sorted_x_names) {
277 176         2129 $syl_re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
  1771         3758  
278             }
279              
280 11         2010 return qr/ $syl_re /x;
281             }
282              
283             =head2 get_sylre_full
284              
285             In addition to the matching done by L, this one makes sure
286             matches are either followed by another complete syllable, a blank, the end of
287             string/line or some non-Lao character. This ensures correct matching of
288             ambiguous syllable boundaries where the core consonant of a following syllable
289             could also be an end consonant of the current one.
290              
291             =cut
292              
293             sub get_sylre_full {
294 4     4 1 451 my $syl_short = get_sylre_basic();
295 4         1582 return qr/ $syl_short (?= \P{Lao} | \s | $ | $syl_short ) /x;
296             }
297              
298             =head2 get_sylre_named
299              
300             The expression returned is the same as for L but also includes
301             named captures that upon a successful match allow to get the syllable's parts
302             from C<%+>.
303              
304             =cut
305              
306             sub get_sylre_named {
307 3     3 1 6 my $syl_short = get_sylre_basic();
308 3         13 my $syl_capture = $re_basic;
309 3         5 for my $atom (@sorted_x_names) {
310 48         749 $syl_capture =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
  483         479  
311             }
312              
313 3         2757 return qr/ $syl_capture (?= \P{Lao} | \s | $ | $syl_short )/x;
314             }
315              
316 0     0 0 0 sub get_consonants { return $CONSONANTS; }
317 0     0 0 0 sub get_vowels { return $VOWELS; }
318 0     0 0 0 sub get_tone_marks { return $TONE_MARKS; }
319              
320 112     112 0 277 sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
321              
322             sub _named_capture {
323 483     483   461 my ($fragments, $atom, $match) = @_;
324              
325             return sprintf(
326             '(?<%s> %s)',
327             $capture_names{$atom}, $fragments->{$match}
328 483 100       1971 ) if defined $capture_names{$atom};
329              
330 48         106 return $fragments->{$match};
331             }
332              
333             1;