File Coverage

blib/lib/Lingua/LO/Transform/Data.pm
Criterion Covered Total %
statement 40 40 100.0
branch 2 2 100.0
condition n/a
subroutine 13 13 100.0
pod 3 4 75.0
total 58 59 98.3


line stmt bran cond sub pod time code
1             package Lingua::LO::Transform::Data;
2 5     5   47804 use strict;
  5         7  
  5         114  
3 5     5   15 use warnings;
  5         5  
  5         96  
4 5     5   66 use 5.012000;
  5         14  
5 5     5   14 use utf8;
  5         5  
  5         22  
6 5     5   650 use feature 'unicode_strings';
  5         5  
  5         362  
7 5     5   802 use version 0.77; our $VERSION = version->declare('v0.0.1');
  5         2698  
  5         26  
8 5     5   1324 use charnames qw/ :full lao /;
  5         44038  
  5         66  
9 5     5   22000 use parent 'Exporter';
  5         1279  
  5         26  
10              
11             =encoding UTF-8
12              
13             =head1 NAME
14              
15             Lingua::LO::Transform::Data - Helper module to keep common read-only data
16              
17             =head1 FUNCTION
18              
19             Provides a few functions that return regular expressions for matching and
20             extracting parts from Lao syllables. Instead of hardcoding these expressions as
21             strings, they are constructed from ragments at runtime, trading maintainability
22             for a small one-time initialization cost.
23              
24             Also holds common read-only data such as vowel classifications.
25              
26             You will probably not want to use this module on its own. If you do, see the
27             other L modules for examples.
28              
29             =cut
30              
31             our @EXPORT_OK = qw/ get_sylre_basic get_sylre_full get_sylre_named is_long_vowel /;
32              
33             # Regular expression fragments. The cryptic names correspond to the naming
34             # in PHISSAMAY et al: Syllabification of Lao Script for Line Breaking
35             # Using what looks like interpolated variables in single-quoted strings is
36             # intentional; the interpolation is done manually later to be able to construct
37             # expressions with and without named captures.
38             my %regexp_fragments = (
39             x0_1 => 'ເ',
40             x0_2 => 'ແ',
41             x0_3 => 'ໂ',
42             x0_4 => 'ໄ',
43             x0_5 => 'ໃ',
44              
45             x1 => 'ຫ',
46              
47             x => '[ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮໜໝ]',
48              
49             x2 => "[\N{LAO SEMIVOWEL SIGN LO}ຣວລ]",
50              
51             x3 => "[\N{LAO VOWEL SIGN U}\N{LAO VOWEL SIGN UU}]",
52              
53             x4_12 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}]",
54             x4_34 => "[\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
55             x4_5 => "\N{LAO NIGGAHITA}",
56             x4_6 => "\N{LAO VOWEL SIGN MAI KON}",
57             x4_7 => "\N{LAO VOWEL SIGN MAI KAN}",
58             x4_1t4 => "[\N{LAO VOWEL SIGN I}\N{LAO VOWEL SIGN II}\N{LAO VOWEL SIGN Y}\N{LAO VOWEL SIGN YY}]",
59              
60             x5 => "[\N{LAO TONE MAI EK}\N{LAO TONE MAI THO}\N{LAO TONE MAI TI}\N{LAO TONE MAI CATAWA}]",
61              
62             x6_1 => 'ວ',
63             x6_2 => 'ອ',
64             x6_3 => 'ຽ',
65             x6 => '[ວອຽ]',
66              
67             x7_1 => 'ະ',
68             x7_2 => 'າ',
69             x7_3 => "\N{LAO VOWEL SIGN AM}",
70              
71             x8_3t8 => '[ຍດນມຢບ]',
72             x8 => '[ກງຍດນມຢບວ]',
73              
74             x9 => '[ຈສຊພຟລ]',
75              
76             x10_12 => '[ຯໆ]',
77             x10_3 => "\N{LAO CANCELLATION MARK}",
78              
79             x9a10_3 => '(?: $x9 $x10_3)',
80             );
81             my $re1_all = '$x0_1 $x1? $x $x2?';
82             my $re1_1 = '$x5? $x8? $x9a10_3?';
83             my $re1_2 = '$x4_12 $x5? $x8? $x9a10_3?';
84             my $re1_3 = '$x4_34 $x5? $x6_2 $x8? $x9a10_3?';
85             my $re1_4 = '$x7_2? $x7_1';
86             my $re1_5 = '$x4_6 $x5? $x7_2';
87             my $re1_6 = '$x4_7 $x5? $x8 $x9a10_3?';
88             my $re1_8 = '$x4_7? $x5? $x6_3';
89              
90             my $re2_all = '$x0_2 $x1? $x $x2?';
91             my $re2_1 = '$x5? $x6? $x8? $x9a10_3?';
92             my $re2_2 = '$x7_1';
93             my $re2_3 = '$x4_7 $x5? $x8 $x9a10_3?';
94              
95             my $re3_all = '$x0_3 $x1? $x $x2?';
96             my $re3_1 = '$x5? $x8? $x9a10_3?';
97             my $re3_2 = '$x7_1';
98             my $re3_3 = '$x4_7 $x5? $x8_3t8?';
99              
100             my $re4 = '$x0_4 $x1? $x $x2? $x5? $x6_1? $x9a10_3?';
101              
102             my $re5 = '$x0_5 $x1? $x $x2? $x5? $x6_1?';
103              
104             my $re6 = '$x1? $x $x2? $x3 $x5? $x8? $x9a10_3?';
105              
106             my $re7 = '$x1? $x $x2? $x4_1t4 $x5? $x8? $x9a10_3?';
107              
108             my $re8 = '$x1? $x $x2? $x4_5 $x5? $x7_2? $x9a10_3?';
109              
110             my $re9 = '$x1? $x $x2? $x4_6 $x5? (?: $x8 $x9a10_3? | $x6_1 $x7_1 )';
111              
112             my $re10 = '$x1? $x $x2? $x4_7 $x5? $x6_1? $x8 $x9a10_3?';
113              
114             my $re11 = '$x1? $x $x2? $x5? $x6 $x8 $x9a10_3?';
115              
116             my $re12 = '$x1? $x $x2? $x5? $x7_1';
117              
118             my $re13 = '$x1? $x $x2? $x5? $x7_2 $x8? $x9a10_3?';
119              
120             my $re14 = '$x1? $x $x2? $x5? $x7_3 $x9a10_3?';
121              
122             my $re_num = '[໑໒໓໔໕໖໗໘໙໐]';
123              
124             my $rex1012 = '$x10_12';
125              
126             # This is the basic regexp that matches a syllable, still with variables to be
127             # substituted
128             my $re_basic = <
129             (?:
130             (?:
131             (?: $re1_all (?: $re1_1 | $re1_2 | $re1_3 | $re1_4 | $re1_5 | $re1_6 | $re1_8 ) ) |
132             (?: $re2_all (?: $re2_1 | $re2_2 | $re2_3 ) ) |
133             (?: $re3_all (?: $re3_1 | $re3_2 | $re3_3 ) ) |
134             $re4 | $re5 | $re6 | $re7 | $re8 | $re9 |
135             $re10 | $re11 | $re12 | $re13 | $re14
136             ) $rex1012? |
137             $re_num+
138             )
139             EOF
140             $re_basic =~ s/\n//gs;
141             $re_basic =~ s/\s+/ /g; # keep it a bit more readable. could use s/\s+//g
142              
143             # Functional names for all the x-something groups from the original paper
144             # Used for named catures.
145             my %capture_names = (
146             'x' => 'consonant',
147             'x0_\d' => 'vowel0',
148             'x1' => 'h',
149             'x2' => 'semivowel',
150             'x3' => 'vowel1',
151             'x4_[1-9t]{1,3}'=> 'vowel1',
152             'x5' => 'tone_mark',
153             'x6' => 'vowel2',
154             'x6_\d' => 'vowel2',
155             'x7_\d' => 'vowel3',
156             'x8' => 'end_consonant',
157             'x8_3t8' => 'end_consonant',
158             'x9' => 'foreign_consonant',
159             'x10_12' => 'extra',
160             'x10_3' => 'cancel',
161             );
162              
163             # Substitute longer fragment names first so their matches don't get swallowed
164             # by the shorter ones. x9a10_3 is a convenience shotcut for '(?: $x9 $x10_3)'
165             # so we have to do it first.
166             my @sorted_x_names = ('x9a10_3', reverse sort { length $a <=> length $b } keys %capture_names);
167              
168             my %VOWEL_LENGTH = (
169             ### Monophthongs
170             'Xະ' => 0, # /a/
171             'Xັ' => 0, # /a/ with end consonant
172             'Xາ' => 1, # /aː/
173              
174             'Xິ' => 0, # /i/
175             'Xີ' => 1, # /iː/
176              
177             'Xຶ' => 0, # /ɯ/
178             'Xື' => 1, # /ɯː/
179              
180             'Xຸ' => 0, # /u/
181             'Xູ' => 1, # /uː/
182              
183             'ເXະ' => 0, # /e/
184             'ເXັ' => 0, # /e/ with end consonant
185             'ເX' => 1, # /eː/
186              
187             'ແXະ' => 0, # /ɛ/
188             'ແXັ' => 0, # /ɛ/ with end consonant
189             'ແX' => 1, # /ɛː/
190              
191             'ໂXະ' => 0, # /o/
192             'Xົ' => 0, # /o/
193             'ໂX' => 1, # /oː/
194              
195             'ເXາະ' => 0, # /ɔ/
196             'Xັອ' => 0, # /ɔ/ with end consonant
197             'Xໍ' => 1, # /ɔː/
198             'Xອ' => 1, # /ɔː/ with end consonant
199              
200             'ເXິ' => 0, # /ɤ/
201             'ເXີ' => 1, # /ɤː/
202              
203             ###' Diphthongs
204             'ເXັຍ' => 0, # /iə/
205             'Xັຽ' => 0, # /iə/
206             'ເXຍ' => 1, # /iːə/
207             'Xຽ' => 1, # /iːə/
208              
209             'ເXຶອ' => 0, # /ɯə/
210             'ເXືອ' => 1, # /ɯːə/
211              
212             'Xົວະ' => 0, # /uə/
213             'Xັວ' => 0, # /uə/
214             'Xົວ' => 1, # /uːə/
215             'Xວ' => 1, # /uːə/ with end consonant
216              
217             'ໄX' => 1, # /aj/ - Actually short but counts as long for rules
218             'ໃX' => 1, # /aj/ - Actually short but counts as long for rules
219             'Xາຍ' => 1, # /aj/ - Actually short but counts as long for rules
220             'Xັຍ' => 0, # /aj/
221              
222             'ເXົາ' => 0, # /aw/
223             'Xໍາ' => 0, # /am/
224             );
225             {
226             # Replace "X" in %VOWELS keys with DOTTED CIRCLE. Makes code easier to edit.
227             my %v;
228             foreach my $v (keys %VOWEL_LENGTH) {
229             (my $w = $v) =~ s/X/\N{DOTTED CIRCLE}/;
230             $v{$w} = $VOWEL_LENGTH{$v};
231             }
232             %VOWEL_LENGTH = %v;
233             }
234              
235              
236             =head1 FUNCTIONS
237              
238             =head2 get_sylre_basic
239              
240             Returns a basic regexp that can match a Lao syllable. It consists of a bunch of
241             alternations and will thus return the I possible match which is neither
242             guaranteed to be the longest nor the appropriate one in a longer sequence of
243             characters. It is useful as a building block and for verifying syllables
244             though.
245              
246             =cut
247              
248             sub get_sylre_basic {
249 11     11 1 27 my $syl_re = $re_basic;
250 11         17 for my $atom (@sorted_x_names) {
251 176         2306 $syl_re =~ s/\$($atom)/$regexp_fragments{$1}/eg;
  1771         4047  
252             }
253              
254 11         2278 return qr/ $syl_re /x;
255             }
256              
257             =head2 get_sylre_full
258              
259             In addition to the matching done by L, this one makes sure
260             matches are either followed by another complete syllable, a blank, the end of
261             string/line or some non-Lao character. This ensures correct matching of
262             ambiguous syllable boundaries where the core consonant of a following syllable
263             could also be an end consonant of the current one.
264              
265             =cut
266              
267             sub get_sylre_full {
268 4     4 1 398 my $syl_short = get_sylre_basic();
269 4         1828 return qr/ $syl_short (?= \P{Lao} | \s | $ | $syl_short ) /x;
270             }
271              
272             =head2 get_sylre_named
273              
274             The expression returned is the same as for L but also includes
275             named captures that upon a successful match allow to get the syllable's parts
276             from C<%+>.
277              
278             =cut
279              
280             sub get_sylre_named {
281 3     3 1 6 my $syl_short = get_sylre_basic();
282 3         12 my $syl_capture = $re_basic;
283 3         6 for my $atom (@sorted_x_names) {
284 48         753 $syl_capture =~ s/\$($atom)/_named_capture(\%regexp_fragments, $atom, $1)/eg;
  483         488  
285             }
286              
287 3         1625 return qr/ $syl_capture (?= \P{Lao} | \s | $ | $syl_short )/x;
288             }
289              
290 112     112 0 267 sub is_long_vowel { return $VOWEL_LENGTH{+shift} }
291              
292             sub _named_capture {
293 483     483   464 my ($fragments, $atom, $match) = @_;
294              
295             return sprintf(
296             '(?<%s> %s)',
297             $capture_names{$atom}, $fragments->{$match}
298 483 100       2002 ) if defined $capture_names{$atom};
299              
300 48         107 return $fragments->{$match};
301             }
302              
303             1;