| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::LO::NLP::Romanize; |
|
2
|
5
|
|
|
5
|
|
259072
|
use strict; |
|
|
5
|
|
|
|
|
18
|
|
|
|
5
|
|
|
|
|
150
|
|
|
3
|
5
|
|
|
5
|
|
23
|
use warnings; |
|
|
5
|
|
|
|
|
9
|
|
|
|
5
|
|
|
|
|
119
|
|
|
4
|
5
|
|
|
5
|
|
93
|
use 5.012000; |
|
|
5
|
|
|
|
|
15
|
|
|
5
|
5
|
|
|
5
|
|
23
|
use utf8; |
|
|
5
|
|
|
|
|
5
|
|
|
|
5
|
|
|
|
|
30
|
|
|
6
|
5
|
|
|
5
|
|
750
|
use version 0.77; our $VERSION = version->declare('v1.0.1'); |
|
|
5
|
|
|
|
|
4093
|
|
|
|
5
|
|
|
|
|
27
|
|
|
7
|
5
|
|
|
5
|
|
406
|
use Carp; |
|
|
5
|
|
|
|
|
11
|
|
|
|
5
|
|
|
|
|
266
|
|
|
8
|
5
|
|
|
5
|
|
29
|
use Scalar::Util 'blessed'; |
|
|
5
|
|
|
|
|
9
|
|
|
|
5
|
|
|
|
|
239
|
|
|
9
|
5
|
|
|
5
|
|
739
|
use Class::Accessor::Fast 'antlers'; |
|
|
5
|
|
|
|
|
5373
|
|
|
|
5
|
|
|
|
|
30
|
|
|
10
|
5
|
|
|
5
|
|
1315
|
use Lingua::LO::NLP::Syllabify; |
|
|
5
|
|
|
|
|
22
|
|
|
|
5
|
|
|
|
|
39
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=encoding utf8 |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Lingua::LO::NLP::Romanize - Romanize Lao syllables |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 FUNCTION |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
This is a factory class for C<Lingua::LO::NLP::Romanize::*>. Currently there |
|
21
|
|
|
|
|
|
|
are the following romanization modules: |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=over 4 |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=item L<Lingua::LO::NLP::Romanize::PCGN> |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
for the standard set by the |
|
28
|
|
|
|
|
|
|
Permanent Committee on Geographical Names (PCGN) |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=item L<Lingua::LO::NLP::Romanize::IPA> |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
for the International Phonetic Alphabet |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=back |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
my $o = Lingua::LO::NLP::Romanize->new( |
|
39
|
|
|
|
|
|
|
variant => 'PCGN', |
|
40
|
|
|
|
|
|
|
hyphen => 1, |
|
41
|
|
|
|
|
|
|
); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 METHODS |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 new |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The constructor takes any number of hash-style named arguments. The following |
|
50
|
|
|
|
|
|
|
ones are always recognized: |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=over 4 |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item C<variant> |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Standard according to which to romanize; this determines the |
|
57
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Romanize> subclass to actually instantiate. This argument is |
|
58
|
|
|
|
|
|
|
mandatory. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item C<hyphen> |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Separate runs of Lao syllables with "hyphens". Set this to the character you |
|
63
|
|
|
|
|
|
|
would like to use as a hyphen - usually this will be the ASCII "hyphen minus" |
|
64
|
|
|
|
|
|
|
(U+002D) but it can be the unambiguous Unicode hyphen ("‐", U+2010), a slash or |
|
65
|
|
|
|
|
|
|
anything you like (except for the special-cased '0' and '1' - but you wouldn't |
|
66
|
|
|
|
|
|
|
want those between your syllables anyway!). As a special case, you can pass a 1 |
|
67
|
|
|
|
|
|
|
to use the ASCII version. If this argument is missing, C<undef> or C<0>, blanks |
|
68
|
|
|
|
|
|
|
are used. Syllables duplicated using "ໆ" are always joined with a hyphen: |
|
69
|
|
|
|
|
|
|
either the one you specify or the ASCII one. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item C<normalize> |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Run text through tone mark order normalization; see |
|
74
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Syllabify>. If your text looks fine but |
|
75
|
|
|
|
|
|
|
syllables are not recognized, you may need this. |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=back |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Subclasses may specify additional arguments, such as |
|
80
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Romanize::IPA>'s option that controls the rendering of |
|
81
|
|
|
|
|
|
|
IPA diacritics for tonal languages. |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# Factory constructor.
#
# Called on any class other than this package (i.e. on a subclass), it returns
# an empty hash blessed into that class, so subclasses need no constructor of
# their own. Called on this package, it takes the mandatory `variant` argument,
# loads the module <this package>::<variant>, instantiates it with the
# remaining arguments and then applies the `hyphen` and `normalize` settings.
#
# Croaks if `variant` is missing/false or is not a plain (optionally
# ::-separated) identifier; dies from require() if the module cannot be loaded.
sub new {
    my ($class, %args) = @_;

    # Allow subclasses to omit a constructor
    return bless {}, $class if $class ne __PACKAGE__;

    # If we've been called on Lingua::LO::NLP::Romanize, require a variant
    my $variant = delete $args{variant}
        or croak("`variant' argument missing or undefined");

    # The variant becomes part of a file path handed to require(), so refuse
    # anything that is not a plain identifier (path separators, "..", etc.).
    croak("Invalid `variant' argument: $variant")
        unless $variant =~ /\A[A-Za-z0-9_]+(?:::[A-Za-z0-9_]+)*\z/;

    my $hyphen    = delete $args{hyphen};
    my $normalize = delete $args{normalize};

    my $subclass = __PACKAGE__ . "::$variant";
    (my $module = $subclass) =~ s!::!/!g;
    require "$module.pm"; ## no critic (BarewordIncludes)

    my $self = $subclass->new(%args);

    # Pass an explicit false if hyphen arg was unset
    $self->hyphen($hyphen // 0);
    $self->normalize($normalize);
    return $self;
}
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 romanize |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
romanize( $text ) |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Return the romanization of C<$text> according to the standard passed to the |
|
113
|
|
|
|
|
|
|
constructor. Text is split up by |
|
114
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Syllabify/get_fragments>; Lao syllables are processed |
|
115
|
|
|
|
|
|
|
and everything else is passed through unchanged save for possible conversion of |
|
116
|
|
|
|
|
|
|
combining characters to a canonically equivalent form by |
|
117
|
|
|
|
|
|
|
L<Lingua::LO::NLP::Syllabify>. |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Romanize a complete text.
#
# The text is split into fragments by Lingua::LO::NLP::Syllabify (honouring
# this object's `normalize` setting). Each run of consecutive Lao fragments is
# romanized syllable by syllable and joined with the configured hyphen
# separator; each run of non-Lao fragments is appended unchanged.
#
# Returns the resulting string ('' if there are no fragments).
sub romanize {
    my ($self, $text) = @_;
    my $result = '';

    # Objects created directly through a subclass never went through
    # hyphen(), so the slot may be undef; fall back to the documented default
    # (a blank) instead of joining on undef.
    my $separator = $self->{hyphen} // ' ';

    my @frags = Lingua::LO::NLP::Syllabify->new( $text, normalize => $self->normalize )->get_fragments;
    while (@frags) {
        # Collect the leading run of Lao syllables (may be empty) ...
        my @lao;
        push @lao, shift @frags while @frags and $frags[0]->{is_lao};
        $result .= join($separator, map { $self->romanize_syllable( $_->{text} ) } @lao);

        # ... then pass the following run of non-Lao fragments through as-is.
        $result .= (shift @frags)->{text} while @frags and not $frags[0]->{is_lao};
    }
    return $result;
}
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=head2 romanize_syllable |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
romanize_syllable( $syllable | $analysis ) |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Return the romanization of a single C<$syllable> according to the standard |
|
140
|
|
|
|
|
|
|
passed to the constructor. This method accepts either a plain string or an |
|
141
|
|
|
|
|
|
|
analysis result from L<Lingua::LO::NLP::Analyze>. The latter helps avoid |
|
142
|
|
|
|
|
|
|
redundant parsing if you need both an analysis and a romanization. |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=cut |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# Romanize a single syllable.
#
# Accepts either a plain string or an object. A plain (unblessed) value is
# first turned into a Lingua::LO::NLP::Analyze object; any blessed value is
# assumed to already be an analysis result and is used untouched. The actual
# work is delegated to the subclass-provided _romanize_syllable().
sub romanize_syllable {
    my ($self, $thing) = @_;

    my $analysis = blessed($thing)
        ? $thing
        : Lingua::LO::NLP::Analyze->new($thing);

    return $self->_romanize_syllable($analysis);
}
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 _romanize_syllable |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
_romanize_syllable( $analysis ) |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Return the romanization of a syllable passed in as a 'Lingua::LO::NLP::Analyze' |
|
161
|
|
|
|
|
|
|
result, according to the standard passed to the constructor. This is a virtual |
|
162
|
|
|
|
|
|
|
method that must be implemented by subclasses. |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Abstract method: subclasses must override this to romanize one analyzed
# syllable. The base implementation only dies, naming the offending class.
sub _romanize_syllable {
    my $self = shift;

    # blessed() is undef for a class-method call; fall back to the invocant
    # string so the message always names a class and no "uninitialized"
    # warning is emitted.
    my $who = blessed($self) // $self // 'Subclass';
    die $who . " must implement _romanize_syllable()";
}
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head2 hyphen |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
my $hyphen = $o->hyphen; |
|
174
|
|
|
|
|
|
|
$o->hyphen( '-' ); # Use ASCII hyphen |
|
175
|
|
|
|
|
|
|
$o->hyphen( 1 ); # Ditto |
|
176
|
|
|
|
|
|
|
$o->hyphen( 0 ); # No hyphenation, separate syllables with spaces |
|
177
|
|
|
|
|
|
|
$o->hyphen( '‐' ); # Unicode hyphen U+2010 |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Accessor for the C<hyphen> attribute, see L</new>. |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Combined getter/setter for the syllable separator.
#
# Without a defined argument the stored separator is returned unchanged. With
# one, the special values '1' and '0' are mapped to the ASCII hyphen and a
# blank respectively; any other value is stored as-is. Always returns the
# separator currently stored.
sub hyphen {
    my ($self, $hyphen) = @_;

    # Getter mode: nothing (defined) to store.
    return $self->{hyphen} unless defined $hyphen;

    $self->{hyphen} = $hyphen eq '1' ? '-'
                    : $hyphen eq '0' ? ' '
                    :                  $hyphen;

    return $self->{hyphen};
}
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 normalize |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
my $normalization = $o->normalize; |
|
200
|
|
|
|
|
|
|
$o->normalize( $bool ); |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Accessor for the C<normalize> attribute, see L</new>. |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
has normalize => (is => 'rw'); |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
1; |
|
209
|
|
|
|
|
|
|
|