File Coverage

blib/lib/Lingua/LO/NLP/Romanize.pm

Criterion	Covered	Total	%
statement	61	61	100.0
branch	10	12	83.3
condition	8	8	100.0
subroutine	14	14	100.0
pod	4	4	100.0
total	97	99	97.9

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::LO::NLP::Romanize;
2	5			5		259072	use strict;
	5					18
	5					150
3	5			5		23	use warnings;
	5					9
	5					119
4	5			5		93	use 5.012000;
	5					15
5	5			5		23	use utf8;
	5					5
	5					30
6	5			5		750	use version 0.77; our $VERSION = version->declare('v1.0.1');
	5					4093
	5					27
7	5			5		406	use Carp;
	5					11
	5					266
8	5			5		29	use Scalar::Util 'blessed';
	5					9
	5					239
9	5			5		739	use Class::Accessor::Fast 'antlers';
	5					5373
	5					30
10	5			5		1315	use Lingua::LO::NLP::Syllabify;
	5					22
	5					39
11
12							=encoding utf8
13
14							=head1 NAME
15
16							Lingua::LO::NLP::Romanize - Romanize Lao syllables
17
18							=head1 FUNCTION
19
20							This is a factory class for C<Lingua::LO::NLP::Romanize::*>. Currently there
21							are the following romanization modules:
22
23							=over 4
24
25							=item L<Lingua::LO::NLP::Romanize::PCGN>
26
27							for the standard set by the
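28							L<Permanent Committee on Geographical Names|https://www.gov.uk/government/groups/the-permanent-committee-on-geographic-names>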
29
30							=item L<Lingua::LO::NLP::Romanize::IPA>
31
32							for the International Phonetic Alphabet
33
34							=back
35
36							=head1 SYNOPSIS
37
38							my $o = Lingua::LO::NLP::Romanize->new(
39							variant => 'PCGN',
40							hyphen => 1,
41							);
42
43							=cut
44
45							=head1 METHODS
46
47							=head2 new
48
49							The constructor takes any number of hash-style named arguments. The following
50							ones are always recognized:
51
52							=over 4
53
54							=item C<variant>
55
56							Standard according to which to romanize; this determines the
57							L<Lingua::LO::NLP::Romanize> subclass to actually instantiate. This argument is
58							mandatory.
59
60							=item C<hyphen>
61
62							Separate runs of Lao syllables with "hyphens". Set this to the character you
63							would like to use as a hyphen - usually this will be the ASCII "hyphen minus"
64							(U+002D) but it can be the unambiguous Unicode hyphen ("‐", U+2010), a slash or
65							anything you like (except for the special-cased '0' and '1' - but you wouldn't
66							want those between your syllables anyway!). As a special case, you can pass a 1
67							to use the ASCII version. If this argument is missing, C<undef> or C<0>, blanks
68							are used. Syllables duplicated using "ໆ" are always joined with a hyphen:
69							either the one you specify or the ASCII one.
70
71							=item C<normalize>
72
73							Run text through tone mark order normalization; see
74							L<Lingua::LO::NLP::Syllabify/new>. If your text looks fine but
75							syllables are not recognized, you may need this.
76
77							=back
78
79							Subclasses may specify additional arguments, such as
80							an option that controls the rendering of
81							IPA diacritics for tonal languages; see the respective subclass documentation.
82
83							=cut
84
85							sub new {
86	11			11	1	3256	my ($class, %args) = @_;
87
88							# Allow subclasses to omit a constructor
89	11	100				49	return bless {}, $class if $class ne __PACKAGE__;
90
91							# If we've been called on Lingua::LO::NLP::Romanize, require a variant
92	10	100				490	my $variant = delete $args{variant} or croak("`variant' argument missing or undefined");
93	9					22	my $hyphen = delete $args{hyphen};
94	9					21	my $normalize = delete $args{normalize};
95
96	9					64	my $subclass = __PACKAGE__ . "::$variant";
97	9					75	(my $module = $subclass) =~ s!::!/!g;
98	9					1444	require "$module.pm"; ## no critic (BarewordIncludes)
99
100	9					179	my $self = $subclass->new(%args);
101
102							# Pass an explicit false if hyphen arg was unset
103	9		100			73	$self->hyphen($hyphen // 0);
104	9					248	$self->normalize($normalize);
105	9					155	return $self;
106							}
107
108							=head2 romanize
109
110							romanize( $text )
111
112							Return the romanization of C<$text> according to the standard passed to the
113							constructor. Text is split up by
114							L<Lingua::LO::NLP::Syllabify/get_fragments>; Lao syllables are processed
115							and everything else is passed through unchanged save for possible conversion of
116							combining characters to a canonically equivalent form by
117							L<Unicode::Normalize>.
118
119							=cut
120
121							sub romanize {
122	50			50	1	25014	my ($self, $text) = @_;
123	50					112	my $result = '';
124
125	50					1174	my @frags = Lingua::LO::NLP::Syllabify->new( $text, normalize => $self->normalize )->get_fragments;
126	50					231	while(@frags) {
127	51					108	my @lao;
128	51		100			426	push @lao, shift @frags while @frags and $frags[0]->{is_lao};
129	51					156	$result .= join($self->{hyphen}, map { $self->romanize_syllable( $_->{text} ) } @lao);
	100					290
130	50		100			349	$result .= (shift @frags)->{text} while @frags and not $frags[0]->{is_lao};
131							}
132	49					408	return $result;
133							}
134
135							=head2 romanize_syllable
136
137							romanize_syllable( $syllable | $analysis )
138
139							Return the romanization of a single C<$syllable> according to the standard
140							passed to the constructor. This method accepts either a plain string or an
141							analysis result from L<Lingua::LO::NLP::Analyze>. The latter helps avoid
142							redundant parsing if you need both an analysis and a romanization.
143
144							=cut
145
146							sub romanize_syllable {
147	100			100	1	214	my ($self, $thing) = @_;
148	100	50				317	unless( blessed($thing) ) {
149							# Analyze syllable first unless we got an analysis result already
150							# (we just assume it is one if we have an object)
151	100					353	$thing = Lingua::LO::NLP::Analyze->new($thing);
152							}
153	100					359	return $self->_romanize_syllable( $thing );
154							}
155
156							=head2 _romanize_syllable
157
158							_romanize_syllable( $analysis )
159
160							Return the romanization of a syllable passed in as a 'Lingua::LO::NLP::Analyze'
161							result, according to the standard passed to the constructor. This is a virtual
162							method that must be implemented by subclasses.
163
164							=cut
165
166							sub _romanize_syllable {
167	1			1		2	my $self = shift;
168	1					18	die blessed($self) . " must implement _romanize_syllable()";
169							}
170
171							=head2 hyphen
172
173							my $hyphen = $o->hyphen;
174							$o->hyphen( '-' ); # Use ASCII hyphen
175							$o->hyphen( 1 ); # Ditto
176							$o->hyphen( 0 ); # No hyphenation, separate syllables with spaces
177							$o->hyphen( '‐' ); # Unicode hyphen U+2010
178
179							Accessor for the C<hyphen> attribute, see L</new>.
180
181							=cut
182
183							sub hyphen {
184	12			12	1	33	my ($self, $hyphen) = @_;
185	12	50				35	if(defined $hyphen) {
186	12	100				46	if($hyphen eq '1') {
		100
187	2					8	$self->{hyphen} = '-';
188							} elsif($hyphen eq '0') {
189	7					28	$self->{hyphen} = ' ';
190							} else {
191	3					10	$self->{hyphen} = $hyphen;
192							}
193							}
194	12					24	return $self->{hyphen};
195							}
196
197							=head2 normalize
198
199							my $normalization = $o->normalize;
200							$o->normalize( $bool );
201
202							Accessor for the C<normalize> attribute, see L</new>.
203
204							=cut
205
206							has normalize => (is => 'rw');
207
208							1;
209