File Coverage

blib/lib/Lingua/LO/NLP/Romanize.pm
Criterion Covered Total %
statement 61 61 100.0
branch 10 12 83.3
condition 8 8 100.0
subroutine 14 14 100.0
pod 4 4 100.0
total 97 99 97.9


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Romanize;
2 5     5   291849 use strict;
  5         26  
  5         145  
3 5     5   25 use warnings;
  5         9  
  5         121  
4 5     5   90 use 5.012000;
  5         17  
5 5     5   26 use utf8;
  5         10  
  5         30  
6 5     5   815 use version 0.77; our $VERSION = version->declare('v1.0.1');
  5         4304  
  5         31  
7 5     5   458 use Carp;
  5         12  
  5         304  
8 5     5   32 use Scalar::Util 'blessed';
  5         8  
  5         226  
9 5     5   730 use Class::Accessor::Fast 'antlers';
  5         5671  
  5         35  
10 5     5   1273 use Lingua::LO::NLP::Syllabify;
  5         22  
  5         56  
11              
12             =encoding utf8
13              
14             =head1 NAME
15              
16             Lingua::LO::NLP::Romanize - Romanize Lao syllables
17              
18             =head1 FUNCTION
19              
20             This is a factory class for C<Lingua::LO::NLP::Romanize::*>. Currently there
21             are the following romanization modules:
22              
23             =over 4
24              
25             =item L<Lingua::LO::NLP::Romanize::PCGN>
26              
27             for the standard set by the
28             Permanent Committee on Geographical Names for British Official Use (PCGN)
29              
30             =item L<Lingua::LO::NLP::Romanize::IPA>
31              
32             for the International Phonetic Alphabet
33              
34             =back
35              
36             =head1 SYNOPSIS
37              
38             my $o = Lingua::LO::NLP::Romanize->new(
39             variant => 'PCGN',
40             hyphen => 1,
41             );
42              
43             =cut
44              
45             =head1 METHODS
46              
47             =head2 new
48              
49             The constructor takes any number of hash-style named arguments. The following
50             ones are always recognized:
51              
52             =over 4
53              
54             =item C<variant>
55              
56             Standard according to which to romanize; this determines the
57             L<Lingua::LO::NLP::Romanize> subclass to actually instantiate. This argument is
58             mandatory.
59              
60             =item C<hyphen>
61              
62             Separate runs of Lao syllables with "hyphens". Set this to the character you
63             would like to use as a hyphen - usually this will be the ASCII "hyphen minus"
64             (U+002D) but it can be the unambiguous Unicode hyphen ("‐", U+2010), a slash or
65             anything you like (except for the special-cased '0' and '1' - but you wouldn't
66             want those between your syllables anyway!). As a special case, you can pass a 1
67             to use the ASCII version. If this argument is missing, C<undef> or C<0>, blanks
68             are used. Syllables duplicated using "ໆ" are always joined with a hyphen:
69             either the one you specify or the ASCII one.
70              
71             =item C<normalize>
72              
73             Run text through tone mark order normalization; see
74             L. If your text looks fine but
75             syllables are not recognized, you may need this.
76              
77             =back
78              
79             Subclasses may specify additional arguments, such as
80             L's C that controls the rendering of
81             IPA diacritics for tonal languages.
82              
83             =cut
84              
85             sub new {
86 11     11 1 4146 my ($class, %args) = @_;
87              
88             # Allow subclasses to omit a constructor
89 11 100       56 return bless {}, $class if $class ne __PACKAGE__;
90              
91             # If we've been called on Lingua::LO::NLP::Romanize, require a variant
92 10 100       360 my $variant = delete $args{variant} or croak("`variant' argument missing or undefined");
93 9         26 my $hyphen = delete $args{hyphen};
94 9         20 my $normalize = delete $args{normalize};
95              
96 9         72 my $subclass = __PACKAGE__ . "::$variant";
97 9         78 (my $module = $subclass) =~ s!::!/!g;
98 9         2432 require "$module.pm"; ## no critic (BarewordIncludes)
99              
100 9         218 my $self = $subclass->new(%args);
101              
102             # Pass an explicit false if hyphen arg was unset
103 9   100     113 $self->hyphen($hyphen // 0);
104 9         288 $self->normalize($normalize);
105 9         118 return $self;
106             }
107              
108             =head2 romanize
109              
110             romanize( $text )
111              
112             Return the romanization of C<$text> according to the standard passed to the
113             constructor. Text is split up by
114             L<Lingua::LO::NLP::Syllabify>; Lao syllables are processed
115             and everything else is passed through unchanged save for possible conversion of
116             combining characters to a canonically equivalent form by
117             L.
118              
119             =cut
120              
121             sub romanize {
122 50     50 1 19158 my ($self, $text) = @_;
123 50         132 my $result = '';
124              
125 50         1188 my @frags = Lingua::LO::NLP::Syllabify->new( $text, normalize => $self->normalize )->get_fragments;
126 50         239 while(@frags) {
127 51         84 my @lao;
128 51   100     408 push @lao, shift @frags while @frags and $frags[0]->{is_lao};
129 51         129 $result .= join($self->{hyphen}, map { $self->romanize_syllable( $_->{text} ) } @lao);
  100         290  
130 50   100     296 $result .= (shift @frags)->{text} while @frags and not $frags[0]->{is_lao};
131             }
132 49         370 return $result;
133             }
134              
135             =head2 romanize_syllable
136              
137             romanize_syllable( $syllable | $analysis )
138              
139             Return the romanization of a single C<$syllable> according to the standard
140             passed to the constructor. This method accepts either a plain string or an
141             analysis result from L<Lingua::LO::NLP::Analyze>. The latter helps avoid
142             redundant parsing if you need both an analysis and a romanization.
143              
144             =cut
145              
146             sub romanize_syllable {
147 100     100 1 209 my ($self, $thing) = @_;
148 100 50       293 unless( blessed($thing) ) {
149             # Analyze syllable first unless we got an analysis result already
150             # (we just assume it is one if we have an object)
151 100         335 $thing = Lingua::LO::NLP::Analyze->new($thing);
152             }
153 100         375 return $self->_romanize_syllable( $thing );
154             }
155              
156             =head2 _romanize_syllable
157              
158             _romanize_syllable( $analysis )
159              
160             Return the romanization of a syllable passed in as a 'Lingua::LO::NLP::Analyze'
161             result, according to the standard passed to the constructor. This is a virtual
162             method that must be implemented by subclasses.
163              
164             =cut
165              
166             sub _romanize_syllable {
167 1     1   3 my $self = shift;
168 1         34 die blessed($self) . " must implement _romanize_syllable()";
169             }
170              
171             =head2 hyphen
172              
173             my $hyphen = $o->hyphen;
174             $o->hyphen( '-' ); # Use ASCII hyphen
175             $o->hyphen( 1 ); # Ditto
176             $o->hyphen( 0 ); # No hyphenation, separate syllables with spaces
177             $o->hyphen( '‐' ); # Unicode hyphen U+2010
178              
179             Accessor for the C<hyphen> attribute, see L</new>.
180              
181             =cut
182              
183             sub hyphen {
184 12     12 1 43 my ($self, $hyphen) = @_;
185 12 50       45 if(defined $hyphen) {
186 12 100       53 if($hyphen eq '1') {
    100          
187 2         12 $self->{hyphen} = '-';
188             } elsif($hyphen eq '0') {
189 7         47 $self->{hyphen} = ' ';
190             } else {
191 3         12 $self->{hyphen} = $hyphen;
192             }
193             }
194 12         30 return $self->{hyphen};
195             }
196              
197             =head2 normalize
198              
199             my $normalization = $o->normalize;
200             $o->normalize( $bool );
201              
202             Accessor for the C<normalize> attribute, see L</new>.
203              
204             =cut
205              
206             has normalize => (is => 'rw');
207              
208             1;
209