File Coverage

blib/lib/Lingua/LO/NLP/Romanize.pm
Criterion Covered Total %
statement 61 61 100.0
branch 10 12 83.3
condition 8 8 100.0
subroutine 14 14 100.0
pod 4 4 100.0
total 97 99 97.9


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Romanize;
2 5     5   406083 use strict;
  5         21  
  5         146  
3 5     5   27 use warnings;
  5         11  
  5         302  
4 5     5   176 use 5.012000;
  5         22  
5 5     5   27 use utf8;
  5         12  
  5         34  
6 5     5   1020 use version 0.77; our $VERSION = version->declare('v1.0.1');
  5         6099  
  5         43  
7 5     5   554 use Carp;
  5         13  
  5         386  
8 5     5   33 use Scalar::Util 'blessed';
  5         10  
  5         251  
9 5     5   854 use Class::Accessor::Fast 'antlers';
  5         8306  
  5         45  
10 5     5   1578 use Lingua::LO::NLP::Syllabify;
  5         28  
  5         61  
11              
12             =encoding utf8
13              
14             =head1 NAME
15              
16             Lingua::LO::NLP::Romanize - Romanize Lao syllables
17              
18             =head1 FUNCTION
19              
20             This is a factory class for C<Lingua::LO::NLP::Romanize::*>. Currently there
21             are the following romanization modules:
22              
23             =over 4
24              
25             =item L<Lingua::LO::NLP::Romanize::PCGN>
26              
27             for the standard set by the
28             L
29              
30             =item L<Lingua::LO::NLP::Romanize::IPA>
31              
32             for the International Phonetic Alphabet
33              
34             =back
35              
36             =head1 SYNOPSIS
37              
38             my $o = Lingua::LO::NLP::Romanize->new(
39             variant => 'PCGN',
40             hyphen => 1,
41             );
42              
43             =cut
44              
45             =head1 METHODS
46              
47             =head2 new
48              
49             The constructor takes any number of hash-style named arguments. The following
50             ones are always recognized:
51              
52             =over 4
53              
54             =item C<variant>
55              
56             Standard according to which to romanize; this determines the
57             L<Lingua::LO::NLP::Romanize> subclass to actually instantiate. This argument is
58             mandatory.
59              
60             =item C<hyphen>
61              
62             Separate runs of Lao syllables with "hyphens". Set this to the character you
63             would like to use as a hyphen - usually this will be the ASCII "hyphen minus"
64             (U+002D) but it can be the unambiguous Unicode hyphen ("‐", U+2010), a slash or
65             anything you like (except for the special-cased '0' and '1' - but you wouldn't
66             want those between your syllables anyway!). As a special case, you can pass a 1
67             to use the ASCII version. If this argument is missing, C<undef> or C<0>, blanks
68             are used. Syllables duplicated using "ໆ" are always joined with a hyphen:
69             either the one you specify or the ASCII one.
70              
71             =item C<normalize>
72              
73             Run text through tone mark order normalization; see
74             L. If your text looks fine but
75             syllables are not recognized, you may need this.
76              
77             =back
78              
79             Subclasses may specify additional arguments, such as
80             L's C that controls the rendering of
81             IPA diacritics for tonal languages.
82              
83             =cut
84              
85             sub new {
86 11     11 1 3036 my ($class, %args) = @_;
87              
88             # Allow subclasses to omit a constructor
89 11 100       45 return bless {}, $class if $class ne __PACKAGE__;
90              
91             # If we've been called on Lingua::LO::NLP::Romanize, require a variant
92 10 100       233 my $variant = delete $args{variant} or croak("`variant' argument missing or undefined");
93 9         21 my $hyphen = delete $args{hyphen};
94 9         17 my $normalize = delete $args{normalize};
95              
96 9         64 my $subclass = __PACKAGE__ . "::$variant";
97 9         67 (my $module = $subclass) =~ s!::!/!g;
98 9         1954 require "$module.pm"; ## no critic (BarewordIncludes)
99              
100 9         213 my $self = $subclass->new(%args);
101              
102             # Pass an explicit false if hyphen arg was unset
103 9   100     80 $self->hyphen($hyphen // 0);
104 9         57 $self->normalize($normalize);
105 9         124 return $self;
106             }
107              
108             =head2 romanize
109              
110             romanize( $text )
111              
112             Return the romanization of C<$text> according to the standard passed to the
113             constructor. Text is split up by
114             L<Lingua::LO::NLP::Syllabify/get_fragments>; Lao syllables are processed
115             and everything else is passed through unchanged save for possible conversion of
116             combining characters to a canonically equivalent form by
117             L.
118              
119             =cut
120              
121             sub romanize {
122 50     50 1 16971 my ($self, $text) = @_;
123 50         86 my $result = '';
124              
125 50         128 my @frags = Lingua::LO::NLP::Syllabify->new( $text, normalize => $self->normalize )->get_fragments;
126 50         172 while(@frags) {
127 51         65 my @lao;
128 51   100     312 push @lao, shift @frags while @frags and $frags[0]->{is_lao};
129 51         117 $result .= join($self->{hyphen}, map { $self->romanize_syllable( $_->{text} ) } @lao);
  100         251  
130 50   100     256 $result .= (shift @frags)->{text} while @frags and not $frags[0]->{is_lao};
131             }
132 49         304 return $result;
133             }
134              
135             =head2 romanize_syllable
136              
137             romanize_syllable( $syllable | $analysis )
138              
139             Return the romanization of a single C<$syllable> according to the standard
140             passed to the constructor. This method accepts either a plain string or an
141             analysis result from L<Lingua::LO::NLP::Analyze>. The latter helps avoid
142             redundant parsing if you need both an analysis and a romanization.
143              
144             =cut
145              
146             sub romanize_syllable {
147 100     100 1 181 my ($self, $thing) = @_;
148 100 50       252 unless( blessed($thing) ) {
149             # Analyze syllable first unless we got an analysis result already
150             # (we just assume it is one if we have an object)
151 100         261 $thing = Lingua::LO::NLP::Analyze->new($thing);
152             }
153 100         284 return $self->_romanize_syllable( $thing );
154             }
155              
156             =head2 _romanize_syllable
157              
158             _romanize_syllable( $analysis )
159              
160             Return the romanization of a syllable passed in as a 'Lingua::LO::NLP::Analyze'
161             result, according to the standard passed to the constructor. This is a virtual
162             method that must be implemented by subclasses.
163              
164             =cut
165              
166             sub _romanize_syllable {
167 1     1   2 my $self = shift;
168 1         23 die blessed($self) . " must implement _romanize_syllable()";
169             }
170              
171             =head2 hyphen
172              
173             my $hyphen = $o->hyphen;
174             $o->hyphen( '-' ); # Use ASCII hyphen
175             $o->hyphen( 1 ); # Ditto
176             $o->hyphen( 0 ); # No hyphenation, separate syllables with spaces
177             $o->hyphen( '‐' ); # Unicode hyphen U+2010
178              
179             Accessor for the C<hyphen> attribute, see L</new>.
180              
181             =cut
182              
183             sub hyphen {
184 12     12 1 33 my ($self, $hyphen) = @_;
185 12 50       35 if(defined $hyphen) {
186 12 100       61 if($hyphen eq '1') {
    100          
187 2         13 $self->{hyphen} = '-';
188             } elsif($hyphen eq '0') {
189 7         44 $self->{hyphen} = ' ';
190             } else {
191 3         7 $self->{hyphen} = $hyphen;
192             }
193             }
194 12         29 return $self->{hyphen};
195             }
196              
197             =head2 normalize
198              
199             my $normalization = $o->normalize;
200             $o->normalize( $bool );
201              
202             Accessor for the C<normalize> attribute, see L</new>.
203              
204             =cut
205              
206             has normalize => (is => 'rw');
207              
208             1;
209