File Coverage

blib/lib/CEDict/Pinyin.pm
Criterion Covered Total %
statement 59 108 54.6
branch 19 50 38.0
condition 2 11 18.1
subroutine 10 14 71.4
pod 4 5 80.0
total 94 188 50.0


line stmt bran cond sub pod time code
1             package CEDict::Pinyin;
2             # Copyright (c) 2009 Christopher Davaz. All rights reserved.
3             # This program is free software; you can redistribute it and/or
4             # modify it under the same terms as Perl itself.
5 1     1   55045 use strict;
  1         2  
  1         46  
6 1     1   6 use warnings;
  1         13  
  1         42  
7 1     1   15 use vars qw($VERSION);
  1         5  
  1         61  
8 1     1   5 use base qw(Class::Light);
  1         3  
  1         2330  
9 1     1   2020 use Carp;
  1         3  
  1         65  
10 1     1   39081 use Encode;
  1         65330  
  1         1573  
11              
12             $VERSION = '0.02004';
13              
14             =encoding utf8
15              
16             =head1 NAME
17              
18             CEDict::Pinyin - Validates pinyin strings
19              
20             =head1 SYNOPSIS
21              
22             # This is from the test case provided with this module:
23              
24             use CEDict::Pinyin;
25              
26             my @good = ("ji2 - rui4 cheng2", "xi'an", "dian4 nao3, yuyan2", "kongzi");
27             my @bad = ("123", "not pinyin", "gu1 gu1 fsck4 fu3");
28             my $py = CEDict::Pinyin->new;
29              
30             for (@good) {
31             $py->setSource($_);
32             ok($py->isPinyin, "correctly validated good pinyin");
33             print "pinyin: " . $py->getSource . "\n";
34             print "parts: " . join(', ', @{$py->getParts}) . "\n";
35             }
36              
37             for (@bad) {
38             $py->setSource($_);
39             ok(!$py->isPinyin, "correctly invalidated bad pinyin");
40             print "pinyin: " . $py->getSource . "\n";
41             print "parts: " . join(', ', @{$py->getParts}) . "\n";
42             }
43              
44             =head1 DESCRIPTION
45              
46             This class helps you validate and parse pinyin. Currently the pinyin must follow
47             some rules about how it is formatted before being considered "valid" by this
48             class's validation method. All valid pinyin syllables are expressed by characters
49             within the 7-bit ASCII range. That means the validation method will fail on a
50             string like "nán nǚ lǎo shào". The pinyin should instead contain numbers after
51             the letter to represent tones. Instead of the string above we should use
52             "nan2 nv lao3 shao4". Being able to accept a string with accented characters
53             that represent the tone of the syllable is a feature I hope to add to a future
54             version of this module. The parser first takes a look at the entires string
55             you pass it to see if it is even worth parsing. The regular expression used
56             is shown below.
57              
58             C<< /^[A-Za-z]+[A-Za-z1-5,'\- ]*$/ >>
59              
60             If the pinyin doesn't match this regex, then isPinyin returns false and stops
61             parsing the string. All this means is that if you want to use this module to
62             validate your pinyin but your pinyin is not exactly in the same format as
63             just described then you need cleanup your pinyin strings a little bit first.
64              
65             Again, hopefully future versions of this class will be more flexible in what
66             is accepted as valid pinyin. However we want to be sure that what we are looking
67             at is really pinyin and not some English words as this module was originally
68             written in part to distinguish between a pinyin string and English. I would
69             also like to keep this idea in future versions, so if you update the class
70             with your own code, please keep that in mind.
71              
72             =cut
73              
74             # Class Data defined here
75             my %ValidPinyin;
76             $ValidPinyin{$_} = chomp $_ while ;
77             close DATA;
78              
79             # Characters containing the diacritic marks used in pinyin. The array is ordered
80             # by tone number (however since arrays are 0-indexed the first-tone diacritic
81             # information is at $Tones[0]).
82             my @Tones = (
83             {a=>"\x{0101}",e=>"\x{0113}",i=>"\x{012B}",o=>"\x{014D}",u=>"\x{016B}",v=>"\x{01D6}"},
84             {a=>"\x{00E1}",e=>"\x{00E9}",i=>"\x{00ED}",o=>"\x{00F3}",u=>"\x{00FA}",v=>"\x{01D8}"},
85             {a=>"\x{01CE}",e=>"\x{0115}",i=>"\x{01D0}",o=>"\x{01D2}",u=>"\x{01D4}",v=>"\x{01DA}"},
86             {a=>"\x{00E0}",e=>"\x{00E8}",i=>"\x{00EC}",o=>"\x{00F2}",u=>"\x{00F9}",v=>"\x{01DC}"}
87             );
88              
89             sub getValidPinyin {
90 0     0 0 0 return \%ValidPinyin;
91             }
92              
93             =head2 Methods
94              
95             =over 4
96              
97             =item C<< CEDict::Pinyin->new( >>IC<)>
98              
99             Creates a new CEDict::Pinyin object. I should be a string containing
100             the pinyin you want to work with. If I is ommited it can be set later
101             using the C method.
102              
103             =cut
104              
105             sub _init {
106 1     1   44 my $self = shift;
107 1   50     7 my $source = shift || '';
108 1         7 $self->{source} = $source;
109 1         4 $self->{parts} = [];
110             }
111              
112             =item C<< $obj->setSource( >>IC<)>
113              
114             Sets the source string to work with. Currently only the C method accesses
115             this attribute. Returns the previously set pinyin string.
116              
117             =cut
118              
119             sub setSource {
120 7     7 1 261 my $self = shift;
121 7         7 my $new = shift;
122 7         13 my $old = $self->{source};
123              
124 7         12 $self->{source} = $new;
125 7         13 $self->{parts} = [];
126              
127 7         19 return $old;
128             }
129              
130             =item C<< $obj->diacritic >>
131              
132             Returns the string of pinyin using diacritic marks instead of numbers to represent
133             tone information. For example, if the source pinyin is "nan2 nv lao3 shao4" then
134             C<< $obj->diacritic >> will return "nán nǚ lǎo shào".
135              
136             =cut
137              
138             # See http://www.pinyin.info/rules/where.html for a description of the
139             # rules regarding where tone marks go. These rules are copied here:
140             #
141             # * a and e trump all other vowels and always take the tone mark.
142             # There are no Mandarin syllables in Hanyu Pinyin that contain
143             # both a and e.
144             # * In the combination ou, o takes the mark.
145             # * In all other cases, the final vowel takes the mark.
146             #
147             sub diacritic {
148 0     0 1 0 my $self = shift;
149 0         0 my $source = $self->{source};
150 0         0 my $parts = [];
151 0 0       0 return unless $self->isPinyin($parts);
152 0         0 my @parts = @$parts;
153 0 0       0 return unless @parts;
154 0         0 my @string;
155              
156 0         0 for my $part (@parts) {
157 0         0 $part =~ s/(.*)([1-5])/$1/;
158 0 0       0 my $tone = ($2 ? $2 : 5) - 1;
159 0 0       0 if ($tone < 4) {
160 0 0       0 if ($part =~ /([ae])/) {
    0          
    0          
    0          
161 0         0 my $vowel = $1;
162 0         0 $part =~ s/$vowel/$Tones[$tone]{$vowel}/;
163             } elsif ($part =~ /ou/) {
164 0         0 $part =~ s/o/$Tones[$tone]{o}/;
165             } elsif ($part =~ /v/) {
166 0         0 $part =~ s/v/$Tones[$tone]{v}/;
167             } elsif (reverse $part =~ /([aeiou])/) {
168 0         0 my $vowel = $1;
169 0         0 $part =~ s/$vowel/$Tones[$tone]{$vowel}/;
170             }
171             }
172 0         0 push @string, $part;
173             }
174              
175 0         0 return encode('UTF-8', join (' ', @string));
176             }
177              
178             =item C<< $obj->getParts >>
179              
180             Returns the individually parsed elements of the pinyin source string.
181              
182             =cut
183              
184             # defined by Class::Light
185              
186             =item C<< $obj->getSource >>
187              
188             Returns the pinyin source string that was supplied earlier either via the
189             constructor or C<< $obj->setSource >>.
190              
191             =cut
192              
193             # defined by Class::Light
194              
195             =item C<< $obj->isPinyin >> I C<< $obj>->isPinyin( >>IC<)>
196              
197             Validates the pinyin supplied to the constructor or to C<< $obj->setSource(SCALAR) >>.
198             If an I is supplied as an argument, adds each syllable of the parsed pinyin
199             to the array. If a syllable is considered invalid then the method stops parsing and
200             immediately returns false. Returns true otherwise.
201              
202             =cut
203              
204             sub isPinyin {
205 7     7 1 29 my $self = shift;
206 7   33     31 my $parts = shift || $self->{parts};
207 7         11 my $source = $self->{source};
208 7 50       12 return unless $source;
209 7         13 $source = lc $source;
210 7 100       37 return 0 unless $source =~ /^[a-z]+[a-z1-5,'\- ]*$/;
211             # Find all the alphabetic characters before a syllable boundary ([1-5,'\- ]).
212             # The matched group may still consist of many syllables (for example, the
213             # string "shenjingbing"). So we still need to split this string into its
214             # constituent syllables.
215 6         36 my @result = $source =~ /([a-z]+[1-5]?)/g;
216 6         13 for my $validSubstring (@result) {
217 13         12 my $lastValidSubstring;
218             my $tone;
219              
220 13 100       51 $tone = $1 if $validSubstring =~ /([1-5])/;
221 13         34 $validSubstring =~ s/[1-5]//;
222              
223 13         31 while (1) {
224 15         18 $lastValidSubstring = $validSubstring;
225 15         26 $validSubstring = _getValidSubstring($validSubstring);
226 15 100       28 unless ($validSubstring) {
227 13 100       37 push @$parts, $lastValidSubstring . ($tone ? $tone : "");
228 13         15 last;
229             }
230 2         7 push @$parts, substr(
231             $lastValidSubstring,
232             0,
233             length($lastValidSubstring) - length($validSubstring)
234             );
235             }
236 13 100       39 return 0 unless defined $validSubstring;
237             }
238 4         21 return 1;
239             }
240              
241             =item C<< CEDict::Pinyin->buildRegex( >>IC<)>
242              
243             Takes a string containing pinyin and returns a regular expression that can be used with
244             the MySQL database (so far only tested against the 5.1 series). Accepts an asterisk ("*")
245             as a wildcard. Note that the C method will return false when validating such
246             a string, so if you plan on first validating the pinyin then generating the regex, make
247             sure you are validating the string without the asterisks C<($string =~ s/\*//g)>.
248              
249             =back
250              
251             =cut
252              
253             sub buildRegex {
254 0     0 1 0 my $self = shift;
255 0 0       0 my $source = shift or return;
256 0         0 $source =~ s/\*+/\*/g; # Collapse redundant wildcards into one
257 0         0 my @reParts = ($source =~ /(\*|[^*]+)/g);
258 0         0 my $regex = '^';
259              
260 0         0 for (my $h = 0; $h < @reParts; $h++) {
261 0         0 $_ = $reParts[$h];
262 0 0       0 if ($_ eq '*') {
263 0         0 $regex .= '.*';
264 0         0 next;
265             }
266 0         0 my $pinyin = __PACKAGE__->new($_);
267 0         0 my $parts = []; $pinyin->isPinyin($parts);
  0         0  
268 0         0 my @parts = @$parts;
269 0 0       0 return unless @parts;
270              
271             # Check if last part is a valid pinyin substring
272 0 0       0 return unless _isValidInitialSubstring($parts[$#parts]);
273              
274             # Use parts to construct a MySQL regular expression
275 0         0 for (my $i = 0; $i < @parts; $i++) {
276 0         0 $regex .= $parts[$i];
277 0 0       0 $regex .= '[1-5]?' unless $parts[$i] =~ /[1-5]/;
278 0 0 0     0 unless (
      0        
279             ($h == $#reParts
280             || ($h == ($#reParts - 1) && $reParts[$#reParts] eq '*'))
281             && $i == $#parts) {
282 0         0 $regex .= "[,' -]";
283             }
284             }
285             }
286              
287 0         0 $regex .= '$';
288 0         0 return $regex;
289             }
290              
291             # Private Static Method _isValidInitialSubstring
292             #
293             # Checks if $startsWith is the beginning of a valid pinyin string
294             sub _isValidInitialSubstring {
295 0     0   0 my $startsWith = shift;
296 0         0 $startsWith =~ s/[1-5]//;
297 0         0 for (keys %ValidPinyin) {
298 0 0       0 return 1 if /^$startsWith/;
299             }
300 0         0 return 0;
301             }
302              
303             # Private Static Method _getValidSubstring
304             #
305             # _getValidSubstring returns the empty string if the entire $syllable matches
306             # a valid pinyin syllable. If only a portion of the string (starting from the
307             # beginning of the string) matches, then the rest of the string that didn't
308             # match is returned. If a match can't be found undef is returned.
309             sub _getValidSubstring {
310 15     15   24 my $syllable = shift;
311 15         17 my $part = undef;
312 15         15 my $valid = 0;
313 15 100       32 my $max = length($syllable) < 6 ? length($syllable) + 1 : 7;
314              
315             # Do a quick lookup to see if the whole $syllable matches
316 15 100       50 return "" if exists $ValidPinyin{$syllable};
317              
318             # Find the longest valid syllable
319 4         10 for (my $i = 1; $i < $max; $i++) {
320 18         25 $part = substr $syllable, 0, $i;
321 18 100       59 $valid = $i if exists $ValidPinyin{$part};
322             }
323              
324             # $syllable is invalid so return undef
325 4 100       11 return undef unless $valid;
326              
327             # Get only the valid part of $syllable
328 2         5 $part = substr $syllable, 0, $valid;
329              
330 2         6 return substr $syllable, length($part), length($syllable);
331             }
332              
333             1;
334              
335             =head1 AUTHOR
336              
337             Christopher Davaz www.chrisdavaz.com cdavaz@gmail.com
338              
339             =head1 VERSION
340              
341             Version 0.02004 (Mar 01 2010)
342              
343             =head1 COPYRIGHT
344              
345             Copyright (c) 2009 Christopher Davaz. All rights reserved.
346             This program is free software; you can redistribute it and/or
347             modify it under the same terms as Perl itself.
348              
349             =cut
350              
351             __DATA__