File Coverage

blib/lib/Text/Unicode/Equivalents.pm
Criterion Covered Total %
statement 112 124 90.3
branch 40 54 74.0
condition 6 6 100.0
subroutine 12 13 92.3
pod 0 2 0.0
total 170 199 85.4


line stmt bran cond sub pod time code
1             package Text::Unicode::Equivalents;
2              
3 1     1   161938 use strict;
  1         3  
  1         45  
4 1     1   6 use warnings;
  1         2  
  1         32  
5 1     1   1120 use utf8;
  1         16  
  1         5  
6 1     1   68651 use Unicode::Normalize qw(NFD getCanon getComposite getCombinClass);
  1         10137  
  1         171  
7 1     1   1486 use Unicode::UCD;
  1         81536  
  1         70  
8 1     1   7413 use Encode;
  1         18807  
  1         87  
9 1     1   7 use Carp;
  1         2  
  1         196  
10              
11             require 5.8.0; # Had some trouble with Unicode character handling in 5.6.
12              
13             require Exporter;
14              
15             our @ISA = qw(Exporter);
16             our @EXPORT_OK = qw( all_strings );
17              
18             our $VERSION = '0.05'; # RMH 2011-06-27
19             # Changes to Makefile.PL and tests.t to improve portability
20             # Added comments about \X being different on 5.10 vs 5.12
21             # our $VERSION = '0.04'; # RMH 2011-06-27
22             # Perl 5.14 doesn't have unicore/UnicodeData.txt, so changing to use unicode/Decomposition.pl
23             # our $VERSION = '0.03'; # RMH 2011-06-24
24             # Change module name to Text::Unicode::Equivalents -- more acceptable to CPAN
25             # Eliminate all but one public function, which is renamed all_strings()
26             # Previous version didn't synthesize singletons
27             # Eliminate hard-coding of %nonStarterComposites
28             # Eliminate $ignoreSingletons parameter -- not very useful and implementation was squirrely anyay
29             # our $VERSION = '0.02'; # RMH 2004-11-08
30             # Added equivalents()
31             # Rewrote permuteCompositeChar() so composes medial sequences as well as initial
32             # As a result, it now composes 0308+0301 -> 0344
33             # our $VERSION = '0.01'; # RMH 2003-05-02 Original
34              
35             =head1 NAME
36              
37             Text::Unicode::Equivalents - synthesize canonically equivalent strings
38              
39             =head1 SYNOPSIS
40              
41             use Text::Unicode::Equivalents qw( all_strings);
42            
43             $aref = all_strings ($string);
44             map {print "$_\n"} @{$aref};
45            
46             =head1 DESCRIPTION
47              
48             =cut
49              
50             # The two things I can't seem to make the Unicode module do are to (1) compose two diacritics, e.g.,
51             # <0308+0301> => 0344 (Unicode calls such decompositions "non-starters" and won't compose them) and
52             # (2) *compose* a singleton. So I use unicore/Decomposition.pl to generate two hashes:
53              
54             my %sSingletonCompositions; # keyed by single character string; returns its singleton composite, as a string.
55             my %cpNonStarterComposites; # keyed by two-character string that has a non-starter composition; returns codepoint of the composite.
56              
57             =over
58              
59             =item all_string($s)
60              
61             Given an arbitrary string, C
62             returns a reference to an unsorted array of all unique strings that are canonically
63             equivalent to the argument.
64              
65             =cut
66              
67             sub all_strings
68             {
69 26     26 0 17096 my ($s, $trace) = @_;
70 26         41 my $i;
71            
72             # If string starts with combining mark, prefix space so we get a proper cluster:
73             my $spaceAdded;
74 26 50       106 if ($s =~ /^\pM/)
75             {
76 0         0 $s = ' ' . $s ;
77 0         0 $spaceAdded = 1;
78             }
79            
80             # Split string into Extended Grapheme Clusters
81            
82             # NB:
83             # on Perl prior to v5.12, \X matches Unicode "combining character sequence", equivalent to (?>\PM\pM*)
84             # on Perl v5.12 and later, \X matches Unicode "eXtended grapheme cluster"
85             # Thus \X matches combining hangul jamo sequence such as "\x{1100}\x{1161}\x{11a8}" on 12.0, but not 10.1
86            
87 26         159 my @clusters = ($s =~ m/(\X)/g);
88            
89             # Generate all canonically equivalent permutation of each cluster:
90 26         1765 for $i (0 .. $#clusters)
91             {
92 41         90 $clusters[$i] = _permute_cluster ($clusters[$i], $trace);
93             # Note: result is a reference to an array!
94             }
95            
96             # Now rebuild all possible combinations of the clusters:
97 26         78 my $res = _generator (\@clusters);
98 26 50       105 if ($spaceAdded)
99             {
100             # Need to remove that leading space from each:
101 0         0 foreach $i (0 .. $#{$res})
  0         0  
102             {
103 0         0 $res->[$i] =~ s/^ //o;
104             }
105             }
106 26 50       53 if ($trace)
107             {
108 0         0 map { printMessage ($_) } @$res;
  0         0  
109             }
110            
111 26         263 return $res;
112             }
113              
114             # Given a reference to a list of arrays of strings, C returns reference to an unsorted list
115             # of all strings that can be generated by concatenating together one string from each array in the list.
116              
117             sub _generator
118             {
119 399     399   604 my ($a, # Initial parameter
120             $res, $i, $s # Parameters used in recursion
121             ) = @_;
122 399 100       894 unless ($res)
123             {
124 26         42 $res = {};
125 26         36 $i = 0;
126 26         39 $s = '';
127             }
128 399 100       424 if ($i > $#{$a})
  399         735  
129             {
130 298         583 $res->{$s} = 1;
131             }
132             else
133             {
134 101         221 foreach (@{$a->[$i]})
  101         187  
135             {
136 373         851 _generator ($a, $res, $i+1, $s . $_);
137             }
138             }
139 399 100       1320 return if $i > 1;
140 174         199 return [ keys %{$res} ];
  174         929  
141             }
142              
143              
144             # Given an L
145             # (EGC) C<_permute_cluster()> returns a reference to an unsorted array of all unique strings that
146             # are canonically equivalent to the EGC
147             #
148             # returns undef if the parameter is not an EGC, i.e. does not match C.
149              
150             # Implemented by brute force evaluation of all permutations so isn't too clever.
151             # Could be made more efficient, but since EGCs are short the inefficiency isn't huge.
152              
153             sub _permute_cluster {
154 41     41   71 my ($s, $trace) = @_;
155            
156             # make sure argument is an EGC
157 41 50       4545 return undef unless $s =~ /^\X$/;
158            
159             # retrieve required data from UnicodeData.txt
160 41 100       92 _getCompositions() unless %cpNonStarterComposites;
161            
162 41         600 my %res; # Place to keep result strings (as keys so we eliminate duplicates)
163              
164             # compute and save NFD of original -- we'll use it to tell whether a candidate
165             # is canonically equivalent to the original.
166 41         219 my $origNFD = NFD($s);
167            
168             # Start with fully decomposed string:
169 41         72 $s = $origNFD;
170 41 50       131 if (length($s) == 1)
171             {
172             # we can short-circuit the computation if the length of the decomposed string == 1
173 0 0       0 if (exists $sSingletonCompositions{$s})
174 0         0 { return [ $s, $sSingletonCompositions{$s} ]; }
175             else
176 0         0 { return [ $s ]; }
177             }
178              
179             # pick up the base character
180 41         101 my $base = substr($s, 0, 1);
181              
182             # Now calculate all permutations of everything else. We'll figure out whether a given
183             # permutation is canonically equivalent to the original in a minute.
184            
185 41         43 my %strList;
186 41         50 map { $strList{$base . $_ } = 1} @{_permute(substr($s,1))};
  99         253  
  41         107  
187              
188             # Try every one of the generated permutations of marks:
189 41         176 foreach $s (keys %strList)
190             {
191 99 100       9216 next if NFD($s) ne $origNFD; # Not equivalent to original -- ignore it.
192 57 50       133 next if exists $res{$s}; # Already seen this sequence
193              
194             # Now the fun! Generate every possible sequence from $s by composing pairs and singletons:
195            
196 57         137 my @work = ( [$s, 0] );
197 57         140 while ($#work >= 0)
198             {
199 485         553 my ($s, $i) = @{pop @work};
  485         728  
200 485 50       1574 printMessage ('POP:', $s, $i, length($s)) if $trace;
201 485 100       889 if ($i >= length ($s))
202             {
203             # We've worked our way to the end of the string. At this point we have
204             # some combination of composition and decomposition that should be equivalent
205             # to the original!
206 193         664 $res{$s} = 1; # Here's a keeper!
207             }
208             else
209             {
210 292         578 push @work, [ $s, $i+1 ];
211 292 100       905 if (exists $sSingletonCompositions{substr($s, $i, 1)})
212             {
213             # recompose this singleton and save result for work
214 30 50       57 printMessage("SING at $i:", substr($s, $i, 1), ' -> ', $sSingletonCompositions{substr($s, $i, 1)}) if $trace;
215 30         45 my $s2 = $s;
216 30         81 substr($s2, $i, 1) = $sSingletonCompositions{substr($s, $i, 1)};
217 30         69 push @work, [ $s2, $i ];
218             }
219 292         941 while ($i+1 < length($s))
220             {
221             # Try to combine two chars:
222 205         327 my $s2 = substr($s, $i, 2);
223 205         421 my ($u1, $u2) = unpack ( 'UU', $s2);
224 205   100     865 my $u = getComposite($u1, $u2) || $cpNonStarterComposites{$s2};
225 205 0       368 printMessage ("COMP at $i:", sprintf('%04X',$u1), sprintf('%04X',$u2), '->', defined $u ? sprintf('%04X',$u) : 'undef') if $trace;
    50          
226 205 100       697 last unless defined $u;
227 91         156 my $c = pack('U', $u);
228 91         201 substr($s, $i, 2) = $c;
229 91         226 push @work, [$s, $i+1];
230 91 100       370 if (exists $sSingletonCompositions{$c})
231             {
232 15 50       30 printMessage("SING at $i:", $c, '->', $sSingletonCompositions{$c}) if $trace;
233 15         21 my $s2 = $s;
234 15         38 substr($s2, $i, 1) = $sSingletonCompositions{$c};
235 15         86 push @work, [$s2, $i+1];
236             }
237             }
238             }
239             }
240             }
241              
242             # All done. Return the results
243 41         343 [ keys(%res) ]
244             }
245              
246             # I'm not happy with this hack. unicore/Decomposition.pl explicitly says the code is for internal use
247             # only, but I don't know any other reasonably efficient way to construct lists of Unicode compositions
248             # other than including my own copy of, for example UnicodeData.txt, but then I couldn't guarantee that
249             # my copy was in sync with the local Perl installation. Oh well.
250              
251             sub _getCompositions {
252             # Next few lines stolen shamelessly from Unicode::UCD
253 1     1   4169 for (split /^/m, do "unicore/Decomposition.pl") {
254 5679         43011 my ($start, $end, $decomp) = / ^ (.+?) \t (.*?) \t (.+?)
255             \s* ( \# .* )? # Optional comment
256             $ /x;
257 5679 100       13568 $end = $start if $end eq "";
258            
259 5679 100       22630 if ($decomp =~ /^([[:xdigit:]]{4,6})$/o) {
    100          
260             # Singleton decomposition -- keep a record of these:
261 1023         1477 my $d = $1;
262 1023         1842 foreach my $c (hex($start) .. hex($end)) {
263 1035         4840 $sSingletonCompositions{pack('U', hex($d))} = pack('U', $c); # NB: hash values are strings
264             }
265             }
266             elsif ($decomp =~ /^([[:xdigit:]]{4,6})\s+([[:xdigit:]]{4,6})$/o) {
267             # Possible non-starter decompsition
268 1018         1336 my ($d1, $d2) = map{hex} ($1, $2);
  2036         4041  
269 1018         2402 foreach my $c (hex($start) .. hex($end)) {
270 1018 100 100     6914 $cpNonStarterComposites{pack('UU', $d1, $d2)} = $c if getCombinClass($c) || getCombinClass($d1); # NB: hash values are codepoints
271             }
272             }
273             }
274             }
275              
276              
277             # Given a string, return a reference to an unsorted array containing
278             # all permutations of the string. Does not filter out duplicates which
279             # can result if one or more chars of the string are the same.
280              
281             # adaptation of the array permutation algorithm in FAQ 4
282             #(see "How do I permute N elements of a list?")
283             #
284             # I tried to making $list a hash rather than an array so as to eliminate duplicates,
285             # but Perl 5.6.1 had trouble figuring out that some strings were in fact
286             # UTF-8, so some data got munged. A hash would probably work on 5.8.
287              
288             sub _permute {
289 248     248   401 my ($src, # initial parameter
290             $res , $list # Parameters used in recursion
291             ) = @_;
292 248 100       477 unless ($list)
293             {
294 41         64 $list = {};
295 41         65 $res = '';
296             }
297 248 100       424 unless ($src) {
298 99         227 $list->{$res} = 1;
299             } else {
300 149         188 my($newsrc,$newres,$i);
301 149         300 foreach $i (0 .. length($src)-1) {
302 207         402 $newsrc = $src;
303 207         563 $newres = $res . substr($newsrc, $i, 1, "");
304 207         347 _permute($newsrc, $newres, $list);
305             }
306             }
307             # All done. Return the results
308 248         682 return [ keys %{$list} ];
  248         1092  
309             }
310              
311             sub printMessage
312             {
313 0     0 0   my $s = join(' ', @_);
314 0           print STDERR encode('ascii', $s, Encode::FB_PERLQQ) . "\n";
315             }
316              
317             1;
318              
319             =back
320              
321             =head1 BUGS
322              
323             Uses L. On some systems (e.g. ActiveState 5.6.1) Unicode::Normalize is aware
324             only of Unicode 3.0 and thus de/compositions introduced since Unicode 3.0 will not be used.
325              
326             =head1 AUTHOR
327              
328             Bob Hallissy
329              
330             =head1 COPYRIGHT
331              
332             Copyright(C) 2003-2011, SIL International.
333              
334             This package is published under the terms of the Perl Artistic License.