File Coverage

blib/lib/Text/Unicode/Equivalents.pm

Criterion	Covered	Total	%
statement	112	124	90.3
branch	40	54	74.0
condition	6	6	100.0
subroutine	12	13	92.3
pod	0	2	0.0
total	170	199	85.4

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Unicode::Equivalents;
2
3	1			1		161938	use strict;
	1					3
	1					45
4	1			1		6	use warnings;
	1					2
	1					32
5	1			1		1120	use utf8;
	1					16
	1					5
6	1			1		68651	use Unicode::Normalize qw(NFD getCanon getComposite getCombinClass);
	1					10137
	1					171
7	1			1		1486	use Unicode::UCD;
	1					81536
	1					70
8	1			1		7413	use Encode;
	1					18807
	1					87
9	1			1		7	use Carp;
	1					2
	1					196
10
11							require 5.8.0; # Had some trouble with Unicode character handling in 5.6.
12
13							require Exporter;
14
15							our @ISA = qw(Exporter);
16							our @EXPORT_OK = qw( all_strings );
17
18							our $VERSION = '0.05'; # RMH 2011-06-27
19							# Changes to Makefile.PL and tests.t to improve portability
20							# Added comments about \X being different on 5.10 vs 5.12
21							# our $VERSION = '0.04'; # RMH 2011-06-27
22							# Perl 5.14 doesn't have unicore/UnicodeData.txt, so changing to use unicode/Decomposition.pl
23							# our $VERSION = '0.03'; # RMH 2011-06-24
24							# Change module name to Text::Unicode::Equivalents -- more acceptable to CPAN
25							# Eliminate all but one public function, which is renamed all_strings()
26							# Previous version didn't synthesize singletons
27							# Eliminate hard-coding of %nonStarterComposites
28							# Eliminate $ignoreSingletons parameter -- not very useful and implementation was squirrely anyay
29							# our $VERSION = '0.02'; # RMH 2004-11-08
30							# Added equivalents()
31							# Rewrote permuteCompositeChar() so composes medial sequences as well as initial
32							# As a result, it now composes 0308+0301 -> 0344
33							# our $VERSION = '0.01'; # RMH 2003-05-02 Original
34
35							=head1 NAME
36
37							Text::Unicode::Equivalents - synthesize canonically equivalent strings
38
39							=head1 SYNOPSIS
40
41							use Text::Unicode::Equivalents qw( all_strings);
42
43							$aref = all_strings ($string);
44							map {print "$_\n"} @{$aref};
45
46							=head1 DESCRIPTION
47
48							=cut
49
50							# The two things I can't seem to make the Unicode module do are to (1) compose two diacritics, e.g.,
51							# <0308+0301> => 0344 (Unicode calls such decompositions "non-starters" and won't compose them) and
52							# (2) compose a singleton. So I use unicore/Decomposition.pl to generate two hashes:
53
54							my %sSingletonCompositions; # keyed by single character string; returns its singleton composite, as a string.
55							my %cpNonStarterComposites; # keyed by two-character string that has a non-starter composition; returns codepoint of the composite.
56
57							=over
58
59							=item all_string($s)
60
61							Given an arbitrary string, C
62							returns a reference to an unsorted array of all unique strings that are canonically
63							equivalent to the argument.
64
65							=cut
66
67							sub all_strings
68							{
69	26			26	0	17096	my ($s, $trace) = @_;
70	26					41	my $i;
71
72							# If string starts with combining mark, prefix space so we get a proper cluster:
73							my $spaceAdded;
74	26	50				106	if ($s =~ /^\pM/)
75							{
76	0					0	$s = ' ' . $s ;
77	0					0	$spaceAdded = 1;
78							}
79
80							# Split string into Extended Grapheme Clusters
81
82							# NB:
83							# on Perl prior to v5.12, \X matches Unicode "combining character sequence", equivalent to (?>\PM\pM*)
84							# on Perl v5.12 and later, \X matches Unicode "eXtended grapheme cluster"
85							# Thus \X matches combining hangul jamo sequence such as "\x{1100}\x{1161}\x{11a8}" on 12.0, but not 10.1
86
87	26					159	my @clusters = ($s =~ m/(\X)/g);
88
89							# Generate all canonically equivalent permutation of each cluster:
90	26					1765	for $i (0 .. $#clusters)
91							{
92	41					90	$clusters[$i] = _permute_cluster ($clusters[$i], $trace);
93							# Note: result is a reference to an array!
94							}
95
96							# Now rebuild all possible combinations of the clusters:
97	26					78	my $res = _generator (\@clusters);
98	26	50				105	if ($spaceAdded)
99							{
100							# Need to remove that leading space from each:
101	0					0	foreach $i (0 .. $#{$res})
	0					0
102							{
103	0					0	$res->[$i] =~ s/^ //o;
104							}
105							}
106	26	50				53	if ($trace)
107							{
108	0					0	map { printMessage ($_) } @$res;
	0					0
109							}
110
111	26					263	return $res;
112							}
113
114							# Given a reference to a list of arrays of strings, C returns reference to an unsorted list
115							# of all strings that can be generated by concatenating together one string from each array in the list.
116
117							sub _generator
118							{
119	399			399		604	my ($a, # Initial parameter
120							$res, $i, $s # Parameters used in recursion
121							) = @_;
122	399	100				894	unless ($res)
123							{
124	26					42	$res = {};
125	26					36	$i = 0;
126	26					39	$s = '';
127							}
128	399	100				424	if ($i > $#{$a})
	399					735
129							{
130	298					583	$res->{$s} = 1;
131							}
132							else
133							{
134	101					221	foreach (@{$a->[$i]})
	101					187
135							{
136	373					851	_generator ($a, $res, $i+1, $s . $_);
137							}
138							}
139	399	100				1320	return if $i > 1;
140	174					199	return [ keys %{$res} ];
	174					929
141							}
142
143
144							# Given an L
145							# (EGC) C<_permute_cluster()> returns a reference to an unsorted array of all unique strings that
146							# are canonically equivalent to the EGC
147							#
148							# returns undef if the parameter is not an EGC, i.e. does not match C.
149
150							# Implemented by brute force evaluation of all permutations so isn't too clever.
151							# Could be made more efficient, but since EGCs are short the inefficiency isn't huge.
152
153							sub _permute_cluster {
154	41			41		71	my ($s, $trace) = @_;
155
156							# make sure argument is an EGC
157	41	50				4545	return undef unless $s =~ /^\X$/;
158
159							# retrieve required data from UnicodeData.txt
160	41	100				92	_getCompositions() unless %cpNonStarterComposites;
161
162	41					600	my %res; # Place to keep result strings (as keys so we eliminate duplicates)
163
164							# compute and save NFD of original -- we'll use it to tell whether a candidate
165							# is canonically equivalent to the original.
166	41					219	my $origNFD = NFD($s);
167
168							# Start with fully decomposed string:
169	41					72	$s = $origNFD;
170	41	50				131	if (length($s) == 1)
171							{
172							# we can short-circuit the computation if the length of the decomposed string == 1
173	0	0				0	if (exists $sSingletonCompositions{$s})
174	0					0	{ return [ $s, $sSingletonCompositions{$s} ]; }
175							else
176	0					0	{ return [ $s ]; }
177							}
178
179							# pick up the base character
180	41					101	my $base = substr($s, 0, 1);
181
182							# Now calculate all permutations of everything else. We'll figure out whether a given
183							# permutation is canonically equivalent to the original in a minute.
184
185	41					43	my %strList;
186	41					50	map { $strList{$base . $_ } = 1} @{_permute(substr($s,1))};
	99					253
	41					107
187
188							# Try every one of the generated permutations of marks:
189	41					176	foreach $s (keys %strList)
190							{
191	99	100				9216	next if NFD($s) ne $origNFD; # Not equivalent to original -- ignore it.
192	57	50				133	next if exists $res{$s}; # Already seen this sequence
193
194							# Now the fun! Generate every possible sequence from $s by composing pairs and singletons:
195
196	57					137	my @work = ( [$s, 0] );
197	57					140	while ($#work >= 0)
198							{
199	485					553	my ($s, $i) = @{pop @work};
	485					728
200	485	50				1574	printMessage ('POP:', $s, $i, length($s)) if $trace;
201	485	100				889	if ($i >= length ($s))
202							{
203							# We've worked our way to the end of the string. At this point we have
204							# some combination of composition and decomposition that should be equivalent
205							# to the original!
206	193					664	$res{$s} = 1; # Here's a keeper!
207							}
208							else
209							{
210	292					578	push @work, [ $s, $i+1 ];
211	292	100				905	if (exists $sSingletonCompositions{substr($s, $i, 1)})
212							{
213							# recompose this singleton and save result for work
214	30	50				57	printMessage("SING at $i:", substr($s, $i, 1), ' -> ', $sSingletonCompositions{substr($s, $i, 1)}) if $trace;
215	30					45	my $s2 = $s;
216	30					81	substr($s2, $i, 1) = $sSingletonCompositions{substr($s, $i, 1)};
217	30					69	push @work, [ $s2, $i ];
218							}
219	292					941	while ($i+1 < length($s))
220							{
221							# Try to combine two chars:
222	205					327	my $s2 = substr($s, $i, 2);
223	205					421	my ($u1, $u2) = unpack ( 'UU', $s2);
224	205		100			865	my $u = getComposite($u1, $u2) \|\| $cpNonStarterComposites{$s2};
225	205	0				368	printMessage ("COMP at $i:", sprintf('%04X',$u1), sprintf('%04X',$u2), '->', defined $u ? sprintf('%04X',$u) : 'undef') if $trace;
		50
226	205	100				697	last unless defined $u;
227	91					156	my $c = pack('U', $u);
228	91					201	substr($s, $i, 2) = $c;
229	91					226	push @work, [$s, $i+1];
230	91	100				370	if (exists $sSingletonCompositions{$c})
231							{
232	15	50				30	printMessage("SING at $i:", $c, '->', $sSingletonCompositions{$c}) if $trace;
233	15					21	my $s2 = $s;
234	15					38	substr($s2, $i, 1) = $sSingletonCompositions{$c};
235	15					86	push @work, [$s2, $i+1];
236							}
237							}
238							}
239							}
240							}
241
242							# All done. Return the results
243	41					343	[ keys(%res) ]
244							}
245
246							# I'm not happy with this hack. unicore/Decomposition.pl explicitly says the code is for internal use
247							# only, but I don't know any other reasonably efficient way to construct lists of Unicode compositions
248							# other than including my own copy of, for example UnicodeData.txt, but then I couldn't guarantee that
249							# my copy was in sync with the local Perl installation. Oh well.
250
251							sub _getCompositions {
252							# Next few lines stolen shamelessly from Unicode::UCD
253	1			1		4169	for (split /^/m, do "unicore/Decomposition.pl") {
254	5679					43011	my ($start, $end, $decomp) = / ^ (.+?) \t (.*?) \t (.+?)
255							\s* ( \# .* )? # Optional comment
256							$ /x;
257	5679	100				13568	$end = $start if $end eq "";
258
259	5679	100				22630	if ($decomp =~ /^([[:xdigit:]]{4,6})$/o) {
		100
260							# Singleton decomposition -- keep a record of these:
261	1023					1477	my $d = $1;
262	1023					1842	foreach my $c (hex($start) .. hex($end)) {
263	1035					4840	$sSingletonCompositions{pack('U', hex($d))} = pack('U', $c); # NB: hash values are strings
264							}
265							}
266							elsif ($decomp =~ /^([[:xdigit:]]{4,6})\s+([[:xdigit:]]{4,6})$/o) {
267							# Possible non-starter decompsition
268	1018					1336	my ($d1, $d2) = map{hex} ($1, $2);
	2036					4041
269	1018					2402	foreach my $c (hex($start) .. hex($end)) {
270	1018	100	100			6914	$cpNonStarterComposites{pack('UU', $d1, $d2)} = $c if getCombinClass($c) \|\| getCombinClass($d1); # NB: hash values are codepoints
271							}
272							}
273							}
274							}
275
276
277							# Given a string, return a reference to an unsorted array containing
278							# all permutations of the string. Does not filter out duplicates which
279							# can result if one or more chars of the string are the same.
280
281							# adaptation of the array permutation algorithm in FAQ 4
282							#(see "How do I permute N elements of a list?")
283							#
284							# I tried to making $list a hash rather than an array so as to eliminate duplicates,
285							# but Perl 5.6.1 had trouble figuring out that some strings were in fact
286							# UTF-8, so some data got munged. A hash would probably work on 5.8.
287
288							sub _permute {
289	248			248		401	my ($src, # initial parameter
290							$res , $list # Parameters used in recursion
291							) = @_;
292	248	100				477	unless ($list)
293							{
294	41					64	$list = {};
295	41					65	$res = '';
296							}
297	248	100				424	unless ($src) {
298	99					227	$list->{$res} = 1;
299							} else {
300	149					188	my($newsrc,$newres,$i);
301	149					300	foreach $i (0 .. length($src)-1) {
302	207					402	$newsrc = $src;
303	207					563	$newres = $res . substr($newsrc, $i, 1, "");
304	207					347	_permute($newsrc, $newres, $list);
305							}
306							}
307							# All done. Return the results
308	248					682	return [ keys %{$list} ];
	248					1092
309							}
310
311							sub printMessage
312							{
313	0			0	0		my $s = join(' ', @_);
314	0						print STDERR encode('ascii', $s, Encode::FB_PERLQQ) . "\n";
315							}
316
317							1;
318
319							=back
320
321							=head1 BUGS
322
323							Uses L. On some systems (e.g. ActiveState 5.6.1) Unicode::Normalize is aware
324							only of Unicode 3.0 and thus de/compositions introduced since Unicode 3.0 will not be used.
325
326							=head1 AUTHOR
327
328							Bob Hallissy
329
330							=head1 COPYRIGHT
331
332							Copyright(C) 2003-2011, SIL International.
333
334							This package is published under the terms of the Perl Artistic License.