File Coverage

blib/lib/Unicode/SetAutomaton.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Unicode::SetAutomaton;
2
3	2	2	58728	use strict;
	2		8
	2		81
4	2	2	12	use warnings;
	2		4
	2		60
5	2	2	2216	use Set::IntSpan;
	2		33448
	2		140
6	2	2	2451	use Set::IntSpan::Partition;
	0
	0
7				use Storable qw(freeze);
8
9				our $VERSION = '0.01';
10
11				our @utf8hs = (0x00000000, 0x00000000, 0x0000c080, 0x00e08080, 0xf0808080);
12				our @utf8min = (0x00000000, 0x00000000, 0x0000C280, 0x00E0A080, 0xF0908080);
13				our @utf8max = (0x00000000, 0x0000007F, 0x0000DFBF, 0x00EFBFBF, 0xF48FBFBF);
14
15				sub _u8enc {
16
17				my $cp = shift;
18				my $ln = 4;
19
20				# Encodes code points as utf-8 integers;
21				# for example, U+00F6 becomes 0x0000C3B6
22
23				return $cp if $cp <= 0x7F;
24
25				# Spread the bits to their target locations
26				my $ret = (($cp << 0) & 0x0000003f) \|
27				(($cp << 2) & 0x00003f00) \|
28				(($cp << 4) & 0x003f0000) \|
29				(($cp << 6) & 0x3f000000) ;
30
31				# Count the length
32				$ln -= $cp <= 0xFFFF;
33				$ln -= $cp <= 0x07FF;
34
35				# Merge the spread bits with the mode bits
36				return $ret \| $utf8hs[$ln];
37				}
38
39				sub _get_info {
40
41				my $u8 = shift;
42
43				my $width = 4;
44				$width -= $u8 <= 0xFFFFFF;
45				$width -= $u8 <= 0xFFFF;
46				$width -= $u8 <= 0xFF;
47
48				my $s1 = ($width - 1) * 8;
49				my $s2 = (4 - $width) * 8;
50				my $ml = 0x00808080 >> $s2;
51				my $mh = 0x00BFBFBF >> $s2;
52				my $xl = 0x00800000 >> $s2;
53				my $xh = 0x00BF0000 >> $s2;
54
55				# The first byte in the partial utf-8 sequence in u8.
56				my $head = $u8 >> $s1;
57
58				# All bytes after the first, or zero if there are none.
59				my $tail = ($u8 & $mh);
60
61				# Indicates whether the first byte after head is 0x80.
62				my $islow = ($u8 & $xh) == $xl;
63
64				# Indicates whether the first byte after head is 0xBF.
65				my $isupp = ($u8 & $xh) == $xh;
66
67				# Last partial sequence before sequences with head
68				my $pmax = ($head - 1) << $s1 \| $mh;
69
70				# First partial sequence for sequences with head
71				my $pmin = ($head + 0) << $s1 \| $ml;
72
73				# Last partial sequence for sequences with head
74				my $hmax = ($head + 0) << $s1 \| $mh;
75
76				# First partial sequence after sequences with head
77				my $hmin = ($head + 1) << $s1 \| $ml;
78
79				# There are a few special cases for the min,max items
80				# if the respective head is not a continuation octet.
81				# E.g., for 0xE0 pmin should be E0A080 but is E08080.
82				# The caller handles them indirectly by splitting.
83
84				my $i = {
85				width => $width, head => $head, tail => $tail,
86				isLow => $islow, isUpp => $isupp, hmin => $hmin,
87				hmax => $hmax, pmin => $pmin, pmax => $pmax,
88				};
89
90				return $i;
91				}
92
93				sub _get_next {
94
95				my $iter = shift;
96				my ($clas, $cmin, $cmax, $nmin, $min, $max);
97
98				if ($iter->{splitix}) {
99
100				# When splitting ranges a separate array is used to keep
101				# track of artificial ranges. If there any, use them first.
102				$cmax = $iter->{split}->[ -- $iter->{splitix} ];
103				$cmin = $iter->{split}->[ -- $iter->{splitix} ];
104				$clas = $iter->{split}->[ -- $iter->{splitix} ];
105
106				} elsif ( $iter->{derivix} <= $iter->{end} ) {
107
108				# If there are none, we pick a new range from the input.
109				$clas = $iter->{deriv}->[ $iter->{derivix} ++ ];
110				$cmin = $iter->{deriv}->[ $iter->{derivix} ++ ];
111				$cmax = $iter->{deriv}->[ $iter->{derivix} ++ ];
112
113				} else {
114				return
115				}
116
117				# Compute various properties of the partial sequences.
118				$min = _get_info($cmin);
119				$max = _get_info($cmax);
120
121				if ($min->{width} != $max->{width}) {
122
123				# The range crosses width boundaries, so split it.
124				$nmin = $utf8min[ $min->{width} + 1 ];
125				$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
126				$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
127				$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
128				$cmax = $utf8max[ $min->{width} ];
129				}
130
131				if ($cmin >= 0x00eda080 and $cmax <= 0x00edbfbf) {
132
133				# The current range contains only surrogate code points
134				# which are not allowed. So get the next range, if any.
135				return _get_next( $iter );
136
137				} elsif ($cmin >= 0x00eda080 and $cmin <= 0x00edbfbf) {
138
139				# cmin is somewhere inside the surrogate range and cmax
140				# is not. So we set cmin to the first non-surrogate.
141				$cmin = 0x00ee8080;
142
143				} elsif ($cmax >= 0x00eda080 and $cmax <= 0x00edbfbf) {
144
145				# cmax is somewhere inside the surrogate range and cmin
146				# is not. So we set cmax to the last non-surrogate.
147				$cmax = 0x00ed9fbf;
148
149				} elsif ($cmin < 0x00eda080 and $cmax > 0x00edbfbf) {
150
151				# The range includes code points before and after the
152				# surrogate range. So we have to split it into two.
153				$nmin = 0x00ee8080;
154				$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
155				$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
156				$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
157				$cmax = 0x00ed9fbf;
158				}
159
160				# cmin and cmax may have changed so recompute the info
161				$min = _get_info($cmin);
162				$max = _get_info($cmax);
163
164				if (!($min->{head} == $max->{head}) && !($min->{isLow} && $max->{isUpp})) {
165
166				if ($min->{isLow}) {
167
168				# min is a lower and but max is not an upper end, so we split
169				# the range into two, one going from min to "one" before max,
170				# and the other going from the beginning of max's range to max.
171				$nmin = $max->{pmin};
172				$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
173				$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
174				$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
175				$cmax = $max->{pmax};
176
177				} else {
178
179				# if the heads are different and min is not a lower end then
180				# we have to complete min's range first, so split the range.
181				$nmin = $min->{hmin};
182				$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
183				$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
184				$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
185				$cmax = $min->{hmax};
186
187				}
188				}
189
190				# cmax may have changed so recompute the info
191				$max = _get_info($cmax);
192
193				return $clas, $min, $max;
194				}
195
196				sub _triples_to_dfa {
197
198				my @triples = @_;
199				my (@d, @todo, $d2s, $s2d);
200
201				# The deriv array stores all character class, or rather, utf-8 range
202				# information. Each class stores the length of the subsequent data as
203				# first item. For classes representing end states the next value is
204				# the number of the class. For other classes a list of
205				# triples follows. Array references and other structures could be
206				# used instead, however this structure mirrors the C implementation.
207
208				my @deriv = ( scalar(@triples), @triples );
209				my $lengthix = scalar @deriv;
210				my $nextix = scalar @deriv + 1;
211				my $nextnum = 0;
212				my $start = $nextnum++;
213				my $obj2num = { freeze(\@triples), $start };
214
215				$deriv[$lengthix] = 0;
216
217				push @todo, [ $start, $start ];
218
219				while (@todo) {
220
221				my ($index, $currentS) = @{ pop @todo };
222
223				# A special iterator is used to go over the utf-8 ranges in a class.
224				# The basic idea of this algorithm is to compute all derivatives of
225				# a given class of utf-8 ranges; the iterator automatically splits
226				# these ranges such that we can take the "heads" of a range as label
227				# for a transition, and the "tails" as range for the next class.
228
229				# As an example, consider a simple class from U+00E4 to U+00F6. The
230				# range is utf-8 encoded to 0x0000C3A4 .. 0x0000C3B6. Here the head
231				# range would be 0xC3 .. 0xC3 and the tail range 0xA4 .. 0xB6. Then
232				# ~~-- 0xC3 .. 0xC3 --> <1>, <1> -- 0xA4 .. 0xB6 --> would be~~
233				# the automaton. See the _get_next routine for details on splitting.
234
235				my $iter = {
236				split => [],
237				splitix => 0,
238				deriv => \@deriv,
239				derivix => $index + 1,
240				end => $index + $deriv[$index],
241				};
242
243				my $prev;
244
245				while (1) {
246
247				my $success = my ($cls, $min, $max) = _get_next($iter);
248
249				# If there are no more ranges in the current class, or if the latest
250				# head range is not equal to the previous one, we found the end of
251				# a new class. Note that head ranges can only be the same if min,max
252				# are equal for both the previous and the current range, so we only
253				# have to check the two min values.
254				if ((not $success) or (defined $prev && $prev->[1] ne $min->{head})) {
255
256				# The hash table obj2num is used to ensure that the same classes
257				# are assigned the same state number; not doing so would result
258				# an automaton that is not minimal in the number of states, and
259				# minimizing it later would be considerably more costly.
260
261				my @new = @deriv[($lengthix + 1) .. ($lengthix + $deriv[$lengthix])];
262				my $ice = freeze(\@new);
263				my $num = $obj2num->{$ice};
264
265				if (not defined $num) {
266				$num = $obj2num->{$ice} = $nextnum++;
267
268				# End states store only the number of the associated class;
269				# end states do not have outgoing transitions so we do not
270				# add them to the todo list. Other classes are added to it.
271
272				if ($deriv[$lengthix] > 1) {
273				push @todo, [$lengthix, $num]
274				} else {
275				$s2d->{ $num } = $deriv[$lengthix + 1];
276				$d2s->[ $deriv[$lengthix + 1] ] = $num;
277				}
278				}
279
280				# Record the newly found transition in the transition table
281				# as four-tuple .
282				push @d, [ $currentS, $prev->[1], $prev->[2], $num ];
283
284				$lengthix = $nextix++;
285				$deriv[$lengthix] = 0;
286				last unless $success;
287				}
288
289				# min and max always have the same width. If the width of the
290				# current range is one then we are moving to an end state. If
291				# it is greater than one, we are creating a new partial class.
292
293				if ($min->{width} != 1) {
294				$deriv[ $nextix++ ] = $cls;
295				$deriv[ $nextix++ ] = $min->{tail};
296				$deriv[ $nextix++ ] = $max->{tail};
297				$deriv[ $lengthix ] += 3;
298				} else {
299				$deriv[ $nextix++ ] = $cls;
300				$deriv[ $lengthix ] += 1;
301				}
302
303				$prev = [ $cls, $min->{head}, $max->{head} ]
304				}
305				}
306
307				return $start, \@d, $d2s, $s2d;
308				}
309
310				sub new {
311				my $class = shift;
312				my %param = @_;
313				my $self = bless { }, $class;
314				my @input = @{$param{classes}};
315				my @spans;
316
317				# A deterministic finite automaton can only be in a single state
318				# at a time, so split the input classes minimally such that each
319				# code point belongs to at most a single class, not multiple ones.
320				my @disjoint = intspan_partition(@input);
321
322				# intspan_partition unfortunately does not keep track of how it
323				# splits classes; we'd like to know, so restore the information.
324				for (my $i = 0; $i <= $#disjoint; $i++) {
325				for (my $j = 0; $j <= $#input; $j++) {
326				next unless $disjoint[$i]->subset($input[$j]);
327				push @{$self->{disjoint_to_input}->[$i]}, $j;
328				}
329				}
330
331				# The construction algorithm considers all spans at once, so we
332				# collect all into a single array, noting where each belongs.
333				for (my $i = 0; $i <= $#disjoint; $i++) {
334				foreach my $span ($disjoint[$i]->spans) {
335				push @spans, [ $i, @$span ];
336				}
337				}
338
339				# While not strictly necessary, it is better to sort the spans,
340				# so we do that here. Note that spans are disjoint, so we only
341				# have to compare the relevant minimum value for each span pair.
342				my @sorted = sort { $a->[1] <=> $b->[1] } @spans;
343
344				# Now we can generate a single list for all
345				# triples where min and max are utf-8 integers. It is easier to
346				# do this here then telling a complete class apart from partial
347				# classes generated later; the spans are not array references
348				# mainly because that mirrors the C implementation more closely.
349				my @u8triples = map {
350				$_->[0], _u8enc($_->[1]), _u8enc($_->[2])
351				} @sorted;
352
353				my ($start, $d, $d2s, $s2d) = _triples_to_dfa(@u8triples);
354
355				$self->{state_to_disjoint} = $s2d;
356				$self->{disjoint_to_state} = $d2s;
357				$self->{disjoint_classes} = \@disjoint;
358				$self->{input_classes} = \@input;
359				$self->{start_state} = $start;
360				$self->{transitions} = $d;
361
362				return $self;
363				}
364
365				sub _regex_append {
366				my $node = shift;
367				my $type = shift;
368
369				if (UNIVERSAL::isa($node, 'Set::IntSpan')) {
370
371				if ($node->size == 1) {
372				$_[0] .= sprintf "\\x%02x", $node->elements
373
374				} else {
375				$_[0] .= "[";
376
377				foreach my $span ($node->spans) {
378				if ($span->[0] == $span->[1]) {
379				$_[0] .= sprintf "\\x%02x", $span->[0]
380				} else {
381				$_[0] .= sprintf "\\x%02x-\\x%02x", @$span
382				}
383				}
384
385				$_[0] .= "]";
386				}
387
388				} elsif ($node->[0] eq 'Group') {
389				_regex_append($node->[1], 'Group', $_[0]);
390				_regex_append($node->[2], 'Group', $_[0]);
391
392				} elsif ($node->[0] eq 'Choice' and $type eq 'Group') {
393				$_[0] .= "(";
394				_regex_append($node->[1], 'Choice', $_[0]);
395				$_[0] .= "\|";
396				_regex_append($node->[2], 'Choice', $_[0]);
397				$_[0] .= ")";
398
399				} elsif ($node->[0] eq 'Choice') {
400				_regex_append($node->[1], 'Choice', $_[0]);
401				$_[0] .= "\|";
402				_regex_append($node->[2], 'Choice', $_[0]);
403
404				} else {
405				die
406				}
407
408				}
409
410				sub as_expressions {
411				my $self = shift;
412				my $last = 0;
413				my @m;
414
415				require Graph::Directed;
416				my $g = Graph::Directed->new;
417
418				# Convert the transitions into a matrix using Set::IntSpan objects
419				# to represent byte classes and use a graph to keep track of the
420				# predecessors and successors of each state. Would be nice if the
421				# Set::IntSpan::union method accepted undef as set to avoid the if.
422
423				foreach my $transition (@{ $self->{transitions} }) {
424				my ($src, $min, $max, $dst) = @$transition;
425
426				if (defined $m[$src][$dst]) {
427				$m[$src][$dst] = $m[$src][$dst]->union([[$min,$max]]);
428				} else {
429				$m[$src][$dst] = Set::IntSpan->new([[$min,$max]]);
430				}
431
432				$g->add_edge($src, $dst);
433				$last = $dst > $last ? $dst : $last;
434				}
435
436				# States will be eliminated in the reverse order of their creation.
437				# I am unsure if that produces the best result but could so far not
438				# find counter-examples. A more elaborate algorithm would make sure
439				# a state is removed before any, if that is possible, that must be
440				# visited before or after when going from start state to final state.
441
442				my @order = grep {
443
444				# We only remove a state if it is neither the start state nor
445				# a final state. $self->{state_to_disjoint} holds final ones.
446
447				$_ != $self->{start_state} and
448				not exists $self->{state_to_disjoint}->{$_}
449
450				} (0 .. $last);
451
452				while (@order) {
453
454				my $curr = pop @order;
455				my @pred = $g->predecessors($curr);
456				my @succ = $g->successors($curr);
457
458				# A state is eliminated by connecting all predecessors with all
459				# successors by an increasingly complex regular expression. We
460				# store the regular expression as a binary tree to ease adding
461				# needed braces later. Note that the transition graph does not
462				# have cycles, otherwise we would have to encode the cycle too.
463
464				foreach my $pred (@pred) {
465				foreach my $succ (@succ) {
466
467				my $group = [ Group => $m[$pred][$curr], $m[$curr][$succ] ];
468				if ($m[$pred][$succ]) {
469				$m[$pred][$succ] = [ Choice => $m[$pred][$succ], $group ];
470				} else {
471				$m[$pred][$succ] = $group;
472				$g->add_edge($pred, $succ);
473				}
474				}
475				}
476
477				$g->delete_vertex($curr);
478				}
479
480				# Now the matrix has a regular expression for each of the dis-
481				# joint classes at m[start_state][final_state]. We iterate over
482				# the disjoint classes, pretty print the expression, and return
483				# them in the order of the disjoint classes.
484
485				my @expressions;
486
487				for (my $i = 0; $i <= $#{ $self->{disjoint_to_state} }; $i++) {
488				my $final = $self->{disjoint_to_state}->[$i];
489				my $regex = "";
490				_regex_append($m[$self->{start_state}][$final], 'Root', $regex);
491				push @expressions, $regex;
492				}
493
494				return @expressions;
495				}
496
497				1;
498
499				__END__