File Coverage

blib/lib/Unicode/SetAutomaton.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Unicode::SetAutomaton;
2            
3 2     2   58728 use strict;
  2         8  
  2         81  
4 2     2   12 use warnings;
  2         4  
  2         60  
5 2     2   2216 use Set::IntSpan;
  2         33448  
  2         140  
6 2     2   2451 use Set::IntSpan::Partition;
  0            
  0            
7             use Storable qw(freeze);
8            
9             our $VERSION = '0.01';
10            
11             our @utf8hs = (0x00000000, 0x00000000, 0x0000c080, 0x00e08080, 0xf0808080);
12             our @utf8min = (0x00000000, 0x00000000, 0x0000C280, 0x00E0A080, 0xF0908080);
13             our @utf8max = (0x00000000, 0x0000007F, 0x0000DFBF, 0x00EFBFBF, 0xF48FBFBF);
14            
15             sub _u8enc {
16            
17             my $cp = shift;
18             my $ln = 4;
19            
20             # Encodes code points as utf-8 integers;
21             # for example, U+00F6 becomes 0x0000C3B6
22            
23             return $cp if $cp <= 0x7F;
24            
25             # Spread the bits to their target locations
26             my $ret = (($cp << 0) & 0x0000003f) |
27             (($cp << 2) & 0x00003f00) |
28             (($cp << 4) & 0x003f0000) |
29             (($cp << 6) & 0x3f000000) ;
30            
31             # Count the length
32             $ln -= $cp <= 0xFFFF;
33             $ln -= $cp <= 0x07FF;
34            
35             # Merge the spread bits with the mode bits
36             return $ret | $utf8hs[$ln];
37             }
38            
39             sub _get_info {
40            
41             my $u8 = shift;
42            
43             my $width = 4;
44             $width -= $u8 <= 0xFFFFFF;
45             $width -= $u8 <= 0xFFFF;
46             $width -= $u8 <= 0xFF;
47            
48             my $s1 = ($width - 1) * 8;
49             my $s2 = (4 - $width) * 8;
50             my $ml = 0x00808080 >> $s2;
51             my $mh = 0x00BFBFBF >> $s2;
52             my $xl = 0x00800000 >> $s2;
53             my $xh = 0x00BF0000 >> $s2;
54            
55             # The first byte in the partial utf-8 sequence in u8.
56             my $head = $u8 >> $s1;
57            
58             # All bytes after the first, or zero if there are none.
59             my $tail = ($u8 & $mh);
60            
61             # Indicates whether the first byte after head is 0x80.
62             my $islow = ($u8 & $xh) == $xl;
63            
64             # Indicates whether the first byte after head is 0xBF.
65             my $isupp = ($u8 & $xh) == $xh;
66            
67             # Last partial sequence before sequences with head
68             my $pmax = ($head - 1) << $s1 | $mh;
69            
70             # First partial sequence for sequences with head
71             my $pmin = ($head + 0) << $s1 | $ml;
72            
73             # Last partial sequence for sequences with head
74             my $hmax = ($head + 0) << $s1 | $mh;
75            
76             # First partial sequence after sequences with head
77             my $hmin = ($head + 1) << $s1 | $ml;
78            
79             # There are a few special cases for the min,max items
80             # if the respective head is not a continuation octet.
81             # E.g., for 0xE0 pmin should be E0A080 but is E08080.
82             # The caller handles them indirectly by splitting.
83            
84             my $i = {
85             width => $width, head => $head, tail => $tail,
86             isLow => $islow, isUpp => $isupp, hmin => $hmin,
87             hmax => $hmax, pmin => $pmin, pmax => $pmax,
88             };
89            
90             return $i;
91             }
92            
93             sub _get_next {
94            
95             my $iter = shift;
96             my ($clas, $cmin, $cmax, $nmin, $min, $max);
97            
98             if ($iter->{splitix}) {
99            
100             # When splitting ranges a separate array is used to keep
101             # track of artificial ranges. If there any, use them first.
102             $cmax = $iter->{split}->[ -- $iter->{splitix} ];
103             $cmin = $iter->{split}->[ -- $iter->{splitix} ];
104             $clas = $iter->{split}->[ -- $iter->{splitix} ];
105            
106             } elsif ( $iter->{derivix} <= $iter->{end} ) {
107            
108             # If there are none, we pick a new range from the input.
109             $clas = $iter->{deriv}->[ $iter->{derivix} ++ ];
110             $cmin = $iter->{deriv}->[ $iter->{derivix} ++ ];
111             $cmax = $iter->{deriv}->[ $iter->{derivix} ++ ];
112            
113             } else {
114             return
115             }
116            
117             # Compute various properties of the partial sequences.
118             $min = _get_info($cmin);
119             $max = _get_info($cmax);
120            
121             if ($min->{width} != $max->{width}) {
122            
123             # The range crosses width boundaries, so split it.
124             $nmin = $utf8min[ $min->{width} + 1 ];
125             $iter->{split}->[ $iter->{splitix} ++ ] = $clas;
126             $iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
127             $iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
128             $cmax = $utf8max[ $min->{width} ];
129             }
130            
131             if ($cmin >= 0x00eda080 and $cmax <= 0x00edbfbf) {
132            
133             # The current range contains only surrogate code points
134             # which are not allowed. So get the next range, if any.
135             return _get_next( $iter );
136            
137             } elsif ($cmin >= 0x00eda080 and $cmin <= 0x00edbfbf) {
138            
139             # cmin is somewhere inside the surrogate range and cmax
140             # is not. So we set cmin to the first non-surrogate.
141             $cmin = 0x00ee8080;
142            
143             } elsif ($cmax >= 0x00eda080 and $cmax <= 0x00edbfbf) {
144            
145             # cmax is somewhere inside the surrogate range and cmin
146             # is not. So we set cmax to the last non-surrogate.
147             $cmax = 0x00ed9fbf;
148            
149             } elsif ($cmin < 0x00eda080 and $cmax > 0x00edbfbf) {
150            
151             # The range includes code points before and after the
152             # surrogate range. So we have to split it into two.
153             $nmin = 0x00ee8080;
154             $iter->{split}->[ $iter->{splitix} ++ ] = $clas;
155             $iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
156             $iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
157             $cmax = 0x00ed9fbf;
158             }
159            
160             # cmin and cmax may have changed so recompute the info
161             $min = _get_info($cmin);
162             $max = _get_info($cmax);
163            
164             if (!($min->{head} == $max->{head}) && !($min->{isLow} && $max->{isUpp})) {
165            
166             if ($min->{isLow}) {
167            
168             # min is a lower and but max is not an upper end, so we split
169             # the range into two, one going from min to "one" before max,
170             # and the other going from the beginning of max's range to max.
171             $nmin = $max->{pmin};
172             $iter->{split}->[ $iter->{splitix} ++ ] = $clas;
173             $iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
174             $iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
175             $cmax = $max->{pmax};
176            
177             } else {
178            
179             # if the heads are different and min is not a lower end then
180             # we have to complete min's range first, so split the range.
181             $nmin = $min->{hmin};
182             $iter->{split}->[ $iter->{splitix} ++ ] = $clas;
183             $iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
184             $iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
185             $cmax = $min->{hmax};
186            
187             }
188             }
189            
190             # cmax may have changed so recompute the info
191             $max = _get_info($cmax);
192            
193             return $clas, $min, $max;
194             }
195            
196             sub _triples_to_dfa {
197            
198             my @triples = @_;
199             my (@d, @todo, $d2s, $s2d);
200            
201             # The deriv array stores all character class, or rather, utf-8 range
202             # information. Each class stores the length of the subsequent data as
203             # first item. For classes representing end states the next value is
204             # the number of the class. For other classes a list of
205             # triples follows. Array references and other structures could be
206             # used instead, however this structure mirrors the C implementation.
207            
208             my @deriv = ( scalar(@triples), @triples );
209             my $lengthix = scalar @deriv;
210             my $nextix = scalar @deriv + 1;
211             my $nextnum = 0;
212             my $start = $nextnum++;
213             my $obj2num = { freeze(\@triples), $start };
214            
215             $deriv[$lengthix] = 0;
216            
217             push @todo, [ $start, $start ];
218            
219             while (@todo) {
220            
221             my ($index, $currentS) = @{ pop @todo };
222            
223             # A special iterator is used to go over the utf-8 ranges in a class.
224             # The basic idea of this algorithm is to compute all derivatives of
225             # a given class of utf-8 ranges; the iterator automatically splits
226             # these ranges such that we can take the "heads" of a range as label
227             # for a transition, and the "tails" as range for the next class.
228            
229             # As an example, consider a simple class from U+00E4 to U+00F6. The
230             # range is utf-8 encoded to 0x0000C3A4 .. 0x0000C3B6. Here the head
231             # range would be 0xC3 .. 0xC3 and the tail range 0xA4 .. 0xB6. Then
232             # -- 0xC3 .. 0xC3 --> <1>, <1> -- 0xA4 .. 0xB6 --> would be
233             # the automaton. See the _get_next routine for details on splitting.
234            
235             my $iter = {
236             split => [],
237             splitix => 0,
238             deriv => \@deriv,
239             derivix => $index + 1,
240             end => $index + $deriv[$index],
241             };
242            
243             my $prev;
244            
245             while (1) {
246            
247             my $success = my ($cls, $min, $max) = _get_next($iter);
248            
249             # If there are no more ranges in the current class, or if the latest
250             # head range is not equal to the previous one, we found the end of
251             # a new class. Note that head ranges can only be the same if min,max
252             # are equal for both the previous and the current range, so we only
253             # have to check the two min values.
254             if ((not $success) or (defined $prev && $prev->[1] ne $min->{head})) {
255            
256             # The hash table obj2num is used to ensure that the same classes
257             # are assigned the same state number; not doing so would result
258             # an automaton that is not minimal in the number of states, and
259             # minimizing it later would be considerably more costly.
260            
261             my @new = @deriv[($lengthix + 1) .. ($lengthix + $deriv[$lengthix])];
262             my $ice = freeze(\@new);
263             my $num = $obj2num->{$ice};
264            
265             if (not defined $num) {
266             $num = $obj2num->{$ice} = $nextnum++;
267            
268             # End states store only the number of the associated class;
269             # end states do not have outgoing transitions so we do not
270             # add them to the todo list. Other classes are added to it.
271            
272             if ($deriv[$lengthix] > 1) {
273             push @todo, [$lengthix, $num]
274             } else {
275             $s2d->{ $num } = $deriv[$lengthix + 1];
276             $d2s->[ $deriv[$lengthix + 1] ] = $num;
277             }
278             }
279            
280             # Record the newly found transition in the transition table
281             # as four-tuple .
282             push @d, [ $currentS, $prev->[1], $prev->[2], $num ];
283            
284             $lengthix = $nextix++;
285             $deriv[$lengthix] = 0;
286             last unless $success;
287             }
288            
289             # min and max always have the same width. If the width of the
290             # current range is one then we are moving to an end state. If
291             # it is greater than one, we are creating a new partial class.
292            
293             if ($min->{width} != 1) {
294             $deriv[ $nextix++ ] = $cls;
295             $deriv[ $nextix++ ] = $min->{tail};
296             $deriv[ $nextix++ ] = $max->{tail};
297             $deriv[ $lengthix ] += 3;
298             } else {
299             $deriv[ $nextix++ ] = $cls;
300             $deriv[ $lengthix ] += 1;
301             }
302            
303             $prev = [ $cls, $min->{head}, $max->{head} ]
304             }
305             }
306            
307             return $start, \@d, $d2s, $s2d;
308             }
309            
310             sub new {
311             my $class = shift;
312             my %param = @_;
313             my $self = bless { }, $class;
314             my @input = @{$param{classes}};
315             my @spans;
316            
317             # A deterministic finite automaton can only be in a single state
318             # at a time, so split the input classes minimally such that each
319             # code point belongs to at most a single class, not multiple ones.
320             my @disjoint = intspan_partition(@input);
321            
322             # intspan_partition unfortunately does not keep track of how it
323             # splits classes; we'd like to know, so restore the information.
324             for (my $i = 0; $i <= $#disjoint; $i++) {
325             for (my $j = 0; $j <= $#input; $j++) {
326             next unless $disjoint[$i]->subset($input[$j]);
327             push @{$self->{disjoint_to_input}->[$i]}, $j;
328             }
329             }
330            
331             # The construction algorithm considers all spans at once, so we
332             # collect all into a single array, noting where each belongs.
333             for (my $i = 0; $i <= $#disjoint; $i++) {
334             foreach my $span ($disjoint[$i]->spans) {
335             push @spans, [ $i, @$span ];
336             }
337             }
338            
339             # While not strictly necessary, it is better to sort the spans,
340             # so we do that here. Note that spans are disjoint, so we only
341             # have to compare the relevant minimum value for each span pair.
342             my @sorted = sort { $a->[1] <=> $b->[1] } @spans;
343            
344             # Now we can generate a single list for all
345             # triples where min and max are utf-8 integers. It is easier to
346             # do this here then telling a complete class apart from partial
347             # classes generated later; the spans are not array references
348             # mainly because that mirrors the C implementation more closely.
349             my @u8triples = map {
350             $_->[0], _u8enc($_->[1]), _u8enc($_->[2])
351             } @sorted;
352            
353             my ($start, $d, $d2s, $s2d) = _triples_to_dfa(@u8triples);
354            
355             $self->{state_to_disjoint} = $s2d;
356             $self->{disjoint_to_state} = $d2s;
357             $self->{disjoint_classes} = \@disjoint;
358             $self->{input_classes} = \@input;
359             $self->{start_state} = $start;
360             $self->{transitions} = $d;
361            
362             return $self;
363             }
364            
365             sub _regex_append {
366             my $node = shift;
367             my $type = shift;
368            
369             if (UNIVERSAL::isa($node, 'Set::IntSpan')) {
370            
371             if ($node->size == 1) {
372             $_[0] .= sprintf "\\x%02x", $node->elements
373            
374             } else {
375             $_[0] .= "[";
376            
377             foreach my $span ($node->spans) {
378             if ($span->[0] == $span->[1]) {
379             $_[0] .= sprintf "\\x%02x", $span->[0]
380             } else {
381             $_[0] .= sprintf "\\x%02x-\\x%02x", @$span
382             }
383             }
384            
385             $_[0] .= "]";
386             }
387            
388             } elsif ($node->[0] eq 'Group') {
389             _regex_append($node->[1], 'Group', $_[0]);
390             _regex_append($node->[2], 'Group', $_[0]);
391            
392             } elsif ($node->[0] eq 'Choice' and $type eq 'Group') {
393             $_[0] .= "(";
394             _regex_append($node->[1], 'Choice', $_[0]);
395             $_[0] .= "|";
396             _regex_append($node->[2], 'Choice', $_[0]);
397             $_[0] .= ")";
398            
399             } elsif ($node->[0] eq 'Choice') {
400             _regex_append($node->[1], 'Choice', $_[0]);
401             $_[0] .= "|";
402             _regex_append($node->[2], 'Choice', $_[0]);
403            
404             } else {
405             die
406             }
407            
408             }
409            
410             sub as_expressions {
411             my $self = shift;
412             my $last = 0;
413             my @m;
414            
415             require Graph::Directed;
416             my $g = Graph::Directed->new;
417            
418             # Convert the transitions into a matrix using Set::IntSpan objects
419             # to represent byte classes and use a graph to keep track of the
420             # predecessors and successors of each state. Would be nice if the
421             # Set::IntSpan::union method accepted undef as set to avoid the if.
422            
423             foreach my $transition (@{ $self->{transitions} }) {
424             my ($src, $min, $max, $dst) = @$transition;
425            
426             if (defined $m[$src][$dst]) {
427             $m[$src][$dst] = $m[$src][$dst]->union([[$min,$max]]);
428             } else {
429             $m[$src][$dst] = Set::IntSpan->new([[$min,$max]]);
430             }
431            
432             $g->add_edge($src, $dst);
433             $last = $dst > $last ? $dst : $last;
434             }
435            
436             # States will be eliminated in the reverse order of their creation.
437             # I am unsure if that produces the best result but could so far not
438             # find counter-examples. A more elaborate algorithm would make sure
439             # a state is removed before any, if that is possible, that must be
440             # visited before or after when going from start state to final state.
441            
442             my @order = grep {
443            
444             # We only remove a state if it is neither the start state nor
445             # a final state. $self->{state_to_disjoint} holds final ones.
446            
447             $_ != $self->{start_state} and
448             not exists $self->{state_to_disjoint}->{$_}
449            
450             } (0 .. $last);
451            
452             while (@order) {
453            
454             my $curr = pop @order;
455             my @pred = $g->predecessors($curr);
456             my @succ = $g->successors($curr);
457            
458             # A state is eliminated by connecting all predecessors with all
459             # successors by an increasingly complex regular expression. We
460             # store the regular expression as a binary tree to ease adding
461             # needed braces later. Note that the transition graph does not
462             # have cycles, otherwise we would have to encode the cycle too.
463            
464             foreach my $pred (@pred) {
465             foreach my $succ (@succ) {
466            
467             my $group = [ Group => $m[$pred][$curr], $m[$curr][$succ] ];
468             if ($m[$pred][$succ]) {
469             $m[$pred][$succ] = [ Choice => $m[$pred][$succ], $group ];
470             } else {
471             $m[$pred][$succ] = $group;
472             $g->add_edge($pred, $succ);
473             }
474             }
475             }
476            
477             $g->delete_vertex($curr);
478             }
479            
480             # Now the matrix has a regular expression for each of the dis-
481             # joint classes at m[start_state][final_state]. We iterate over
482             # the disjoint classes, pretty print the expression, and return
483             # them in the order of the disjoint classes.
484            
485             my @expressions;
486            
487             for (my $i = 0; $i <= $#{ $self->{disjoint_to_state} }; $i++) {
488             my $final = $self->{disjoint_to_state}->[$i];
489             my $regex = "";
490             _regex_append($m[$self->{start_state}][$final], 'Root', $regex);
491             push @expressions, $regex;
492             }
493            
494             return @expressions;
495             }
496            
497             1;
498            
499             __END__