File Coverage

blib/lib/Regexp/ERE.pm

Criterion	Covered	Total	%
statement	1222	1354	90.2
branch	508	606	83.8
condition	231	289	79.9
subroutine	55	60	91.6
pod	22	41	53.6
total	2038	2350	86.7

line	stmt	bran	cond	sub	pod	time	code
1	7			7		7788	use 5.008008;
	7					24
	7					312
2	7			7		37	use strict;
	7					10
	7					196
3	7			7		34	use warnings;
	7					25
	7					187
4	7			7		6640	use integer;
	7					70
	7					36
5
6							package Regexp::ERE;
7							our $VERSION = '0.02';
8
9							BEGIN {
10	7			7		461	use Exporter ();
	7					12
	7					521
11	7			7		16	our (@ISA, @EXPORT_OK);
12	7					120	@ISA = qw(Exporter);
13	7					522	@EXPORT_OK = qw(
14							&ere_to_nfa
15							&ere_to_tree
16							&ere_to_regex
17							&ere_to_input_constraints
18							&nfa_to_tree
19							&nfa_to_regex
20							&nfa_to_input_constraints
21							&nfa_clone
22							&nfa_concat
23							&nfa_union
24							&nfa_inter
25							&nfa_match
26							&nfa_quant
27							&nfa_isomorph
28							&nfa_to_dfa
29							&dfa_to_min_dfa
30							&nfa_to_min_dfa
31							&tree_to_regex
32							&tree_to_input_constraints
33							&char_to_cc
34							&interval_list_to_cc
35							&cc_union
36							);
37							}
38
39							=encoding utf8
40
41							=head1 NAME
42
43							Regexp::ERE - extended regular expressions and finite automata
44
45							=head1 SYNOPSIS
46
47							use Regexp::ERE qw(
48							&ere_to_nfa
49							&nfa_inter
50							&nfa_to_regex
51							&nfa_to_input_constraints
52							&nfa_to_dfa
53							&dfa_to_min_dfa
54							);
55
56							# condition 1: begins with abc or def
57							my $nfa1 = ere_to_nfa('^(abc\|def)');
58
59							# condition 2: ends with 123 or 456
60							my $nfa2 = ere_to_nfa('(123\|456)$');
61
62							# condition 1 and condition 2
63							my $inter_nfa = nfa_inter($nfa1, $nfa2);
64
65							# compute extended regular expression (string)
66							my $ere = nfa_to_regex($inter_nfa);
67
68							# compute perl regular expression
69							my $perlre = nfa_to_regex($inter_nfa, 1);
70
71							# compute weaker input constraints suitable for widgets
72							my ($input_constraints, $split_perlre)
73							= nfa_to_input_constraints($inter_nfa);
74
75							# minimal dfa (simpler regular expression happens to result)
76							my $nfa3 = ere_to_nfa('^(a\|ab\|b)*$');
77							my $dfa3 = nfa_to_dfa($nfa3);
78							my $min_dfa3 = dfa_to_min_dfa($dfa3);
79							my $ere3 = nfa_to_regex($min_dfa3);
80
81							=head1 DESCRIPTION
82
83							Pure-perl module for:
84
85							=over 4
86
87							=item *
88
89							Parsing POSIX Extended Regular Expressions (C<$ere>) into
90							Non-Deterministic Finite Automata (C<$nfa>)
91
92							=item *
93
94							Manipulating C<$nfa>s (concatenating, or-ing, and-ing)
95
96							=item *
97
98							Computing Deterministic Finite Automata (C<$dfa>s) from C<$nfa>s
99							(powerset construction)
100
101							=item *
102
103							Computing minimal C<$dfa>s from C<$dfa>s (Hopcroft's algorithm)
104
105							=item *
106
107							Computing C<$ere>s or Perl Regular Expressions from C<$nfa> or C<$dfa>
108							(Warshall algorithm)
109
110							=item *
111
112							Heuristically deriving (possibly weaker) constraints from a C<$nfa> or C<$dfa>
113							suitable for display in a graphical user interface,
114							i.e. a sequence of widgets of type 'free text' and 'drop down';
115
116							Example: '^(abc\|def)' => $nfa => [['abc', 'def'], 'free text']
117
118							=back
119
120							=head1 GLOSSARY AND CONVERSIONS OVERVIEW
121
122							=head2 Conversions overview
123
124							$ere -> $nfa -> $tree -> $regex ($ere or $perlre)
125							-> $input_constraints
126
127							The second argument of -> $regex conversions is an optional boolean,
128							true : conversion to a compiled perl regular expression
129							false: conversion to an ere string
130
131							The -> $input_constraints conversions return a pair (
132							$input_constraints: aref as described at tree_to_input_constraints()
133							$split_perlre : a compiled perl regular expression
134							)
135
136
137							=head2 Glossary
138
139							=over 4
140
141							=item $char_class
142
143							A set of unicode characters.
144
145							=item $ere
146
147							Extended regular expression (string).
148							See C<ere_to_nfa($ere)> for the exact syntax.
149
150							=item $perlre
151
152							Perl regular expression
153
154							=item $nfa
155
156							Non-deterministic finite automaton
157
158							=item $dfa
159
160							Deterministic finite automaton (special case of C<$nfa>)
161
162							=item $tree
163
164							Intermediate hierarchical representation of a regular expression
165							(which still can be manipulated before stringification),
166							similar to a parse tree (but used for generating, not for parsing).
167
168							=item $input_constraints
169
170							Ad-hoc data structure representing a list of gui-widgets
171							(free text fields and drop-down lists),
172							a helper for entering inputs
173							conforming to a given C<$nfa>.
174
175							=back
176
177							=cut
178
179
180							##############################################################################
181							# Config
182							##############################################################################
183
184							# If true, nfa_to_tree() always expands concatenated alternations.
185							# Example: (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
186							our $TREE_CONCAT_FULL_EXPAND = 0;
187
188							# If true, prefixes and suffixes are factorized out even for
189							# trees with a single alternation.
190							# Example: (a1b\|a2b) -> a(1\|2)b
191							our $FULL_FACTORIZE_FIXES = 0;
192
193							# Should be 0. Else, traces nfa_to_tree() on STDERR.
194							use constant {
195	7					602	TRACE_NFA_TO_TREE => 0
196	7			7		37	};
	7					21
197
198							use constant {
199	7					2406	MAX_CHAR => 0x10FFFF
200							, CHAR_CLASS => 'cc' # for blessing $char_classes (label only, no methods)
201	7			7		33	};
	7					15
202
203
204							=head1 DATA STRUCTURES AND SUBROUTINES
205
206							Each of the documented subroutines can be imported,
207							for instance C<use Regexp::ERE qw(&ere_to_nfa &nfa_match);>.
208
209							=cut
210
211
212							##############################################################################
213							# $char_class
214							##############################################################################
215
216							=head2 Character class
217
218
219							WARNING: C<$char_class>es must be created exclusively by
220							char_to_cc()
221							or interval_list_to_cc()
222							for equivalent character classes to be always the same array reference.
223							For the same reason, C<$char_class>es must never be mutated.
224
225							In this implementation, the state transitions of a C<$nfa> are based upon
226							character classes (not single characters). A character class is an ordered
227							list of disjoint, non-mergeable intervals (over unicode code points,
228							i.e. non-negative integers).
229
230							$char_class = [
231							[ $low_0, $high_0 ] # $interval_0
232							, [ $low_1, $high_1 ] # $interval_1
233							, ...
234							]
235
236
237							Constraints:
238
239							1: 0 <= $$char_class[$i][0] (0 <= low)
240							2: $$char_class[$i][1] <= MAX_CHAR (high <= MAX_CHAR)
241							3: $$char_class[$i][0] <= $$char_class[$i][1] (low <= high)
242							4: $$char_class[$i][1] + 1 < $$char_class[$i+1][0] (non mergeable)
243
244
245							Exceptions (anchors used in the parsing phase only):
246
247							begin : [ -2, -1 ]
248							end : [ -3, -2 ]
249							begin or end : [ -3, -1 ]
250
251							Immediately after parsing, such pseudo-character classes
252							are removed by C.
253
254							=over 4
255
256							=cut
257
258							our $ERE_litteral = qr/ [^.[\\()*+?{\|^\$] /xms;
259							our $PERLRE_char_class_special = qr/ [\[\]\\\^\-] /xms;
260
261							our $cc_any = bless([[ 0, MAX_CHAR ]], CHAR_CLASS);
262							our $cc_none = bless([], CHAR_CLASS);
263							our $cc_beg = bless([[ -2, -1]], CHAR_CLASS);
264							our $cc_end = bless([[ -3, -2]], CHAR_CLASS);
265							{
266
267	7			7		40	no warnings qw(utf8); # in particular for 0x10FFFF
	7					14
	7					126162
268
269							my %cc_cache;
270							# keys: join(',',1,map{@$_}@{$char_class})
271
272							for ($cc_any, $cc_none, $cc_beg, $cc_end) {
273							$cc_cache{ join(',', 1, map {@$_} @$_) } = $_;
274							}
275
276							=item char_to_cc($c)
277
278							Returns the unique $char_class equivalent to C<[[ord($c), ord($c)]]>.
279
280							=cut
281
282							sub char_to_cc {
283	196		100	196	1	8886	return $cc_cache{ join(',', 1, (ord($_[0])) x 2) }
284							\|\|= bless([[ord($_[0]), ord($_[0])]], CHAR_CLASS);
285							}
286
287							# $interval_list is the same data structure as $char_class.
288							# Constraints 1, 2 are assumed.
289							# Constraints 3, 4 are enforced.
290
291							=item interval_list_to_cc($interval_list)
292
293							C<$interval_list> is an arbitrary list of intervals.
294							Returns the unique C<$char_class> whose reunion of intervals
295							is the same set as the reunion of the intervals of C<$interval_list>.
296
297							Example:
298
299							interval_list_to_cc([[102, 112], [65, 90], [97, 102], [113, 122]])
300							returns [[65, 90], [97, 122]]
301							(i.e [f-p]\|[A-Z]\|[a-f]\|[q-z] => [A-Z]\|[a-z])
302
303							Note that both $interval_list and $char_class are lists of intervals,
304							but only $char_class obeys the constraints above,
305							while $interval_list does not.
306
307							Remark also that C<interval_list_to_cc($interval_list)> is the identity
308							(returns the same reference as given) on C<$char_class>es returned
309							by either C<interval_list_to_cc($interval_list)> or C<char_to_cc($c)>.
310
311							=cut
312
313							sub interval_list_to_cc {
314	4171			4171	1	4905	my ($interval_list) = @_;
315							my @sorted
316	4384					6261	= sort { $$a[0] <=> $$b[0] }
	6548					16986
317	4171					5462	grep { $$_[0] <= $$_[1] }
318							@$interval_list
319							;
320	4171					7854	my $char_class = bless([], CHAR_CLASS);
321	4171					5153	my $i = 0;
322	4171					8812	while ($i != @sorted) {
323	4519					5415	my $interval = $sorted[$i];
324	4519					4465	$i++;
325	4519		100			12929	while ($i != @sorted && $$interval[1] + 1 >= $sorted[$i][0]) {
326	2029	100				4279	if ($$interval[1] < $sorted[$i][1]) {
327	2028					2985	$$interval[1] = $sorted[$i][1];
328							}
329	2029					7368	$i++;
330							}
331	4519					11944	push(@$char_class, $interval);
332							}
333	4171		66			6454	return $cc_cache{ join(',', 1, map {@$_} @$char_class) }
	4519					24913
334							\|\|= $char_class;
335							}
336
337							sub cc_neg {
338	1051			1051	0	1239	my ($char_class) = @_;
339
340	1051	100				2152	if (!@$char_class) { return $cc_any; }
	81					493
341
342	970					1672	my $neg = bless([], CHAR_CLASS);
343	970	100				2304	if ($$char_class[0][0] != 0) {
344	932					2373	push(@$neg, [0, $$char_class[0][0] - 1]);
345							}
346	970					1094	my $i = 0;
347	970					2397	while ($i != $#$char_class) {
348	153					478	push(@$neg, [$$char_class[$i][1] + 1, $$char_class[$i+1][0] - 1]);
349	153					342	$i++;
350							}
351	970	100				2134	if ($$char_class[$i][1] != MAX_CHAR) {
352	932					2223	push(@$neg, [$$char_class[$i][1] + 1, MAX_CHAR]);
353							}
354	970		66			1503	return $cc_cache{ join(',', 1, map{@$_} @$neg) } \|\|= $neg;
	2017					19610
355							}
356
357							sub cc_inter2 {
358	354			354	0	489	my ($char_class_0, $char_class_1) = @_;
359
360	354					632	my $inter = bless([], CHAR_CLASS);
361	354					393	my $i_0 = 0;
362	354					336	my $i_1 = 0;
363	354		100			1472	while ($i_0 < @$char_class_0 && $i_1 < @$char_class_1) {
364
365							# skip interval_0 if interval_0 < interval_1
366	416		66			2577	while (
			100
367							$i_0 < @$char_class_0
368							&& $i_1 < @$char_class_1
369							&& $$char_class_0[$i_0][1] < $$char_class_1[$i_1][0]
370							) {
371	312					1608	$i_0++;
372							}
373
374							# skip interval_1 if interval_1 < interval_0
375	416		100			2243	while (
			100
376							$i_0 < @$char_class_0
377							&& $i_1 < @$char_class_1
378							&& $$char_class_1[$i_1][1] < $$char_class_0[$i_0][0]
379							) {
380	141					601	$i_1++;
381							}
382
383							# Check that the exit condition of the first while still holds.
384	416	100	100			2402	if (
			100
385							$i_0 < @$char_class_0
386							&& $i_1 < @$char_class_1
387							&& $$char_class_1[$i_1][0] <= $$char_class_0[$i_0][1]
388							) {
389							# The exit conditions of both whiles hold:
390							#
391							# $$char_class_0[$i_0][1] >= $$char_class_1[$i_1][0]
392							# && $$char_class_1[$i_1][1] >= $$char_class_0[$i_0][0]
393							#
394							# short:
395							# high_0 >= low_1
396							# high_1 >= low_0
397							#
398							# furthermore:
399							# high_0 >= low_0
400							# high_1 >= low_1
401							#
402							# with:
403							# min_high := min(high_0, high_1)
404							# max_low := max(low_0, low_1)
405							#
406							# holds:
407							# min_high >= max_low
408
409	213					188	my ($interval_0_done, $interval_1_done);
410
411	213	100				502	my $max_low =
412							$$char_class_0[$i_0][0] > $$char_class_1[$i_1][0]
413							? $$char_class_0[$i_0][0]
414							: $$char_class_1[$i_1][0]
415							;
416
417	213					313	my $min_high;
418	213	100				488	if ($$char_class_0[$i_0][1] <= $$char_class_1[$i_1][1]) {
419	140					186	$min_high = $$char_class_0[$i_0][1];
420							# interval_0 < next interval_1
421	140					155	$interval_0_done = 1;
422							}
423	213	100				423	if ($$char_class_1[$i_1][1] <= $$char_class_0[$i_0][1]) {
424	157					183	$min_high = $$char_class_1[$i_1][1];
425							# interval_1 < next interval_0
426	157					156	$interval_1_done = 1;
427							}
428	213	100				374	if ($interval_0_done) { $i_0++; }
	140					152
429	213	100				337	if ($interval_1_done) { $i_1++; }
	157					144
430
431	213					1144	push(@$inter, [$max_low, $min_high]);
432							}
433							}
434	354		33			1567	return $cc_cache{ join(',', 1, map{@$_} @$inter) } \|\|=$inter;
	213					1493
435							}
436							}
437
438							sub cc_match {
439	86306			86306	0	119665	my ($char_class, $c) = @_;
440	86306					116617	for my $interval (@$char_class) {
441	98032	100				245638	if ($c < $$interval[0]) {
		100
442	34606					110937	return 0;
443							}
444							elsif ($c <= $$interval[1]) {
445	19484					61529	return 1;
446							}
447							}
448	32216					97559	return 0;
449							}
450
451							=item cc_union(@char_classes)
452
453							Returns the unique C<$char_class> containing all characters of all given
454							C<@char_classes>.
455
456							=cut
457
458							sub cc_union {
459	1234			1234	1	1879	return interval_list_to_cc( [ map { map { [@$_] } @$_ } @_ ] );
	3216					4083
	3380					9697
460							}
461
462							sub cc_is_subset {
463	53			53	0	71	my ($char_class_0, $char_class_1) = @_;
464	53					80	for my $c ( map { @$_ } @$char_class_0 ) {
	54					160
465	86	100				252	if (!cc_match($char_class_1, $c)) { return 0; }
	29					100
466							}
467	24					95	return 1;
468							}
469
470							# $to_perlre (boolean)
471							# true : perl syntax
472							# false: ere syntax
473							sub cc_to_regex {
474	554			554	0	752	my ($char_class, $to_perlre) = (@_, 0);
475
476	554					532	my @items;
477	554	50	66			2166	if (@$char_class && $$char_class[0][0] < 0) {
478	0	0				0	if ($$char_class[0][0] == -2) {
479	0	0				0	if ($$char_class[0][1] == -1) {
480	0					0	push(@items, '^');
481							}
482							else {
483	0					0	push(@items, '^$');
484							}
485							}
486							else {
487	0	0				0	if ($$char_class[0][1] == -2) {
488	0					0	push(@items, '$');
489							}
490							else {
491	0					0	push(@items, '^', '$');
492							}
493							}
494	0					0	$char_class = [@$char_class[1..$#$char_class]];
495							}
496	554	100				936	if (@$char_class) {
497	514	100	100			2020	if (
		100	100
		100	66
498							@$char_class == 1
499							&& $$char_class[0][0] == $$char_class[0][1]
500							) {
501	500					809	my $c = chr($$char_class[0][0]);
502	500	100				687	if ($to_perlre) {
503	41					84	push(@items, quotemeta($c))
504							}
505							else {
506	459	100				1548	push(@items,
507							$c =~ /$ERE_litteral/o
508							? $c
509							: "\\$c"
510							);
511							}
512							}
513							elsif (
514							@$char_class == 1
515							&& $$char_class[0][0] == 0
516							&& $$char_class[0][1] == MAX_CHAR
517							) {
518	4					9	push(@items, '.');
519							}
520							elsif ($$char_class[$#$char_class][1] == MAX_CHAR) {
521	6	100				15	if ($to_perlre) {
522	1					5	push(@items,
523							'[^' . _cc_to_perlre(cc_neg($char_class)) . ']'
524							);
525							}
526							else {
527	5					16	push(@items,
528							'[^' . _cc_to_ere(cc_neg($char_class)) . ']'
529							);
530							}
531							}
532							else {
533	4	100				11	if ($to_perlre) {
534	1					2	push(@items, '[' . _cc_to_perlre($char_class) . ']');
535							}
536							else {
537	3					13	push(@items, '[' . _cc_to_ere($char_class) . ']');
538							}
539							}
540							}
541
542	554					564	my $regex;
543	554	100				1219	if (@items == 0) {
		50
544	40					188	return '';
545							}
546							elsif (@items == 1) {
547	514					2885	return $items[0];
548							}
549							else {
550	0	0				0	if ($to_perlre) {
551	0					0	return '(?:' . join('\|', @items) . ')';
552							}
553							else {
554	0					0	return '(' . join('\|', @items) . ')';
555							}
556							}
557							}
558
559							sub _cc_to_ere {
560	8			8		14	my ($char_class) = @_;
561	8					12	my $has_minus;
562							my $has_r_bracket;
563							my $ere = join('',
564							map {
565	8	100				18	if ($$_[0] == $$_[1]) {
	10					33
566	5	50				21	if ($$_[0] == ord('-')) {
		50
567	0					0	$has_minus = 1;
568	0					0	'';
569							}
570							elsif ($$_[0] == ord(']')) {
571	0					0	$has_r_bracket = 1;
572	0					0	'';
573							}
574							else {
575	5					22	chr($$_[0]);
576							}
577							}
578							else {
579	5	50	33			32	if (
580							$$_[0] == ord('-')
581							\|\| $$_[0] == ord(']')
582							) {
583	0	0				0	if ($$_[0] == ord('-')) {
584	0					0	$has_minus = 1;
585							}
586							else {
587	0					0	$has_r_bracket = 1;
588							}
589	0	0				0	if ($$_[1] == $$_[0] + 1) {
		0
590	0					0	chr($$_[1]);
591							}
592							elsif ($$_[1] == $$_[0] + 2) {
593	0					0	chr($$_[0] + 1) . chr($$_[1]);
594							}
595							else {
596	0					0	chr($$_[0] + 1) . '-' . chr($$_[1]);
597							}
598							}
599							else {
600	5	100				19	if ($$_[1] == $$_[0] + 1) {
601	4					22	chr($$_[0]) . chr($$_[1]);
602							}
603							else {
604	1					7	chr($$_[0]) . '-' . chr($$_[1]);
605							}
606							}
607							}
608							}
609							@$char_class
610							);
611	8	50				26	if ($has_minus) { $ere .= '-'; }
	0					0
612	8	50				20	if ($has_r_bracket) { $ere = "]$ere"; }
	0					0
613	8					30	return $ere;
614							}
615
616							sub _cc_to_perlre {
617	2			2		4	my ($char_class) = @_;
618							return join('',
619							map {
620	2	100				3	if ($$_[0] == $$_[1]) {
	2					7
621	1					3	my $c = chr($$_[0]);
622	1	50				16	$c =~ /$PERLRE_char_class_special/o ? "\\$c" : $c;
623							}
624							else {
625	1					4	my ($c1, $c2) = (chr($$_[0]), chr($$_[1]));
626	1	50				22	($c1 =~ /$PERLRE_char_class_special/o ? "\\$c1" : $c1)
		50
		50
627							. ($$_[0] + 1 < $$_[1] ? '-' : '')
628							. ($c2 =~ /$PERLRE_char_class_special/o ? "\\$c2" : $c2)
629							}
630							} @$char_class
631							);
632							}
633
634
635							##############################################################################
636							# $nfa
637							##############################################################################
638
639							=back
640
641							=head2 Nfa
642
643
644							WARNING: C routines are destructive,
645							the C<$nfa> references given as arguments will not be valid C<$nfa> any more.
646							Furthermore, the same C<$nfa> reference must be used only once as argument.
647							For instance, for concatenating a C<$nfa> with itself, C<nfa_concat($nfa, $nfa)>
648							does not work; instead, C<nfa_concat($nfa, nfa_clone($nfa))> must be used;
649							or even C<nfa_concat(nfa_clone($nfa, $nfa))> if the original
650							C<$nfa> is to be used further.
651
652							$nfa = [ $state_0, $state_1, ... ]
653
654							$state = [
655							$accepting
656							, $transitions
657							]
658
659							$transitions = [
660							[ $char_class_0 => $state_ind_0 ]
661							, [ $char_class_1 => $state_ind_1 ]
662							, ...
663							]
664
665							In the same C<$transition>, C<$state_ind_i> are pairwise different and are
666							valid indexes of C<@$nfa>. There is exactly one initial state at index 0.
667
668							=over 4
669
670							=item C<nfa_clone(@nfas)>
671
672							Maps each of the given C<@nfas> to a clone.
673
674							=cut
675
676							sub nfa_clone {
677							return
678	266			266	1	432	map { [
	541					1666
679	288					368	map { [
680							$$_[0] # accepting
681	711					996	, [ map { [ @$_ ] } @{$$_[1]} ] # transitions
	711					1758
682							] }
683							@$_ # states of the $nfa
684							] } @_ # list of $nfas
685							;
686							}
687
688							sub _transitions_is_subset {
689	604			604		855	my ($transitions_0, $transitions_1, $state_ind_map) = @_;
690	705	100	66			4722	my %state_ind_to_t_1
691	604					896	= map {(
692							$state_ind_map && exists($$state_ind_map{$$_[1]})
693							? $$state_ind_map{$$_[1]}
694							: $$_[1]
695							=> $_
696							)}
697							@$transitions_1
698							;
699	604					1211	for my $t_0 (@$transitions_0) {
700	184	100	66			921	my $state_ind_0
701							= $state_ind_map && exists($$state_ind_map{$$t_0[1]})
702							? $$state_ind_map{$$t_0[1]}
703							: $$t_0[1]
704							;
705	184	100				629	if (!exists($state_ind_to_t_1{$state_ind_0})) { return 0; }
	158					680
706	26					33	my $t_1 = $state_ind_to_t_1{$state_ind_0};
707	26	100				68	if (!cc_is_subset($$t_0[0], $$t_1[0])) { return 0; }
	16					63
708							}
709	430					1366	return 1;
710							}
711
712							# The keys of %$state_ind_to_equiv are state_inds of @$nfa to be removed.
713							# State indexes in transitions are remapped following %$state_ind_to_equiv.
714							# A state index mapped to itself denotes an unreachable state index.
715							sub _nfa_shrink_equiv {
716	733			733		984	my ($nfa, $state_ind_to_equiv) = @_;
717	733					847	my $i = 0;
718	3099					7130	my %compact_map
719	4282					9459	= map { ($_ => $i++) }
720							my @active_state_inds
721	733					1436	= grep { !exists($$state_ind_to_equiv{$_}) }
722							(0..$#$nfa)
723							;
724
725	733					1597	my %equiv_index_to_char_classes;
726							my %plain_index_to_char_class;
727	733					3402	for (@$nfa = @$nfa[@active_state_inds]) {
728
729							# update $state_ind
730							# -> $compact_map{$state_ind}
731							# or $compact_map{$$state_ind_to_equiv{$state_ind}}
732	3099					4619	%equiv_index_to_char_classes = ();
733	3099					3959	%plain_index_to_char_class = ();
734	3099					2905	for (@{$$_[1]}) { # transition list
	3099					5160
735	5788	100				10700	if (exists($$state_ind_to_equiv{$$_[1]})) {
736							push(
737	868					847	@{$equiv_index_to_char_classes{
738	868					3774	$$_[1]
739							= $compact_map{$$state_ind_to_equiv{$$_[1]}}
740							}}
741							, $$_[0]
742							);
743							}
744							else {
745							$plain_index_to_char_class{
746	4920					12515	$$_[1]
747							= $compact_map{$$_[1]}
748							} = $$_[0];
749							}
750							}
751							# merge char_classes to the same state index
752	3099	100				7524	if (keys(%equiv_index_to_char_classes)) {
753	727					3038	@{$$_[1]} = ((
	87					203
754	739					2899	map {[
755							exists($equiv_index_to_char_classes{$_})
756							? cc_union(
757							$plain_index_to_char_class{$_}
758	579	100				1905	, @{$equiv_index_to_char_classes{$_}}
759							)
760							: $plain_index_to_char_class{$_}
761							, $_
762							]}
763							keys(%plain_index_to_char_class)
764							) , (
765	826					1762	map {[
766	11					31	@{$equiv_index_to_char_classes{$_}} == 1
767							? $equiv_index_to_char_classes{$_}[0]
768	739	100				739	: cc_union(@{$equiv_index_to_char_classes{$_}})
769							, $_
770							]}
771	727					1491	grep { !exists($plain_index_to_char_class{$_}) }
772							keys(%equiv_index_to_char_classes)
773							))
774							}
775							}
776	733					6405	return $nfa;
777							}
778
779							=item C<nfa_quant($in_nfa, $min, $max)>
780
781							Precondition: C<0 <= $min && ( $max eq '' \|\| $min <= $max)>
782
783							Returns C<$out_nfa>, a C<$nfa> computed from C<$in_nfa>.
784
785							Let L be the language accepted by C<$in_nfa> and M the language accepted
786							by C<$out_nfa>. Then a word m belongs to M if and only if an ordered list
787							(l_1, ..., l_r) of words belonging to L exists such that:
788
789							$min <= r
790							and ($max eq '' or r <= $max)
791							and m is the concatenation of (l_1, ..., l_r)
792
793							Examples with C<$in_nfa> being a C<$nfa> accepting C<'^a$'>:
794
795							nfa_quant($in_nfa, 2, 4 ) accepts '^a{2,4}$'
796							nfa_quant($in_nfa, 0, '') accepts '^a{0,}$' (i.e. '^a*$')
797
798							=cut
799
800							sub nfa_quant {
801	245			245	1	368	my ($nfa, $min, $max) = @_;
802	245					244	my @quant_parts;
803	245	100				473	if ($min > 0) {
804	13					46	push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min)));
805							}
806
807	245					270	my $optional_part;
808	245	100	100			681	if (
809							length($max) == 0
810							\|\| $max > $min
811							) {
812	244	100				581	if ($$nfa[0][0]) {
	455	100				1029
813							# initial state already accepting
814							# (a*)?
815	7					15	($optional_part) = nfa_clone($nfa);
816							}
817							elsif (
818	579					1083	!grep { $$_[1] == 0 }
819	579					513	map { @{$$_[1]} }
820							@$nfa
821							) {
822							# initial state not accepting and unreachable
823							# (a)?
824	234					536	($optional_part) = nfa_clone($nfa);
825	234					424	$$optional_part[0][0] = 1;
826							}
827							else {
828							# initial state not accepting and reachable
829							# (a*b)?
830	6					22	$optional_part = [
831							# additional root initial state accepting state
832							[
833							1 # accepting
834	3					8	, [ map {[$$_[0] , $$_[1]+1]} @{$$nfa[0][1]} ] # transitions
	6					19
835							]
836							# original states with offset 1
837	3					6	, map { [
838							$$_[0] # accepting
839	6					9	, [ map {[ $$_[0], $$_[1]+1 ]} @{$$_[1]} ] # transitions
	6					16
840							] }
841							@$nfa
842							];
843							}
844							}
845	245	100				464	if (length($max) == 0) {
		100
846
847							# starify optional part
848
849	270					863	my %root_index_to_char_class
850	232					374	= map { ($$_[1] => $$_[0]) }
851	232					364	@{$$optional_part[0][1]}
852							;
853
854	232					428	my $state_ind_to_equiv = {};
855							# loop over accepting state indexes
856	232					432	for (grep { $$optional_part[$_][0] } (1..$#$optional_part)) {
	325					640
857	232	100				875	if (
858							_transitions_is_subset(
859							$$optional_part[$_][1]
860							, $$optional_part[0][1]
861							, { $_ => 0 }
862							)
863							) {
864							# Accepting states whose transitions are
865							# a subset of the transitions of the initial state
866							# are equivalent to the initial state.
867	228					931	$$state_ind_to_equiv{$_} = 0;
868							}
869							else {
870	4	100				7	if (
871	4					14	grep { exists($root_index_to_char_class{$_}) }
	4					10
872	4					25	map { $$_[1] }
873							@{$$optional_part[$_][1]}
874							) {
875							# merge char classes to the same state index
876	2					7	my %new_index_to_char_classes
877	2					4	= map { ($$_[1] => [$$_[0]]) }
878	2					3	@{$$optional_part[$_][1]}
879							;
880	2					7	for (keys(%root_index_to_char_class)) {
881	4					10	push (
882	4					4	@{$new_index_to_char_classes{$_}}
883							, $root_index_to_char_class{$_}
884							);
885							}
886	2					11	@{$$optional_part[$_][1]}
	4					13
887	2					6	= map {[
888	2					4	@{$new_index_to_char_classes{$_}} == 1
889							? $new_index_to_char_classes{$_}[0]
890	4	100				3	: cc_union(@{$new_index_to_char_classes{$_}})
891							, $_
892							]}
893							keys(%new_index_to_char_classes)
894							;
895							}
896							else {
897	2					6	push(
898	4					15	@{$$optional_part[$_][1]}
899	2					3	, map { [@$_] } @{$$optional_part[0][1]}
	2					6
900							);
901							}
902							}
903							}
904
905	232	100				752	push(@quant_parts,
906							keys(%$state_ind_to_equiv)
907							? _nfa_shrink_equiv($optional_part, $state_ind_to_equiv)
908							: $optional_part
909							);
910							}
911							elsif ($max > $min) {
912
913							# concatenate optional_part $max - $min times
914
915	12					31	push(@quant_parts, _nfa_concat(1, nfa_clone(
916							($optional_part) x ($max - $min)
917							)));
918							}
919	245	100				788	return @quant_parts == 1 ? $quant_parts[0] : nfa_concat(@quant_parts);
920							}
921
922							=item C<nfa_concat(@in_nfas)>
923
924							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
925
926							Let r be the number of given C<@in_nfas>,
927							L_i the language accepted by C<$in_nfas[$i]> and M the language accepted
928							by C<$out_nfa>. Then a word m belongs to M if and only if an ordered list
929							(l_1, ..., l_r) of words exists, l_i belonging to L_i, such that
930							m is the concatenation of (l_1, ..., l_r).
931
932							=cut
933
934							sub nfa_concat {
935	245			245	1	531	_nfa_concat(0, @_);
936							}
937
938							sub _nfa_concat {
939	257			257		474	my $starifying = shift(@_);
940	257	50				586	if (!@_) {
941	0					0	return [[1, []]]; # neutral element: accepting empty string
942							}
943	257					322	my $concat = shift(@_);
944	257					478	my @accepting_state_inds = grep { $$concat[$_][0] } (0..$#$concat);
	643					1179
945	257					378	my $state_ind_to_equiv = {};
946							my (
947	257					324	$nfa
948							, $state
949							, $init_state_ind
950							, $init_reachable
951							, $init_equiv_reachable
952							, $init_skipped
953							, @new_accepting_state_inds
954							);
955	257					591	while (@_) {
956	346					400	$nfa = shift(@_);
957	346					378	$init_state_ind = @$concat;
958	346					314	$init_reachable = 0;
959	346					521	$init_equiv_reachable = 0;
960	346					354	$init_skipped = 0;
961							@new_accepting_state_inds
962	385					873	= map { $_ + $init_state_ind }
	823					1210
963	346					596	grep { $$nfa[$_][0] }
964							(0..$#$nfa)
965							;
966	346					535	for (map { @{$$_[1]} } @$nfa) {
	823					760
	823					1588
967	813	100	100			2572	($$_[1] += $init_state_ind) == $init_state_ind
968							&& ($init_reachable \|\|= 1);
969							}
970	346					607	for my $acc_ind (@accepting_state_inds) {
971	417					604	$state = $$concat[$acc_ind];
972	417					570	$$state[0] = $$nfa[0][0];
973	417	100	100			392	if (
		100	100
974	417					2051	@{$$state[1]} <= 1
975							&& _transitions_is_subset(
976							$$state[1] # transitions of the old accepting state
977							, $$nfa[0][1] # transitions of the new initial state
978							, { $acc_ind => $init_state_ind }
979							)
980	245					924	) {
981
982							# Old accepting states whose transitions are
983							# a subset of the transitions of the new initial state
984							# are equivalent to the initial state.
985							#
986							# Note that such an old accepting state can have either
987							# no transition or one self-transition;
988							# the case that the old accepting state has no transition
989							# occurs very often.
990							#
991							# %$state_ind_to_equiv gets extended by
992							#
993							# $acc_ind_ (old accepting state) => $init_state_ind
994							#
995							# But the keys and the values of %$state_ind_to_equiv
996							# MUST remain disjoint (except for pairs key = val).
997							#
998							# Since $init_state_index are growing
999							# and $acc_ind < $init_state_index:
1000							# - the new value does not belong to the keys
1001							# - the new key may belong to the vals,
1002							# such values must be updated.
1003							#
1004							# Example:
1005							# 0 => 1 ( %$state_ind_to_equiv )
1006							# 1 => 2 ( $acc_ind => $init_state_index )
1007							# %$state_ind_to_equiv must be updated to
1008							# 0 => 2
1009							# before being extended by
1010							# 1 => 2
1011	202					556	for (grep { $_ == $acc_ind } values(%$state_ind_to_equiv)) {
	111					347
1012	3					7	$_ = $init_state_ind;
1013							}
1014	202					386	$$state_ind_to_equiv{$acc_ind} = $init_state_ind;
1015	202					508	$init_equiv_reachable = 1;
1016							}
1017							elsif (
1018	215					403	(grep { $$_[1] == $init_state_ind } @{$$nfa[0][1]})
	27					53
1019							&& cc_is_subset(
1020
1021							# char_class of the self-transition
1022							# of the new initial state
1023							(
1024	29					62	map { $$_[0] }
1025	27					50	grep { $$_[1] == $init_state_ind }
1026	22					51	@{$$nfa[0][1]}
1027							)
1028
1029							# char_class of the self-transition
1030							# of the old accepting state
1031							, (
1032	38					91	map { $$_[0] }
1033	27					40	grep { $$_[1] == $acc_ind }
1034							@{$$state[1]}
1035							)
1036							)
1037							) {
1038							# If the self-transitions of the new init state are
1039							# a subset of the transitions of the old accepting state,
1040							# the new state is not needed for looping;
1041							# the transition to the new init state can be skipped.
1042							#
1043							# Example 1:
1044							# [ab]*a*
1045							# the state for a* is superfluous.
1046							# Example 2:
1047							# ( x[ab]* \| y[ac]* \| z[bc]* ) a* c
1048							# the state for a* is only needed after [bc]*
1049							# the regular expression is equivalent to:
1050							# x[ab]*c \| y[ac]*c \| z[bc]*a*c
1051							#
1052							# Note that this one-letter-star optimization is
1053							# probably not very useful for practical purposes;
1054							# more general equivalences like (abc)*(abc)* ~ (abc)*
1055							# are not caught, while the focused use cases
1056							# of prefix and suffix recognition need no star at all.
1057							#
1058							# It is merely a toy optimization for solving some exercises
1059							# of an introductory course on regexes.
1060							#
1061	14					31	push(@{$$state[1]},
	2					7
1062	16					38	map { [ @$_ ] }
1063	14					28	grep { $$_[1] != $init_state_ind}
1064	14					19	@{$$nfa[0][1]})
1065							;
1066	14					44	$init_skipped++;
1067							}
1068							else {
1069	201					374	push(@{$$state[1]},
	229					1113
1070	201					334	map { [ @$_ ] }
1071	201					212	@{$$nfa[0][1]})
1072							;
1073							}
1074							}
1075	346	100	100			2068	if (
			100
1076							!$init_reachable && !$init_equiv_reachable
1077							\|\| $init_skipped == @accepting_state_inds
1078							) {
1079							# for being removed by _nfa_shrink_equiv()
1080	137					311	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1081							}
1082
1083	346	100				1059	if (!$$nfa[0][0]) {
		100
1084	168					309	@accepting_state_inds = ();
1085							}
1086							elsif ($starifying) {
1087							# $starifying set for optimizing x{n,m}.
1088							# The old accepting states are redundant,
1089							# since they are reachable iff the newer ones are.
1090	5					15	for (@accepting_state_inds[1..$#accepting_state_inds]) {
1091	7					13	$$concat[$_][0] = 0;
1092							}
1093	5	50				14	if (!$init_reachable) {
1094	5					10	$$nfa[0][0] = 0;
1095	5					5	shift(@new_accepting_state_inds);
1096							}
1097	5					21	@accepting_state_inds = (0);
1098							}
1099							else {
1100							@accepting_state_inds
1101	173					242	= grep { !exists($$state_ind_to_equiv{$_}) }
	190					557
1102							@accepting_state_inds
1103							;
1104							}
1105
1106	346					810	push(@$concat, @$nfa);
1107	346					781	push(@accepting_state_inds, @new_accepting_state_inds);
1108							}
1109	257	100				599	if (keys(%$state_ind_to_equiv)) {
1110	211					420	return _nfa_shrink_equiv($concat, $state_ind_to_equiv);
1111							}
1112							else {
1113	46					164	return $concat;
1114							}
1115							}
1116
1117							=item C<nfa_union(@in_nfas)>
1118
1119							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
1120
1121							C<$out_nfa> accepts a word w if and only if at least one of C<@in_nfas>
1122							accepts w.
1123
1124							=cut
1125
1126							# Adds the total number of states
1127							sub nfa_union {
1128	125			125	1	294	my $union = [[0, []]]; # root, neutral element: accepting nothing
1129	125					185	my $state_ind_to_equiv = {};
1130	125					215	my $first_trivial_accepting_state_ind;
1131							my (
1132	125					142	$nfa
1133							, $init_state_ind
1134							, $init_reachable
1135							, $orig_state
1136							);
1137
1138	125					217	for $nfa (@_) {
1139
1140							# merge initial $accepting
1141	293		100			1243	$$union[0][0] \|\|= $$nfa[0][0];
1142	293	100	100			651	if (@$nfa == 1 && @{$$nfa[0][1]} == 0) {
	42					149
1143	31					48	next;
1144							# Must be skipped because such a trivial state
1145							# would be removed below (!$init_reachable)
1146							# although it may be the $first_trivial_accepting state.
1147							#
1148							# On the other hand, a well defined $nfa
1149							# with a single state and with a non-empty transition list
1150							# must loop to itself, thus $init_reachable.
1151							}
1152
1153	262					293	$init_state_ind = @$union;
1154	262					254	$init_reachable = 0;
1155	262					512	for (0..$#$nfa) {
1156	1106					1404	$orig_state = $$nfa[$_];
1157	1106	100	100			2549	if (
1158	269					815	$$orig_state[0] # accepting
1159							&& !@{$$orig_state[1]} # trivial
1160							) {
1161	214	100				364	if (defined($first_trivial_accepting_state_ind)) {
1162	115					342	$$state_ind_to_equiv{$_ + $init_state_ind}
1163							= $first_trivial_accepting_state_ind;
1164							}
1165							else {
1166	99					315	$first_trivial_accepting_state_ind
1167							= $_ + $init_state_ind;
1168							}
1169							}
1170							else {
1171	892					783	for ( @{$$orig_state[1]} ) { # transition list
	892					1362
1172	1124	100	100			3336	($$_[1] += $init_state_ind) == $init_state_ind
1173							&& ($init_reachable \|\|= 1);
1174							}
1175							}
1176							};
1177	262					683	push(@$union, @$nfa);
1178
1179							# merge initial $transitions
1180	262					273	push(@{$$union[0][1]}, map { [ @$_ ] } @{$$nfa[0][1]});
	262					389
	294					812
	262					394
1181	262	100				712	if (!$init_reachable) {
1182							# for being removed by _nfa_shrink_equiv()
1183	237					618	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1184							}
1185							};
1186	125	100				391	if (keys(%$state_ind_to_equiv)) {
1187	116					226	return _nfa_shrink_equiv($union, $state_ind_to_equiv);
1188							}
1189							else {
1190	9					39	return $union;
1191							}
1192							}
1193
1194							{
1195
1196							my %cached_cc_inter2;
1197
1198							=item C<nfa_inter(@in_nfas)>
1199
1200							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
1201
1202							C<$out_nfa> accepts a word w if and only if each of C<@in_nfas> accepts w.
1203
1204							=cut
1205
1206							sub nfa_inter {
1207	16			16	1	804	my ($inter, @nfas) = sort { @$a <=> @$b } @_;
	21					62
1208	16					30	for (@nfas) { $inter = nfa_inter2($inter, $_); }
	18					58
1209							return
1210	16		50			123	$inter
1211							\|\| [[1, [[$cc_any, 0]]]] # neutral element: accepting anything
1212							;
1213							}
1214
1215							# Multiplies the total number of states
1216							sub nfa_inter2 {
1217	18			18	0	28	my ($nfa_0, $nfa_1) = @_;
1218
1219							# computed states
1220	18					70	my @todo = (0);
1221	18					26	my %todo_seen; # set of state_inds
1222							my %done; # key-subset of %todo_seen (values are states)
1223							# After the following while, %done and %todo_seen are the same set.
1224
1225							# dead end detection
1226	0					0	my %path_tr;
1227	0					0	my @cur_livings;
1228	0					0	my %livings;
1229
1230							# tmp variables
1231							my (
1232	0					0	$from_state_ind, $to_state_ind
1233							, $nfa_0_accepting, $nfa_0_transitions
1234							, $nfa_1_accepting, $nfa_1_transitions
1235							, $t_0, $t_1
1236							, $char_class
1237							, $accepting
1238							, @keys_path_to_state_ind
1239							);
1240
1241	18					28	my $nfa_1_len = @$nfa_1;
1242
1243	18					49	while (@todo) {
1244	831					1513	$todo_seen{$from_state_ind} = $from_state_ind = pop(@todo);
1245
1246	831					1478	($nfa_0_accepting, $nfa_0_transitions)
1247	831					854	= @{$$nfa_0[$from_state_ind / $nfa_1_len]}; # i-th state
1248	831					1349	($nfa_1_accepting, $nfa_1_transitions)
1249	831					864	= @{$$nfa_1[$from_state_ind % $nfa_1_len]}; # j-th state
1250
1251	831					1142	my $new_transitions = [];
1252	831					1056	for $t_0 (@$nfa_0_transitions) {
1253	1742					2079	for $t_1 (@$nfa_1_transitions) {
1254
1255	4160	100	66			18349	if (
1256							(
1257							$char_class
1258							= $cached_cc_inter2{$$t_0[0]}{$$t_1[0]}
1259							\|\|= &cc_inter2($$t_0[0], $$t_1[0])
1260							) != $cc_none
1261							) {
1262	1942					4736	push (@$new_transitions, [
1263							$char_class
1264							, $to_state_ind = $$t_0[1] * $nfa_1_len + $$t_1[1]
1265							]);
1266	1942	100				4093	if (!exists($todo_seen{$to_state_ind})) {
1267	813					3309	push(@todo,
1268							$todo_seen{$to_state_ind} = $to_state_ind);
1269							}
1270	1942					5748	$path_tr{$to_state_ind}{$from_state_ind} = undef;
1271							}
1272							}
1273							}
1274	831	100	100			2358	if ($accepting = $nfa_0_accepting && $nfa_1_accepting) {
1275	19					37	push(@cur_livings, $from_state_ind);
1276							}
1277	831					3085	$done{$from_state_ind} = [
1278							$accepting
1279							, $new_transitions
1280							];
1281							}
1282
1283							# remove dead ends
1284	18					37	%livings = map { ($_ => $_) } @cur_livings;
	19					89
1285	18					53	while (@cur_livings) {
1286	611					1522	push(@cur_livings,
1287	1518					2670	map { $livings{$_} = $_ }
1288	630					1437	grep { !exists($livings{$_}) }
1289	630					634	keys(%{$path_tr{pop(@cur_livings)}})
1290							);
1291							}
1292
1293	18	50				59	if (keys(%livings) == 0) {
1294	0					0	return [[0, []]];
1295							}
1296
1297							# compact renumbering
1298	18					28	my @sorted_keys;
1299							my $inter = [@done{
1300	18					185	@sorted_keys = sort { $a <=> $b } keys(%livings)
	2879					3145
1301							}];
1302	18					61	my $i = 0;
1303	18					40	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	630					947
1304
1305	18					74	for (
1306	630					1582	map {
1307	1638					3114	@{$$_[1]}
1308	630					841	= grep { exists($compact_map{$$_[1]}) }
1309	630					559	@{$$_[1]}
1310							}
1311							@$inter
1312							) {
1313	1518					1898	$$_[1] = $compact_map{$$_[1]};
1314							}
1315	18					1000	return $inter;
1316							}
1317							}
1318
1319							sub nfa_resolve_anchors {
1320	10			10	0	17	my ($nfa) = @_;
1321
1322							# find state_inds reachable from the root by begin-anchor transitions
1323	10					26	my %begs = (0 => undef);
1324	10					21	my @todo = (0);
1325	10					30	while (defined(my $beg = pop(@todo))) {
1326	12					16	for (
1327	2					4	map { $$_[1] } # state_ind
	21					85
1328	12					29	grep { $$_[0][0][1] == -1 } # begin-achor
1329							@{$$nfa[$beg][1]}
1330							) {
1331	2	50				7	if (!exists($begs{$_})) {
1332	2					4	$begs{$_} = undef;
1333	2					7	push(@todo, $_);
1334							}
1335							}
1336							}
1337
1338							# find state_inds leading to an accepting state by end-anchor transitions
1339	10					13	my @cur_livings;
1340							my %path_tr;
1341	10					29	for my $from_state_ind (0..$#$nfa) {
1342	37					41	for (@{$$nfa[$from_state_ind][1]}) {
	37					68
1343	44					147	$path_tr{$$_[1]}{$from_state_ind} = $$_[0];
1344							}
1345	37	100				91	if ($$nfa[$from_state_ind][0]) {
1346	10					24	push(@cur_livings, $from_state_ind);
1347							}
1348							}
1349	10					22	my %livings = map {($_ => undef)} @cur_livings;
	10					32
1350	10					35	while (defined(my $end = pop(@cur_livings))) {
1351	10					17	for (
1352	16					127	grep {
1353	10					29	$path_tr{$end}{$_}[0][0] == -3; # end-anchor
1354							}
1355							keys(%{$path_tr{$end}})
1356							) {
1357	0	0				0	if (!exists($livings{$_})) {
1358	0					0	push(@cur_livings, $livings{$_} = undef);
1359	0					0	$$nfa[$_][0] = 1;
1360							}
1361							}
1362							}
1363
1364	10					15	my $accept_empty;
1365	10	100				23	if (!($accept_empty = scalar(grep {$$nfa[$_][0]} keys(%begs)) ? 1 : 0)) {
	12	100				58
1366							# special case for $^ and the like: empty string matches
1367	9					13	my %begends;
1368	9					19	my @todo = keys(%begs);
1369	9					134	while (defined(my $begend = pop(@todo))) {
1370	22					26	for (
1371	16					35	map { $$_[1] } # state_ind
	31					86
1372	22					43	grep { $$_[0][0][1] < 0 } # achor
1373							@{$$nfa[$begend][1]}
1374							) {
1375	16	50	66			66	if (!exists($begs{$_}) && !exists($begends{$_})) {
1376	14	100				43	if ($$nfa[$_][0]) {
1377	3					6	$accept_empty = 1;
1378	3					7	@todo = ();
1379	3					15	last;
1380							}
1381	11					16	$begends{$_} = undef;
1382	11					44	push(@todo, $_);
1383							}
1384							}
1385							}
1386							}
1387
1388							# remove anchors
1389	10					25	for my $from_state_ind (
	44					92
1390							grep {
1391	37					56	grep { $$_[0][0][0] < 0 } # anchor
1392	37					39	@{$$nfa[$_][1]} # transitions
1393							}
1394							(0..$#$nfa)
1395							) {
1396	20					27	my $state = $$nfa[$from_state_ind];
1397							$$state[1] = [
1398							map {
1399	30	100				61	if ($$_[0][0][0] >= 0) {
	20	50				39
	20					29
1400	10					23	$_;
1401							}
1402							elsif ( @{$$_[0]} == 1 ) {
1403	20					119	delete($path_tr{$$_[1]}{$from_state_ind});
1404	20					64	();
1405							}
1406							else {
1407	0					0	$path_tr{$$_[1]}{$from_state_ind}
1408							= $$_[0]
1409	0					0	= interval_list_to_cc(@{$$_[0]}[1..$#{$$_[0]}]);
	0					0
1410	0					0	$_;
1411							}
1412							}
1413	20					20	@{$$state[1]} # transitions
1414							];
1415							}
1416
1417							# ensure that the initial state cannot be reached
1418	10	100				26	if (@{$$nfa[0][1]}) {
	10					29
1419							# proper init transitions (clone of the initial state needed)
1420
1421							# replace transitions to the initial state
1422							# with transitions to the cloned initial state
1423	8					67	my $new_state_ind = @$nfa;
1424	8					11	my $clone_reachable;
1425	8					18	for my $transition (
	22					46
1426	32					49	grep { $$_[1] == 0 } # to initial state
1427	32					26	map { @{$$_[1]} } # transitions
1428							@$nfa
1429							) {
1430	8					11	$$transition[1] = $new_state_ind;
1431	8					18	$clone_reachable = 1;
1432							}
1433
1434	8	50				25	if ($clone_reachable) {
1435	8					21	my $new_state = [
1436							$$nfa[0][0]
1437	8					14	, [@{$$nfa[0][1]}]
1438							];
1439	8					17	push(@$nfa, $new_state);
1440	8					18	$path_tr{$new_state_ind} = $path_tr{0};
1441	8					11	for (@{$$nfa[0][1]}) {
	8					18
1442	10					40	$path_tr{$$_[1]}{$new_state_ind} = $$_[0];
1443							}
1444	8	50				30	if ($$nfa[0][0]) {
1445	0					0	$livings{$new_state_ind} = undef;
1446							}
1447							}
1448							}
1449							else {
1450							# no proper init transitions
1451
1452							# drop transitions to the initial state
1453	2					5	for my $state (@$nfa) {
1454	5					6	@{$$state[1]} = grep { $$_[1] != 0 } @{$$state[1]};
	5					9
	2					6
	5					6
1455							}
1456							}
1457	10					20	delete($path_tr{0});
1458
1459							# extend initial state (merge all initial states of %begs)
1460	10	100				26	if (keys(%begs) > 1) {
1461	2					4	my %state_ind_to_char_classes;
1462	2					5	for ( map { @{$$nfa[$_][1]} } keys(%begs) ) {
	4					3
	4					10
1463	5					6	push(@{$state_ind_to_char_classes{$$_[1]}}, $$_[0]);
	5					23
1464							}
1465	2					7	@{$$nfa[0][1]}
	5					11
1466	2					5	= map { [
1467	5					4	$path_tr{$_}{0} = cc_union(@{$state_ind_to_char_classes{$_}})
1468							, int($_)
1469							] }
1470							keys(%state_ind_to_char_classes)
1471							;
1472							}
1473	10	100				34	if ($$nfa[0][0] = $accept_empty) {
1474	4					7	$livings{0} = undef;
1475							}
1476
1477							# remove unreachable states
1478	10					21	my @cur_reachables = (0);
1479	10					21	my %reachables = (0 => 0);
1480	10					31	while (@cur_reachables) {
1481	23					34	my $from_state_ind = shift(@cur_reachables);
1482	23					25	for (
1483	24					50	map { $$_[1] }
	23					64
1484							@{$$nfa[$from_state_ind][1]}
1485							) {
1486	24	100				72	if (!exists($reachables{$_})) {
1487	13					54	push(@cur_reachables, $reachables{$_} = $_);
1488							}
1489							}
1490							}
1491
1492							# remove dead ends
1493	10					23	delete(@livings{grep { !exists($reachables{$_}) } keys(%livings)});
	13					49
1494	10					24	@cur_livings = keys(%livings);
1495	10					27	while (@cur_livings) {
1496	11					15	for (
1497	11					32	grep { exists($reachables{$_}) }
	11					43
1498							keys(%{$path_tr{pop(@cur_livings)}})
1499							) {
1500	8	100				24	if (!exists($livings{$_})) {
1501	5					9	push(@cur_livings, $_);
1502	5					15	$livings{$_} = undef;
1503							}
1504							}
1505							}
1506
1507	10	100				41	if (keys(%livings) == 0) {
		50
1508	4					46	return [[0, []]];
1509							}
1510							elsif (keys(%livings) == @$nfa) {
1511	0					0	return $nfa;
1512							}
1513
1514							# compact renumbering
1515	6					21	my @sorted_keys = sort { $a <=> $b } keys(%livings);
	8					21
1516	6					13	my $i = 0;
1517	6					25	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	11					42
1518
1519							return [
1520	11					18	map {
1521	6					19	@{$$_[1]}
	8					10
1522							= map {
1523	12					28	$$_[1] = $compact_map{$$_[1]};
1524	8					10	$_;
1525							}
1526	11					19	grep { exists($compact_map{$$_[1]}) }
1527	11					16	@{$$_[1]}
1528							;
1529	11					76	$_;
1530							}
1531							@$nfa[@sorted_keys]
1532							];
1533							}
1534
1535							=item C<nfa_match($in_nfa, $str)>
1536
1537							Returns true if and only if C<$in_nfa> accepts C<$str>.
1538
1539							=cut
1540
1541							sub nfa_match {
1542	19			19	1	7373	my ($nfa, $str) = @_;
1543
1544	19					48	my %state_inds = (0 => 0);
1545	19					74	for my $c ( map { ord($_) } split('', $str) ) {
	119					152
1546	66					292	%state_inds
1547	105					170	= map { $$_[1] => $$_[1] }
1548	74					168	grep { cc_match($$_[0], $c) } # matching transition list
1549	119					218	map { @{$$_[1]} } # all transition list
	74					77
1550							@$nfa[values(%state_inds)] # current states
1551							;
1552							}
1553
1554	19					112	return grep { $$_[0] } @$nfa[values(%state_inds)];
	11					70
1555							}
1556
1557							sub nfa_dump {
1558	0			0	0	0	my ($nfa) = @_;
1559	0					0	my $dump = '';
1560	0					0	for my $i (0..$#$nfa) {
1561	0	0				0	$dump
1562							.= "$i:"
1563							. ($$nfa[$i][0] ? " (accepting)" : "")
1564							. "\n"
1565							;
1566	0					0	for my $transition (@{$$nfa[$i][1]}) {
	0					0
1567	0					0	$dump
1568							.= " "
1569							. cc_to_regex($$transition[0]) . " => $$transition[1]\n";
1570							}
1571							}
1572	0					0	return $dump;
1573							}
1574
1575							=item C<nfa_isomorph($nfa1, $nfa2)>
1576
1577							Returns true if and only if the labeled graphs represented by C<$nfa1>
1578							and C<$nfa2> are isomorphic. While isomorphic C<$nfa>s accept the same language,
1579							the converse is not true.
1580
1581							=cut
1582
1583							sub nfa_isomorph {
1584	89			89	1	1085	my ($nfa1, $nfa2) = @_;
1585
1586	89					229	my %nfa1_nfa2_indexes = (0 => 0);
1587	89					181	my %nfa2_nfa1_indexes = (0 => 0);
1588	89					182	my @nfa1_index_todo = (0);
1589
1590	89					255	while (defined(my $nfa1_index = pop(@nfa1_index_todo))) {
1591
1592	470					731	my $state1 = $$nfa1[$nfa1_index];
1593	470					672	my $state2 = $$nfa2[$nfa1_nfa2_indexes{$nfa1_index}];
1594
1595							# accepting
1596	470	50				989	if ($$state1[0] != $$state2[0]) {
1597	0					0	return 0;
1598							}
1599
1600							# transitions
1601	470					449	my $transitions1 = [sort { $$a[0] <=> $$b[0] } @{$$state1[1]}];
	1847					2885
	470					1415
1602	470					656	my $transitions2 = [sort { $$a[0] <=> $$b[0] } @{$$state2[1]}];
	1851					2785
	470					898
1603	470	50				1073	if (@$transitions1 != @$transitions2) {
1604	0					0	return 0;
1605							}
1606	470					888	for my $i (0..$#$transitions1) {
1607	1393					1390	my ($cc1, $next_index1) = @{$$transitions1[$i]};
	1393					2691
1608	1393					1465	my ($cc2, $next_index2) = @{$$transitions2[$i]};
	1393					2039
1609	1393	50				3502	if ($cc1 ne $cc2) {
1610	0					0	return 0;
1611							}
1612	1393	100				2640	if (exists($nfa1_nfa2_indexes{$next_index1})) {
		50
1613	1012	50				4028	if ($nfa1_nfa2_indexes{$next_index1} != $next_index2) {
1614	0					0	return 0;
1615							}
1616							}
1617							elsif (exists($nfa2_nfa1_indexes{$next_index2})) {
1618							# $nfa2_nfa1_indexes{$next_index2} != $next_index1
1619							# because
1620							# - !exists($nfa1_nfa2_indexes{$next_index1})
1621							# - $nfa1_nfa2_indexes and $nfa2_nfa1_indexes
1622							# are reverse to each other by construction
1623	0					0	return 0;
1624							}
1625							else {
1626	381					769	$nfa1_nfa2_indexes{$next_index1} = $next_index2;
1627	381					591	$nfa2_nfa1_indexes{$next_index2} = $next_index1;
1628	381					1135	push(@nfa1_index_todo, $next_index1);
1629							}
1630							}
1631							}
1632	89					1179	return 1;
1633							}
1634
1635
1636							##############################################################################
1637							# $dfa
1638							##############################################################################
1639
1640							# input X:
1641							# Arbitrary list of intervals.
1642							# output Y:
1643							# List of pairwise disjoint intervals spanning the same subset such that
1644							# for any intersections/unions of intervals of X
1645							# an equal union of intervals of Y exists.
1646							# In short, all boundaries of X are preserved.
1647							#
1648							# Motivation:
1649							# nfas use character classes as alphabet (instead of single code points).
1650							# dfa operations need a common refinement of sets of character classes.
1651							#
1652							# Example:
1653							# interval_cases( [ [0, 5], [2, 8] ] )
1654							# = [ [0, 1], [2, 5], [6, 8] ]
1655							#
1656							# X: \|0 1 2 3 4 5\|
1657							# \|2 3 4 5 6 7 8\|
1658							# Y: \|0 1\|2 3 4 5\|6 7 8\|
1659							#
1660							sub interval_cases {
1661	1209			1209	0	1419	my ($interval_list) = @_;
1662							my @sorted
1663	31795	50				58266	= sort {
1664	1209					2974	$$a[0] <=> $$b[0]
1665							\|\| $$b[1] <=> $$a[1]
1666							}
1667							@$interval_list
1668							;
1669	1209					1229	my %los;
1670							my %his;
1671	1209					1399	my $i = 0;
1672	1209					2482	while ($i < @sorted) {
1673	3404					6801	$los{$sorted[$i][0]} = undef;
1674	3404					5292	$his{$sorted[$i][1]} = undef;
1675	3404					3864	my $j = $i + 1;
1676	3404		100			20574	while (
			100
1677							$j < @sorted
1678							&& $sorted[$j][0] == $sorted[$i][0]
1679							&& $sorted[$j][1] == $sorted[$i][1]
1680							) {
1681							# $sorted[$i] ---------
1682							# $sorted[$j] ---------
1683	1347					6644	$j++;
1684							}
1685	3404		100			14800	while (
			66
1686							$j < @sorted
1687							&& $sorted[$j][0] == $sorted[$i][0]
1688							&& $sorted[$j][1] < $sorted[$i][1]
1689							) {
1690							# $sorted[$i] ---------
1691							# $sorted[$j] -----
1692	1240					2069	$his{$sorted[$j][1]} = undef;
1693	1240					1886	$los{$sorted[$j][1]+1} = undef;
1694	1240					7359	$j++;
1695							}
1696							# $sorted[$j][0] > $sorted[$i][0]
1697	3404		100			14141	while (
1698							$j < @sorted
1699							&& $sorted[$j][1] < $sorted[$i][1]
1700							) {
1701							# $sorted[$i] ---------
1702							# $sorted[$j] -----
1703	2674					4201	$his{$sorted[$j][0]-1} = undef;
1704	2674					3394	$los{$sorted[$j][0]} = undef;
1705	2674					3183	$his{$sorted[$j][1]} = undef;
1706	2674					3617	$los{$sorted[$j][1]+1} = undef;
1707	2674					11252	$j++;
1708							}
1709	3404	100	100			11948	if (
1710							$j < @sorted
1711							&& $sorted[$j][0] <= $sorted[$i][1]
1712							) {
1713							# $sorted[$j][0] > $sorted[$i][0]
1714							# && $sorted[$j][0] <= $sorted[$i][1]
1715							# && $sorted[$j][1] >= $sorted[$i][1]
1716							#
1717							# $sorted[$i] ---------
1718							# $sorted[$j] -----
1719	343					544	$his{$sorted[$j][0]-1} = undef;
1720	343	50				786	if ($sorted[$i][1] != $sorted[$j][1]) {
1721	0					0	$los{$sorted[$i][1]+1} = undef;
1722							}
1723							}
1724	3404					7809	$i = $j;
1725							}
1726	1209					3859	my @sorted_los = sort( { $a <=> $b } keys(%los));
	5992					8546
1727	1209					3488	my @sorted_his = sort( { $a <=> $b } keys(%his));
	6003					7953
1728	1209					2880	return [ map { [$sorted_los[$_], $sorted_his[$_]] } (0..$#sorted_los) ];
	3863					13160
1729							}
1730
1731							=item C<nfa_to_dfa($in_nfa)>
1732
1733							Compute a deterministic finite automaton from C<$in_nfa>
1734							(powerset construction).
1735
1736							The data structure of a deterministic finite automaton (dfa) is
1737							the same as that of a non-deterministic one, but it is further constrained:
1738							For each state and each unicode character there exists exactly one transition
1739							(i.e. a pair C<(char_class, $state_index)>) matching this character.
1740
1741							Note that the following constraint holds for both a C<$dfa> and a C<$nfa>:
1742							For each pair of state p1 and p2, there exists at most one transition
1743							from p1 to p2 (artefact of this implementation).
1744
1745							=cut
1746
1747							sub nfa_to_dfa {
1748	178			178	1	223	my ($nfa) = @_;
1749	178					295	my $dfa = [];
1750	178	50				434	if (!@$nfa) {
1751	0					0	return [[0, [$cc_any, 0]]];
1752							}
1753	178					225	my $trap_needed = 0;
1754	178					221	my $dfa_size = 0;
1755	178					439	my %dfa_indexes = ("0" => $dfa_size++);
1756	178					446	my @todo = ([0]);
1757	178					406	while (@todo) {
1758	1037					1525	my $nfa_indexes = pop(@todo);
1759	1037					2224	my $dfa_index = $dfa_indexes{join('.', @$nfa_indexes)};
1760	1037					2759	my @nfa_states = @$nfa[@$nfa_indexes];
1761
1762							# accepting
1763	1037	100				1346	$$dfa[$dfa_index][0] = scalar(grep { $$_[0] } @nfa_states) ? 1 : 0;
	1391					4532
1764
1765							# transitions
1766	3348					7401	my $cases = interval_cases([
1767	3348					3167	map { @{$$_[0]} }
	1391					3651
1768	1037					1394	map { @{$$_[1]} }
	1391					1391
1769							@nfa_states
1770							]);
1771	1037					4409	my %dfa_index_to_intervals;
1772	1037					1604	for my $interval (@$cases) {
1773							my @next_nfa_indexes
1774	3020					3278	= sort(keys(%{{
	3735					17588
1775	18666					38751	map { ($$_[1] => undef) }
1776	3899					9783	grep { cc_match($$_[0], $$interval[0]) }
1777	3020					4161	map { @{$$_[1]} }
	3899					3453
1778							@nfa_states
1779							}}))
1780							;
1781	3020					8976	my $next_index_key = join('.', @next_nfa_indexes);
1782	3020	100				7160	if (!exists($dfa_indexes{$next_index_key})) {
1783	859					1526	$dfa_indexes{$next_index_key} = $dfa_size++;
1784	859					1465	push(@todo, \@next_nfa_indexes);
1785							}
1786	3020					3029	push(@{$dfa_index_to_intervals{$dfa_indexes{$next_index_key}}},
	3020					10767
1787							$interval
1788							);
1789							}
1790
1791	1037					1713	my @any_ccs;
1792	2855					5603	$$dfa[$dfa_index][1] = [
1793							map {
1794	1037					2681	my $cc = interval_list_to_cc($dfa_index_to_intervals{$_});
1795	2855					4243	push(@any_ccs, $cc);
1796	2855					7361	[$cc, $_ ];
1797							}
1798							sort(keys(%dfa_index_to_intervals))
1799							];
1800	1037	100				2276	if ((my $all_cc = cc_union(@any_ccs)) != $cc_any) {
1801	990					1072	$trap_needed = 1;
1802	990					879	push(@{$$dfa[$dfa_index][1]},
	990					2362
1803							[ cc_neg($all_cc), -1 ]
1804							);
1805							}
1806							}
1807
1808	178	100				409	if ($trap_needed) {
1809	168					319	for (
1810	3735					6604	grep { $$_[1] == -1 }
	999					2279
1811	999					862	map { @{$$_[1]} }
1812							@$dfa
1813							) {
1814	990					1225	$$_[1] = $dfa_size;
1815							}
1816	168					745	$$dfa[$dfa_size] = [0, [[$cc_any, $dfa_size]]];
1817							}
1818
1819	178					833	return $dfa;
1820							}
1821
1822
1823							=item C<dfa_to_min_dfa($in_dfa)>
1824
1825
1826							Computes a minimal deterministic C<$dfa> from the given C<$in_dfa>
1827							(Hopcroft's algorithm).
1828
1829							Note that the given C<$in_dfa> must be a C<$dfa>, as
1830							returned from C<nfa_to_dfa()>, and not a mere C<$nfa>.
1831
1832							Myhill-Nerode theorem: two minimal dfas accepting
1833							the same language are isomorphic (i.e. C<nfa_isomorph()> returns true).
1834
1835							=cut
1836
1837							sub dfa_to_min_dfa {
1838	178			178	1	267	my ($dfa) = @_;
1839	178					277	my @acceptings;
1840							my @non_acceptings;
1841	0					0	my @intervals;
1842	178					470	for my $index (0..$#$dfa) {
1843	1205	100				2179	if ($$dfa[$index][0]) {
1844	245					336	push(@acceptings, $index);
1845							}
1846							else {
1847	960					1131	push(@non_acceptings, $index);
1848							}
1849	1205					1130	push(@intervals, map { @{$$_[0]} } @{$$dfa[$index][1]})
	4013					3424
	4013					8077
	1205					1855
1850							}
1851	178					256	my $partition;
1852	178	100				325	if (@non_acceptings) {
1853	172					375	$partition = [\@non_acceptings, \@acceptings];
1854	172					882	my %todo = (join('.', @non_acceptings) => \@non_acceptings);
1855	172					347	my $cases = interval_cases(\@intervals);
1856	172					661	while (my ($todo_key) = keys(%todo)) {
1857	764					1017	my %indexes = map { ($_ => undef) } @{delete($todo{$todo_key})};
	1786					4236
	764					1776
1858	764					1658	for my $interval (@$cases) {
1859	15626					30673	my %prev_inds = (
1860	229606					234666	map { ($_ => undef) }
1861							grep {
1862	7419					21761	my $i = $_;
1863	796851	100				2052263	grep {
1864	229606					339323	exists($indexes{$$_[1]})
1865							&& cc_match($$_[0], $$interval[0])
1866							}
1867	229606					205777	@{$$dfa[$i][1]}
1868							}
1869							(0..$#$dfa)
1870							);
1871	7419					18473	my $refined_partition;
1872	7419					10048	for my $partition_indexes (@$partition) {
1873	105893					105026	my (@inter, @diff);
1874	105893					136095	for (@$partition_indexes) {
1875	229606	100				338336	if (exists($prev_inds{$_})) {
1876	15626					26582	push(@inter, $_);
1877							}
1878							else {
1879	213980					354311	push(@diff, $_);
1880							}
1881							}
1882	105893	100	100			272952	if (!@inter \|\| !@diff) {
1883	105301					234937	push(@$refined_partition, $partition_indexes);
1884							}
1885							else {
1886	592					1389	push(@$refined_partition, \@inter, \@diff);
1887	592					3354	my $prev_inds_key = join('.', sort(keys(%prev_inds)));
1888	592	50				2005	if ($todo{$prev_inds_key}) {
		100
1889	0					0	delete($todo{$prev_inds_key});
1890	0					0	$todo{join('.', @diff)} = \@diff;
1891	0					0	$todo{join('.', @inter)} = \@inter;
1892							}
1893							elsif (@diff < @inter) {
1894	147					587	$todo{join('.', @diff)} = \@diff;
1895							}
1896							else {
1897	445					1563	$todo{join('.', @inter)} = \@inter;
1898							}
1899							}
1900							}
1901	7419					37875	$partition = $refined_partition;
1902							}
1903							}
1904							}
1905							else {
1906	6					12	$partition = [\@acceptings];
1907							}
1908	178					246	my $state_ind_to_equiv;
1909	178					322	for (grep { @$_ != 1 } @$partition) {
	942					1589
1910	124					677	@$state_ind_to_equiv{@$_[1..$#$_]} = ($$_[0]) x $#$_;
1911							}
1912	178					586	return _nfa_shrink_equiv($dfa, $state_ind_to_equiv);
1913							}
1914
1915
1916							##############################################################################
1917							# $tree
1918							##############################################################################
1919
1920							=back
1921
1922							=head2 Tree
1923
1924							$tree = [ $star, [ $alt_0, $alt_1, ... ] ]
1925							or $char_class # ref($char_class) eq CHAR_CLASS
1926							or undef # accepting nothing
1927							$alt = [ $tree_0, $tree_1, ... ]
1928
1929							A C<$tree> is a hierarchical data structure used as intermediate form for
1930							regular expression generation routines.
1931
1932							Similar to a parse tree, except that the C<$tree>s described here are not the
1933							direct result of the parsing routines (e.g. C<ere_to_nfa>); indeed, the parsing
1934							routines generate a C<$nfa>, which then can be converted to a C<$tree>.
1935
1936							A string is spanned by C<$tree = [$star, [ $alt_0, $alt_1, ... ] ]> if it is
1937							spanned by one of the C<$alt_i> (if C<$star> is false) or a repetition thereof
1938							(if C<$star> is true).
1939
1940							A string is spanned by C<$alt = [ $tree_0, $tree_1, ...]> if it is the
1941							concatenation of C<@substrings>, each C<$substrings[$i]> being spanned by
1942							C<$$alt[$i]>.
1943
1944							=over 4
1945
1946							=item C<nfa_to_tree($nfa)>
1947
1948							Converts a C<$nfa> to a C<$tree>.
1949							Returns C<undef> if the C<$nfa> accepts nothing (not even the empty string).
1950
1951							=cut
1952
1953							sub nfa_to_tree {
1954	107			107	1	175	my ($nfa) = @_;
1955
1956							# Warshall algorithm (Kleene's theorem)
1957							# with preliminary computations:
1958							# - word-paths (unbranched paths) are shrunk
1959							# - unique accepting state is ensured
1960							# - branches (with single parent) are skipped
1961
1962	107					169	my $path = {};
1963	107					162	my $path_tr = {};
1964	107					122	my %accepting_state_inds;
1965
1966							# Initialization of the paths
1967
1968	107					313	for my $i (0..$#$nfa) {
1969	605	100				1246	if ($$nfa[$i][0]) {
1970	127					341	$accepting_state_inds{$i} = $i;
1971							}
1972	605					587	for (@{$$nfa[$i][1]}) {
	605					1147
1973	922					3980	$$path{$i}{$$_[1]}
1974							= $$path_tr{$$_[1]}{$i}
1975							= $$_[0];
1976							}
1977							}
1978
1979	107					158	if (TRACE_NFA_TO_TREE) {
1980							print STDERR "before word shrink\n";
1981							for my $i (sort {$a <=> $b} (keys(%$path))) {
1982							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
1983							print STDERR "$i $j: " . cc_to_regex($$path{$i}{$j}) . "\n";
1984							}}
1985							}
1986
1987	107					150	my @tree_list;
1988							my @state_ind_path;
1989
1990							# word-paths (unbranched paths) are shrunk
1991	107					244	for my $first (0..$#$nfa) {
1992	605	100				1282	if (!exists($$path{$first})) { next; }
	190					218
1993	415					396	my @todo = keys(%{$$path{$first}});
	415					1327
1994	415					5048	my %todo_ctrl;
1995	415					2334	while (@todo) {
1996	782					1236	$todo_ctrl{my $i = pop(@todo)} = undef;
1997	782	100	100			837	if (
1998	782					2757	keys(%{$$path_tr{$i}}) != 1
1999							\|\| $i == $first
2000							) {
2001	591					1604	next;
2002							}
2003
2004	191					409	my @tree_list = ($$path{$first}{$i});
2005	191					279	my @state_ind_path = ($i);
2006
2007	191		66			200	while (
2008	318					968	keys(%{$$path{$i}}) == 1
	180					694
2009							&& (my $j = (keys(%{$$path{$i}}))[0]) != $first
2010							) {
2011	180					317	push(@tree_list, $$path{$i}{$j});
2012	180					229	push(@state_ind_path, $i = $j);
2013	180	100				206	if (keys(%{$$path_tr{$j}}) != 1) {
	180					547
2014	53					94	last;
2015							}
2016							}
2017
2018	191					219	if (TRACE_NFA_TO_TREE) {
2019							print STDERR "first, state_ind_path: $first, @state_ind_path\n";
2020							}
2021
2022	191	100				651	if (@state_ind_path > 1) {
2023
2024	61					65	if (TRACE_NFA_TO_TREE) {
2025							print STDERR "delete head $first -> $state_ind_path[0]\n";
2026							}
2027	61					138	delete($$path{$first}{$state_ind_path[0]});
2028	61					183	for (@state_ind_path[0..$#state_ind_path-1]) {
2029	180					303	delete($$path{$_});
2030	180					291	delete($$path_tr{$_});
2031	180					277	if (TRACE_NFA_TO_TREE) {
2032							print STDERR "delete path $_ -> *\n";
2033							print STDERR "delete path * <- $_\n";
2034							}
2035							}
2036	61					200	delete($$path_tr{$state_ind_path[-1]}{$state_ind_path[-2]});
2037	61	100				154	if (!exists($todo_ctrl{$state_ind_path[-1]})) {
2038	40					65	$todo_ctrl{$state_ind_path[-1]} = undef;
2039	40					67	push(@todo, $state_ind_path[-1]);
2040							}
2041	61					64	if (TRACE_NFA_TO_TREE) {
2042							print STDERR "delete tail $state_ind_path[-1] <- $state_ind_path[-2]\n";
2043							}
2044
2045
2046							# $first -> $last
2047	61					80	my $last = $state_ind_path[-1];
2048	61	100				300	$$path{$first}{$last}
2049							= $$path_tr{$last}{$first}
2050							= exists($$path{$first}{$last})
2051							? tree_alt(
2052							$$path{$first}{$last}
2053							, tree_concat(@tree_list)
2054							)
2055							: tree_concat(@tree_list)
2056							;
2057
2058	61					106	if (TRACE_NFA_TO_TREE) {
2059							print STDERR
2060							"$first -> $last created (first ->last): "
2061							. join('', map {_tree_to_regex($_)} @tree_list) . "\n";
2062							}
2063
2064	61					141	for (0..$#state_ind_path-1) {
2065
2066							# $first -> accepting
2067	180	100				643	if ($accepting_state_inds{
2068							my $state_ind = $state_ind_path[$_]
2069							}) {
2070	30	50				108	$$path{$first}{$state_ind}
2071							= $$path_tr{$state_ind}{$first}
2072							= exists($$path{$first}{$state_ind})
2073							? tree_alt(
2074							$$path{$first}{$state_ind}
2075							, tree_concat(@tree_list[0..$_])
2076							)
2077							: tree_concat(@tree_list[0..$_])
2078							;
2079	30					191	if (TRACE_NFA_TO_TREE) {
2080							print STDERR
2081							"$first -> $state_ind created (first -> accepting): "
2082							. join('', map {_tree_to_regex($_)} @tree_list[0..$_]) . "\n";
2083							}
2084							}
2085							}
2086							}
2087							}
2088							}
2089
2090	107					160	if (TRACE_NFA_TO_TREE) {
2091							print STDERR "after word shrink\n";
2092							for my $i (sort {$a <=> $b} (keys(%$path))) {
2093							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2094							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2095							}}
2096							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2097							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2098							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2099							}}
2100							}
2101
2102							# unique accepting state is ensured
2103							# (pseudo-unique: the initial state may additionally be accepting)
2104	107					196	my $unique_accepting_state_ind = @$nfa;
2105	107	100	100			338	if (
		100
2106							keys(%accepting_state_inds) == 1
2107							) {
2108	92					220	$unique_accepting_state_ind = (keys(%accepting_state_inds))[0];
2109							}
2110							elsif (
2111							keys(%accepting_state_inds) == 2
2112							&& exists($accepting_state_inds{0})
2113							) {
2114	6					15	$unique_accepting_state_ind
2115	3					11	= (grep {$_} keys(%accepting_state_inds))[0];
2116							}
2117							else {
2118	12					28	$unique_accepting_state_ind = @$nfa;
2119	12					35	for my $to_state_ind (keys(%accepting_state_inds)) {
2120	29					33	for my $from_state_ind (keys(%{$$path_tr{$to_state_ind}})) {
	29					82
2121	50					161	push(
2122	50					49	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2123							, $$path_tr{$to_state_ind}{$from_state_ind}
2124							);
2125							}
2126							}
2127	12					22	for my $from_state_ind (
	12					41
2128							keys(%{$$path_tr{$unique_accepting_state_ind}})
2129							) {
2130	42					133	$$path_tr{$unique_accepting_state_ind}{$from_state_ind}
2131							= $$path{$from_state_ind}{$unique_accepting_state_ind}
2132							= tree_alt(
2133	42					67	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2134							);
2135							}
2136							}
2137
2138	107					135	if (TRACE_NFA_TO_TREE) {
2139							print STDERR "after unique state addition\n";
2140							for my $i (sort {$a <=> $b} (keys(%$path))) {
2141							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2142							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2143							}}
2144							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2145							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2146							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2147							}}
2148							}
2149
2150	107					224	for my $reversed (0, 1) {
2151	214	100				494	my ($tmp_path, $tmp_path_tr)
2152							= $reversed
2153							? ($path_tr, $path)
2154							: ($path, $path_tr)
2155							;
2156
2157							# branches (with single parent) are skipped
2158							my @branch_inds
2159	339					529	= $reversed
2160	664					1002	? sort {$a <=> $b} (keys(%$tmp_path))
2161	214	100				964	: sort {$b <=> $a} (keys(%$tmp_path))
2162							;
2163	214					524	while (@branch_inds) {
2164	924					10855	my $branch = pop(@branch_inds);
2165	924	100	100			4952	if (
			100
			100
2166	510					1809	!exists($$tmp_path{$branch})
2167							# root cannot be un-branched
2168							\|\| $branch == 0
2169							# accepting states cannot be un-branched
2170							\|\| $branch == $unique_accepting_state_ind
2171							# single parent (non-root have one or more parents)
2172							\|\| keys(%{$$tmp_path_tr{$branch}}) != 1
2173							) {
2174	674					1439	next;
2175							}
2176
2177	250					324	if (TRACE_NFA_TO_TREE) {
2178							print STDERR "branch at $branch\n";
2179							}
2180	250					254	my ($parent) = keys(%{$$tmp_path_tr{$branch}}); # single parent
	250					487
2181	250	100	66			899	if (
			66
2182							ref($$tmp_path{$parent}{$branch}) ne CHAR_CLASS
2183							&& (
2184							# starified parent
2185							$$tmp_path{$parent}{$branch}[0]
2186							# parent containing several paths
2187							\|\| @{$$tmp_path{$parent}{$branch}[1]} > 1
2188							)
2189							) {
2190	37					80	next;
2191							}
2192
2193	213					225	my (@children) = keys(%{$$tmp_path{$branch}});
	213					590
2194
2195	213					480	for my $child (@children) {
2196	478	100				2078	$$tmp_path{$parent}{$child}
		100
		100
2197							= $$tmp_path_tr{$child}{$parent}
2198							= exists($$tmp_path{$parent}{$child})
2199							? tree_alt(
2200							$$tmp_path{$parent}{$child}
2201							, tree_concat2(
2202							$reversed
2203							? (
2204							$$tmp_path{$branch}{$child}
2205							, $$tmp_path{$parent}{$branch}
2206							)
2207							: (
2208							$$tmp_path{$parent}{$branch}
2209							, $$tmp_path{$branch}{$child}
2210							)
2211							)
2212							)
2213							: tree_concat2(
2214							$reversed
2215							? (
2216							$$tmp_path{$branch}{$child}
2217							, $$tmp_path{$parent}{$branch}
2218							)
2219							: (
2220							$$tmp_path{$parent}{$branch}
2221							, $$tmp_path{$branch}{$child}
2222							)
2223							)
2224							;
2225	478					1216	delete($$tmp_path_tr{$child}{$branch});
2226
2227	478					733	if (TRACE_NFA_TO_TREE) {
2228							print STDERR
2229							"parent -> branch: "
2230							. tree_dump($$tmp_path{$parent}{$branch}) . "\n";
2231							print STDERR
2232							"branch -> child : "
2233							. tree_dump($$tmp_path{$branch}{$child}) . "\n";
2234							print STDERR
2235							"$parent -> $child created (un-branch): "
2236							. tree_dump($$tmp_path{$parent}{$child})
2237							. ($reversed ? " (reversed)" : "" ) . "\n";
2238							print STDERR
2239							"delete $child <- $branch\n";
2240							}
2241
2242							}
2243	213					395	delete($$tmp_path{$parent}{$branch});
2244	213					578	delete($$tmp_path{$branch});
2245	213					533	delete($$tmp_path_tr{$branch});
2246
2247	213					197	if (TRACE_NFA_TO_TREE) {
2248							print STDERR "delete $parent -> $branch\n";
2249							print STDERR "delete $branch -> *\n";
2250							print STDERR "delete $branch <- *\n";
2251							}
2252
2253	213					648	push(@branch_inds, $parent);
2254							}
2255
2256	214					597	if (TRACE_NFA_TO_TREE) {
2257							print STDERR "after branch skip\n";
2258							for my $i (sort {$a <=> $b} (keys(%$tmp_path))) {
2259							for my $j (sort {$a <=> $b} (keys(%{$$tmp_path{$i}}))) {
2260							if ($reversed) {
2261							print STDERR "$j $i: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2262							}
2263							else {
2264							print STDERR "$i $j: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2265							}
2266							}}
2267							for my $j (sort {$a <=> $b} (keys(%$tmp_path_tr))) {
2268							for my $i (sort {$a <=> $b} (keys(%{$$tmp_path_tr{$j}}))) {
2269							print STDERR
2270							($reversed ? "$i <- $j: " : "$j <- $i:")
2271							. tree_dump($$tmp_path_tr{$j}{$i}) . "\n";
2272							}}
2273							}
2274
2275							}
2276
2277
2278							# starify diagonal
2279	107					271	for (grep { exists($$path{$_}{$_}) } keys(%$path)) {
	204					676
2280	77					275	$$path{$_}{$_}
2281							= $$path_tr{$_}{$_}
2282							= tree_starify($$path{$_}{$_});
2283							}
2284
2285							# Warshall algorithm (Kleene's theorem)
2286	107					370	my %updates;
2287							# starified first
2288							my @ks
2289	178	50				508	= sort {
2290	107					325	exists($$path{$b}{$b}) <=> exists($$path{$a}{$a})
2291							\|\| $a <=> $b
2292							}
2293							keys(%$path)
2294							# note that keys(%$path_tr) are not additionally needed
2295							# case i == k && k == j: nothing to do
2296							# case i != k && k != j: $$path{$k}{$j} must exist
2297							# case i == k && k != j: $$path{$k}{$k} must exist
2298							# case i != k && k == j: $$path{$k}{$k} must exist
2299							;
2300	107					217	for my $k (@ks) {
2301	204					215	for my $i (keys(%{$$path_tr{$k}})) { # i -> k
	204					1069
2302	369					1096	for my $j (keys(%{$$path{$k}})) { # k -> j
	369					987
2303	1004	100	100			2886	if ($i == $k && $k == $j) { next; }
	93					224
2304	911					819	my @trees;
2305	911	100	100			4159	if (
			66
2306							exists($$path{$i}{$j})
2307							&& ($i != $k && $k != $j)
2308							) {
2309	412					668	push(@trees, $$path{$i}{$j});
2310							}
2311	911	100				4004	my $new_tree
		100
		100
2312							= exists($$path{$k}{$k})
2313							? tree_concat(
2314							(
2315							$i != $k
2316							? $$path{$i}{$k}
2317							: ()
2318							)
2319							, $$path{$k}{$k}
2320							, (
2321							$k != $j
2322							? $$path{$k}{$j}
2323							: ()
2324							)
2325							)
2326							: tree_concat2($$path{$i}{$k}, $$path{$k}{$j})
2327							;
2328	911	100				4226	push(@trees, $i == $j ? tree_starify($new_tree) : $new_tree);
2329
2330	911	100				1641	if (@trees == 1) {
2331	499					1523	$updates{$i}{$j} = $trees[0];
2332							}
2333							else {
2334	412					739	$updates{$i}{$j} = tree_alt(@trees);
2335							}
2336							}
2337							}
2338	204					767	for my $i (keys(%updates)) {
2339	289					290	for my $j (keys(%{$updates{$i}})) {
	289					736
2340	911					2266	$$path{$i}{$j} = $$path_tr{$j}{$i} = $updates{$i}{$j};
2341							}
2342							}
2343
2344	204					258	if (TRACE_NFA_TO_TREE) {
2345							my $num_of_updates = map {keys(%{$updates{$_}})} keys(%updates);
2346							print STDERR "k = $k ($num_of_updates updates)\n";
2347							if ($num_of_updates) {
2348							for my $i (sort {$a <=> $b} (keys(%$path))) {
2349							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2350							print STDERR "$i $j: ";
2351							print STDERR tree_dump($$path{$i}{$j}) . "\n";
2352							}}
2353							}
2354							}
2355
2356	204					617	%updates = ();
2357							}
2358
2359	107					141	my $tree;
2360
2361							# initial state is accepting (the empty string is accepted)
2362	107	100				492	if ($$nfa[0][0]) {
2363
2364	43	100				139	my $path_0_0 = exists($$path{0}{0}) ? $$path{0}{0} : $cc_none;
2365
2366	43	100				91	if ($unique_accepting_state_ind == 0) {
2367	38					63	$tree = $path_0_0;
2368							}
2369							else {
2370	5					11	my $path_0_end = $$path{0}{$unique_accepting_state_ind};
2371
2372	5	50	100			40	if (
			66
2373							$path_0_0 == $cc_none
2374							&& ref($path_0_end) ne CHAR_CLASS
2375							&& $$path_0_end[0]
2376							) {
2377							# starified expression e* does not need (\|e*)
2378	0					0	$tree = $path_0_end;
2379							}
2380							else {
2381							# non-starified expression e needs (\|e)
2382	5					8	$tree = tree_alt($path_0_0, $path_0_end);
2383							}
2384							}
2385							}
2386							else {
2387	64					162	$tree = $$path{0}{$unique_accepting_state_ind};
2388							}
2389
2390	107					121	if (TRACE_NFA_TO_TREE) {
2391							print STDERR "tree: " . tree_dump($tree) . "\n";
2392							}
2393
2394	107					296	_tree_factorize_fixes($tree);
2395
2396	107					140	if (TRACE_NFA_TO_TREE) {
2397							print STDERR "tree (after factorization): " . tree_dump($tree) . "\n";
2398							}
2399	107					1140	return $tree;
2400							}
2401
2402
2403							# Recursively (bottom up) factorizes prefixes and suffixes out from
2404							# alternations if at least one of them contains a sub-tree.
2405							#
2406							# Example 1: (ab1cd\|ab2cd\|ab3cd) -> ab(1\|2\|3)cd
2407							# Example 2: (ab1cd\|ab2cd\|ab3cd) remains the same (no sub-tree)
2408							#
2409							# Example 2 does not need to be factorized
2410							# because it can be represented by a drop-down list,
2411							# which is the primary purpose of this module;
2412							# in this case, a factorization may lead to counter-intuitive results,
2413							# like words cut in the middle.
2414							#
2415							# But example 1 (less common) could only be represented as mere free-text
2416							# if the common pre- and suf-fixes were not factorized out,
2417							# thus losing information for the input helper (xxx_to_input_constraints).
2418							#
2419							# This behavior can be changed by setting our $FULL_FACTORIZE_FIXES = 1;
2420							# in this case, Example 2 would produce ab(1\|2\|3)cd.
2421							#
2422							# Modifies $tree in place
2423							#
2424							sub _tree_factorize_fixes {
2425	690			690		915	my ($tree) = @_;
2426	690	100	100			3167	if (
			66
			100
			66
			66
2427	398					1881	!defined($tree)
2428							\|\| ref($tree) eq CHAR_CLASS
2429							\|\| @{$$tree[1]} == 0
2430							\|\| !$FULL_FACTORIZE_FIXES
2431							&& (
2432							@{$$tree[1]} == 1
2433							\|\| !grep { ref($_) ne CHAR_CLASS } map { @$_ } @{$$tree[1]}
2434							)
2435							) {
2436	575					1161	return $tree;
2437							}
2438							else {
2439
2440	115					137	for (grep { grep { ref($_) ne CHAR_CLASS } @$_ } @{$$tree[1]} ) {
	289					367
	875					1586
	115					181
2441	583					980	my $tmp_tree =
2442	161					210	tree_concat(map { _tree_factorize_fixes($_) } @$_)
2443							;
2444	161	100	66			747	if (
			66
2445	160					469	ref($tmp_tree) eq CHAR_CLASS
2446							\|\| $$tmp_tree[0]
2447							\|\| @{$$tmp_tree[1]} > 1
2448							) {
2449	1					4	$_ = [$tmp_tree];
2450							}
2451							else {
2452	160					440	$_ = $$tmp_tree[1][0];
2453							}
2454							}
2455
2456	115					164	my $fst_len = @{$$tree[1][0]};
	115					189
2457	115					156	my ($pre_len, $suf_len) = (0, 0);
2458	115					154	for (1, 0) {
2459	282					410	my ($len_ref, @range)
2460							= $_
2461							? (\$pre_len, (0..$fst_len-1))
2462	230	100				610	: (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
2463							;
2464	230					365	for my $i (@range) {
2465	262	100				257	if (
2466	653	100	66			4011	grep {
2467	262					368	$i >= @$_
2468							\|\| ref($$_[$i]) ne CHAR_CLASS
2469							\|\| $$tree[1][0][$i] != $$_[$i]
2470							}
2471	262					399	@{$$tree[1]}[0..$#{$$tree[1]}]
2472							) {
2473	185					431	last;
2474							}
2475	77					166	$$len_ref++;
2476							}
2477							}
2478	115	100	100			426	if ($pre_len == 0 && $suf_len == 0) {
2479	60					172	return $tree;
2480							}
2481
2482	55					111	my $empty_seen = 0;
2483							my $mid_tree = [
2484							0
2485							, [
2486							map {
2487	120	100				286	if ($pre_len <= $#$_ - $suf_len) {
	55	50				99
2488	89					293	[ @$_[$pre_len..$#$_-$suf_len] ];
2489							}
2490							elsif (!$empty_seen++) {
2491	31					61	[];
2492							}
2493							else {
2494	0					0	();
2495							}
2496							}
2497	55					74	@{$$tree[1]}
2498							]
2499							];
2500	55					94	$$tree[1] = [[
2501	55					141	@{$$tree[1][0]}[0..$pre_len-1]
2502	55					149	, $empty_seen == @{$$tree[1]} ? () : $mid_tree
2503	55	100				94	, @{$$tree[1][0]}[$fst_len-$suf_len..$fst_len-1]
2504							]];
2505	55					187	return $tree;
2506							}
2507							}
2508
2509							=item C<tree_to_regex($tree, $to_perlre)>
2510
2511							Converts a C<$tree> to an C<$ere> (if C<$to_perlre> is false)
2512							or to a C<$perlre> (if C<$to_perlre> is true).
2513
2514							=cut
2515
2516							sub tree_to_regex {
2517	102	100		102	1	312	my $re = defined($_[0]) ? &_tree_to_regex : '$.';
2518	102	100				867	return $_[1] ? qr/\A$re\z/ms : "^$re\$";
2519							}
2520
2521							{
2522							my %cc_to_regex_cache;
2523
2524							sub _tree_to_regex {
2525	657			657		969	my ($tree, $to_perlre) = (@_, 0);
2526	657	100	100			1083	if (ref($tree) eq CHAR_CLASS) {
	636	50				1245
		100
2527							return
2528	21		100			123	$cc_to_regex_cache{$tree.$to_perlre}
2529							\|\|= cc_to_regex($tree, $to_perlre)
2530							;
2531							}
2532	636					1865	elsif (@{$$tree[1]} == 0) {
2533	0					0	return '';
2534							}
2535							elsif (
2536	365					1173	@{$$tree[1]} == 1 # single alteration
2537							&& @{$$tree[1][0]} == 1 # single atom
2538							) {
2539	185					329	my $atom = $$tree[1][0][0];
2540	185	50				331	if (ref($atom) eq CHAR_CLASS) {
2541	185	100	100			1142	return join('',
2542							$cc_to_regex_cache{$atom.$to_perlre}
2543							\|\|= cc_to_regex($atom, $to_perlre)
2544							, $$tree[0] ? '*' : ()
2545							);
2546							}
2547							else {
2548	0					0	return _tree_to_regex([$$tree[0], $$atom[1]], $to_perlre);
2549							}
2550							}
2551							else {
2552							my $needs_parenthesis
2553							= @{$$tree[1]} > 1 # (a\|...)
2554	451		66			431	\|\| $$tree[0] && @{$$tree[1][0]} > 1 # (ab...)*
2555							;
2556
2557	1947	100	100			9092	return join(''
2558							, ($needs_parenthesis ? ($to_perlre ? '(?:' : '(') : ())
2559							, (
2560							join('\|',
2561							map {
2562	451					700	join('',
2563							map {
2564	804					1095	ref($_) eq CHAR_CLASS
2565							? $cc_to_regex_cache{$_.$to_perlre}
2566							\|\|= cc_to_regex($_, $to_perlre)
2567							: _tree_to_regex($_, $to_perlre)
2568							}
2569							@$_ # alternation
2570							)
2571							}
2572	451	100				975	@{$$tree[1]}
		100
		100
		100
2573							)
2574							)
2575							, ($needs_parenthesis ? ')' : ())
2576							, ($$tree[0] ? '*' : ())
2577							);
2578							}
2579							}
2580							}
2581
2582							# starification (regex)*
2583							sub tree_starify {
2584	173			173	0	214	my ($tree) = @_;
2585	173	100				456	if (ref($tree) eq CHAR_CLASS) {
2586	55					430	return [1, [[$tree]]];
2587							}
2588							else {
2589	118					367	return [1, $$tree[1]];
2590							}
2591							}
2592
2593							# The behavior of tree_concat2 can be altered
2594							# by setting $TREE_CONCAT_FULL_EXPAND = 1;
2595							sub tree_concat2 {
2596	2214			2214	0	16628	my ($tree_0, $tree_1) = @_;
2597	2214					2194	my $concat;
2598
2599							# main criteria:
2600							# CHAR_CLASS
2601							# @{$$tree_n[1]} == 0
2602							# $$tree_n[0]
2603							# @{$$tree_n[1]} == 1
2604
2605	2214	100				3781	if (ref($tree_0) eq CHAR_CLASS) {
	1649	100				4332
		100
		100
2606	565	100				1595	if (@$tree_0 == 0) {
		100
		100
		100
2607	5	100	100			45	if (
2608	3					13	ref($tree_1) ne CHAR_CLASS
2609							&& @{$$tree_1[1]} == 0
2610							) {
2611							# () -> empty
2612	1					2	$concat = $cc_none;
2613							}
2614							else {
2615							# ->
2616	4					7	$concat = $tree_1;
2617							}
2618							}
2619	275					766	elsif (ref($tree_1) eq CHAR_CLASS) {
2620	285	100				458	if (@$tree_1 == 0) {
2621							# a -> a
2622	1					3	$concat = $tree_0;
2623							}
2624							else {
2625							# a b -> (ab)
2626	284					926	$concat = [0, [[ $tree_0, $tree_1 ]]];
2627							}
2628							}
2629							elsif (@{$$tree_1[1]} == 0) {
2630							# a () -> a
2631	1					2	$concat = $tree_0;
2632							}
2633							elsif ($$tree_1[0]) {
2634							# a (b)* -> (a(b)*)
2635	194					534	$concat = [0, [[ $tree_0, $tree_1 ]]];
2636							}
2637							else {
2638	80	100	100			207	if (
2639	247	100				838	$FULL_FACTORIZE_FIXES
2640	145					242	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2641	77					130	map {@$_} @{$$tree_1[1]}
2642							) {
2643							# a (bc\|de) -> (a(bc\|de))
2644							# one of bcde is starified
2645	11					37	$concat = [0, [[ $tree_0, $tree_1 ]]];
2646							}
2647							else {
2648							# a (bc\|de) -> (abc\|ade)
2649							# none of bcde is starified
2650	127					673	$concat = [
2651							0
2652	69					99	, [ map { [ $tree_0, @$_ ] } @{$$tree_1[1]} ]
	69					133
2653							];
2654							}
2655							}
2656							}
2657							elsif (@{$$tree_0[1]} == 0) {
2658	5	100	100			19	if (
2659	3					12	ref($tree_1) ne CHAR_CLASS
2660							&& @{$$tree_1[1]} == 0
2661							) {
2662							# () () -> empty
2663	1					3	$concat = $cc_none;
2664							}
2665							else {
2666							# () ->
2667	4					5	$concat = $tree_1;
2668							}
2669							}
2670	1475					2528	elsif ($$tree_0[0]) {
2671	169	100				299	if (ref($tree_1) eq CHAR_CLASS) {
	122	100				301
		100
		100
2672	47	100				156	if (@$tree_1 == 0) {
2673							# (a)* -> (a)*
2674	1					1	$concat = $tree_0;
2675							}
2676							else {
2677							# (a)* b -> ((a)*b)
2678	46					127	$concat = [0, [[ $tree_0, $tree_1 ]]];
2679							}
2680							}
2681							elsif (@{$$tree_1[1]} == 0) {
2682							# (a)* () -> (a)*
2683	1					2	$concat = $tree_0;
2684							}
2685	120					215	elsif ($$tree_1[0]) {
2686							# (a)* (b)* -> ((a)*(b)*)
2687	1					4	$concat = [0, [[ $tree_0, $tree_1 ]]];
2688							}
2689							elsif (@{$$tree_1[1]} == 1) {
2690							# (a)* (bcd) -> ((a)*bcd)
2691	66					224	$concat = [
2692							0
2693	66					75	, [[ $tree_0, @{$$tree_1[1][0]} ]]
2694							];
2695							}
2696							else {
2697							# (a)* (b\|c) -> ((a)*(b\|c))
2698	54					163	$concat = [0, [[ $tree_0, $tree_1 ]]];
2699							}
2700							}
2701							elsif (@{$$tree_0[1]} == 1) {
2702	1106	100				1831	if (ref($tree_1) eq CHAR_CLASS) {
	859	100				1855
		100
		100
		100
2703	247	100				399	if (@$tree_1 == 0) {
2704							# (ab) -> (ab)
2705	1					3	$concat = $tree_0;
2706							}
2707							else {
2708							# (ab) c -> (abc)
2709	246					801	$concat = [
2710							0
2711	246					247	, [[ @{$$tree_0[1][0]}, $tree_1 ]]
2712							];
2713							}
2714							}
2715							elsif (@{$$tree_1[1]} == 0) {
2716							# (ab) () -> (ab)
2717	1					2	$concat = $tree_0;
2718							}
2719	564					1101	elsif ($$tree_1[0]) {
2720							# (ab) (c)* -> (ab(c)*)
2721	294					304	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	294					912
2722							}
2723	896					1856	elsif (@{$$tree_1[1]} == 1) {
2724							# (ab) (cd) -> (abcd)
2725	231					305	$concat = [
2726							0
2727	231					238	, [[ @{$$tree_0[1][0]}, @{$$tree_1[1][0]} ]]
	231					845
2728							];
2729							}
2730							elsif (
2731	333					544	!grep { ref($_) ne CHAR_CLASS } @{$$tree_0[1][0]}
2732							) {
2733	6	50	33			35	if (
2734	18	50				59	$FULL_FACTORIZE_FIXES
2735	12					22	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2736	6					9	map {@$_} @{$$tree_1[1]}
2737							) {
2738							# (ab) (cd\|ef) -> (ab(cd\|ef))
2739							# neither a nor b is a tree
2740							# one of cdef is starified
2741	0					0	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	0					0
2742							}
2743							else {
2744							# (ab) (cd\|ef) -> (abcd\|abef)
2745							# neither a nor b is a tree
2746							# none of cdef is starified
2747	12					47	$concat = [
2748							0
2749	6					8	, [ map { [ @{$$tree_0[1][0]}, @$_ ] } @{$$tree_1[1]} ]
	12					12
	6					10
2750							];
2751							}
2752							}
2753							else {
2754							# (ab) (cd\|ef) -> (ab(cd\|ef))
2755							# a or b is a tree
2756	327					431	$concat = [0, [[@{$$tree_0[1][0]} , $tree_1 ]]];
	327					1078
2757							}
2758							}
2759							else {
2760	369	100				628	if (ref($tree_1) eq CHAR_CLASS) {
	330	50				803
		100
		100
		50
2761	39	50				82	if (@$tree_1 == 0) {
2762							# (ab\|cd) -> (ab\|cd)
2763	0					0	$concat = $tree_0;
2764							}
2765							else {
2766	39	100	100			121	if (
2767	226	100				633	$FULL_FACTORIZE_FIXES
2768	91					235	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2769	36					71	map {@$_} @{$$tree_0[1]}
2770							) {
2771							# (ab\|cd) e -> ((ab\|cd)e)
2772							# one of abcd is starified
2773	6					15	$concat = [0, [[ $tree_0, $tree_1 ]]];
2774							}
2775							else {
2776							# (ab\|cd) e -> (abe\|cde)
2777							# none of abcd is starified
2778	85					294	$concat = [
2779							0
2780	33					42	, [ map { [@$_, $tree_1] } @{$$tree_0[1]} ]
	33					55
2781							];
2782							}
2783							}
2784							}
2785							elsif (@{$$tree_1[1]} == 0) {
2786							# (ab\|cd) () -> (ab\|cd)
2787	0					0	$concat = $tree_0;
2788							}
2789	80					259	elsif ($$tree_1[0]) {
2790							# (ab\|cd) (e)* -> ((ab\|cd)(e)*)
2791	250					654	$concat = [0, [[ $tree_0, $tree_1 ]]];
2792							}
2793							elsif (
2794							@{$$tree_1[1]} == 1
2795							) {
2796	8	100				9	if (!grep { ref($_) ne CHAR_CLASS } @{$$tree_1[1][0]}) {
	19					43
	8					17
2797	7	100	66			31	if (
2798	18	100				74	$FULL_FACTORIZE_FIXES
2799	14					25	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2800	7					13	map {@$_} @{$$tree_0[1]}
2801							) {
2802							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2803							# e and f both CHAR_CLASS
2804							# one of abcd is starified
2805	6					8	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	6					34
2806							}
2807							else {
2808							# (ab\|cd) (ef) -> (abef\|cdef)
2809							# e and f both CHAR_CLASS
2810							# none of abcd is starified
2811	2					15	$concat = [
2812							0
2813	1					10	, [ map { [@$_, @{$$tree_1[1][0]}] } @{$$tree_0[1]} ]
	2					3
	1					3
2814							];
2815							}
2816							}
2817							else {
2818							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2819							# e or f is a tree
2820	1					2	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	1					3
2821							}
2822							}
2823							elsif ($TREE_CONCAT_FULL_EXPAND) {
2824							# (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
2825	0					0	$concat = [
2826							0
2827							, [
2828							map {
2829	0					0	my $alt_0 = $_;
2830	0					0	map { [@$alt_0, @$_] }
	0					0
2831	0					0	@{$$tree_1[1]}
2832							}
2833	0					0	@{$$tree_0[1]}
2834							]
2835							];
2836							}
2837							else {
2838							# (ab\|cd) (ef\|gh) -> ((ab\|cd)(ef\|gh))
2839	72					210	$concat = [0, [[ $tree_0, $tree_1 ]]];
2840							}
2841							}
2842	2214					5902	return $concat;
2843							}
2844
2845							# concatenation regex0regex1...
2846							sub tree_concat {
2847	994	50		994	0	2589	if (@_ == 0) {
		100
		50
2848	0					0	return $cc_none; # neutral element: accepting empty string
2849							}
2850	2767					4992	elsif (@_ == 1) {
2851	31					113	return $_[0];
2852							}
2853							elsif (grep {!defined($_)} @_) {
2854	0					0	return undef; # one accepting nothing -> concat accepting nothing
2855							}
2856
2857							# resolve words first
2858	963					968	my @word;
2859							my @trees;
2860	963					1404	for (@_) {
2861	2767	100				4526	if (ref($_) eq CHAR_CLASS) {
2862	752					1272	push(@word, $_);
2863							}
2864							else {
2865	2015	100				4259	if (@word > 1) {
		100
2866	33					81	push(@trees, [0, [[ @word ]] ] );
2867	33					57	@word = ();
2868							}
2869							elsif (@word) {
2870	220					256	push(@trees, $word[0]);
2871	220					294	@word = ();
2872							}
2873	2015					3181	push(@trees, $_);
2874							}
2875							}
2876	963	100				2220	if (@word > 1) {
		100
2877	91					264	push(@trees, [0, [[ @word ]] ] );
2878							}
2879							elsif (@word) {
2880	146					186	push(@trees, $word[0]);
2881							}
2882
2883	963					1287	my $concat = $trees[0];
2884	963					1666	for my $tree (@trees[1..$#trees]) {
2885	1542					2184	$concat = tree_concat2($concat, $tree);
2886							}
2887
2888	963					2212	return $concat;
2889							}
2890
# Alternation regex0|regex1|... of any number of trees.
#
# Arguments: a list of operands, each one a char class, a tree or undef
# (undef operands are ignored).
# Returns:
#   - undef if nothing remains (neutral element: accepting nothing),
#   - the sole atom if the result is one alternation made of one atom,
#   - else a tree whose alternations are those of the operands; the
#     alternations of starified operands are gathered in one starified tree.
sub tree_alt {
    my @plain_alts;      # alternations of non-starified operands
    my @starred_alts;    # alternations of starified operands
    my $accepts_empty;   # an operand without alternations has been seen

    for my $operand (@_) {
        next if !defined($operand);
        if (ref($operand) eq CHAR_CLASS) {
            push(@plain_alts, [$operand]);
            next;
        }
        my $alts = $$operand[1];
        if (!@$alts) {
            $accepts_empty = 1;
        }
        elsif ($$operand[0]) {
            push(@starred_alts, @$alts);
        }
        else {
            push(@plain_alts, @$alts);
        }
    }

    if (@starred_alts) {
        my $starred_tree = [1, \@starred_alts];
        if (!@plain_alts) {
            return $starred_tree;
        }
        # the starified tree becomes the last alternation
        return [0, [ @plain_alts, [$starred_tree] ]];
    }

    if (
        @plain_alts > 1
        || $accepts_empty
        || @plain_alts && @{$plain_alts[0]} > 1
    ) {
        my @empty_alt = $accepts_empty ? ([[0, []]]) : ();
        return [0, [ @plain_alts, @empty_alt ]];
    }
    if (!@plain_alts) {
        return undef; # neutral element: accepting nothing
    }
    # exactly one alternation made of exactly one atom
    return $plain_alts[0][0];
}
2947
2948
# Returns an unanchored $ere having exactly the same structure
# as the given $tree. Intended for tracing/debugging.
#
#   undef                     -> '$.' (nothing accepted, not even the
#                                empty string)
#   char class                -> cc_to_regex($tree)
#   tree without alternations -> '()'
#   other tree                -> '(alt0|alt1|...)', followed by '*' if the
#                                tree is starified
sub tree_dump {
    my ($tree) = @_;
    return '$.' if !defined($tree);
    return cc_to_regex($tree) if ref($tree) eq CHAR_CLASS;

    my $alts = $$tree[1];
    return '()' if !@$alts;

    my @dumped_alts;
    for my $alt (@$alts) {
        # tree_dump itself maps a char class atom to cc_to_regex
        push(@dumped_alts, join('', map { tree_dump($_) } @$alt));
    }
    my $star = $$tree[0] ? '*' : '';
    return '(' . join('|', @dumped_alts) . ')' . $star;
}
2991
2992
2993							##############################################################################
2994							# $input_constraints
2995							##############################################################################
2996
# Marker used in $input_constraints for an input that cannot be offered as
# a drop down (list of words).
use constant FREE_TEXT => 'free text';
3000
3001							=back
3002
3003							=head2 Input constraints
3004
3005							$input_constraints = [ $input_constraint_0, $input_constraint_1, ... ]
3006							$input_constraint = [ 'word_0', 'word_1', ... ] (drop down)
3007							or 'free text' (free text)
3008
3009
3010							=over 4
3011
=item C<tree_to_input_constraints($tree)>

Converts a C<$tree> to a pair C<($input_constraints, $split_perlre)>.

C<$split_perlre> is a compiled perl regular expression splitting a string
according to C<$input_constraints>. This C<$split_perlre> matches if and
only if each drop down can be assigned a value; then
C<$str =~ $split_perlre> in list context returns as many values as
C<@$input_constraints>.

=cut

# Post-processes the raw result of _tree_to_input_constraints:
#   1. each run of adjacent free texts is merged into one free text,
#   2. the words of each drop down are sorted and deduplicated,
#   3. drop downs reduced to the empty word are removed, and each run of
#      single-word drop downs is merged into one single-word drop down,
#   4. an empty result is replaced by one drop down holding the empty word.
# Every merged entry is stored in the slot of the last member of its run.
sub tree_to_input_constraints {
    my ($input_constraints, $perlres) = _tree_to_input_constraints(@_);

    my @kept;

    # concat free texts and stronger underlying regexs
    my @free_run;
    my $close_free_run = sub {
        return if !@free_run;
        my $slot = $free_run[-1];
        if (@free_run > 1) {
            $$perlres[$slot] = join('',
                map { '(?:' . $$perlres[$_] . ')' }
                @free_run
            );
        }
        push(@kept, $slot);
        @free_run = ();
    };
    for my $i (0..$#$input_constraints) {
        if ($$input_constraints[$i] eq FREE_TEXT) {
            push(@free_run, $i);
        }
        else {
            $close_free_run->();
            push(@kept, $i);
        }
    }
    $close_free_run->();
    @$input_constraints = @$input_constraints[@kept];
    @$perlres = @$perlres[@kept];

    # sort words, remove duplicates
    for my $constraint (@$input_constraints) {
        next if $constraint eq FREE_TEXT;
        my %seen = map { ($_ => 1) } @$constraint;
        $constraint = [ sort(keys(%seen)) ];
    }

    # remove empty words
    # concat single words
    my @single_run;
    @kept = ();
    my $close_single_run = sub {
        return if !@single_run;
        # Use the slot of the last single word: the entry just before the
        # current index may be a skipped empty word.
        my $slot = $single_run[-1];
        if (@single_run > 1) {
            my $merged_word = join('',
                map { $$input_constraints[$_][0] }
                @single_run
            );
            $$perlres[$slot] = join('',
                map { $$perlres[$_] }
                @single_run
            );
            # a drop down is a reference to a list of words,
            # even when it holds one word only
            $$input_constraints[$slot] = [$merged_word];
        }
        push(@kept, $slot);
        @single_run = ();
    };
    for my $i (0..$#$input_constraints) {
        my $constraint = $$input_constraints[$i];
        if ($constraint eq FREE_TEXT || @$constraint > 1) {
            $close_single_run->();
            push(@kept, $i);
        }
        elsif (@$constraint == 1 && length($$constraint[0])) {
            push(@single_run, $i);
        }
        # else: nothing but the empty word -> dropped
    }
    $close_single_run->();
    @$input_constraints = @$input_constraints[@kept];
    @$perlres = @$perlres[@kept];

    if (!@$input_constraints) {
        @$input_constraints = (['']);
        @$perlres = ('');
    }

    # one capture group per input constraint
    my $split_perlre
    = join('',
        map {
            $$input_constraints[$_] eq FREE_TEXT
            ? "($$perlres[$_]|.*?)"
            : "($$perlres[$_])"
        }
        (0..$#$perlres)
    )
    ;
    return ($input_constraints, qr/\A$split_perlre\z/ms);
}
3129
{

# maps a char class (keyed by its stringified reference)
# to the result of cc_to_input_constraint
my %cc_to_input_constraint_cache;

# returns ($input_constraints, $perlres)
# two references to arrays of the same size.
#
# $$input_constraints[$i] is either FREE_TEXT or a reference to a list of
# words (drop down); $$perlres[$i] is the perl regular expression (string)
# computed by _tree_to_regex(..., 1) for the same part of the $tree.
# The result is raw: merging of adjacent entries, sorting and removal of
# duplicates are left to the caller.
sub _tree_to_input_constraints {
    my ($tree) = @_;
    my $input_constraints;
    my $perlres;
    if (!defined($tree)) {
        # regex accepting nothing -> free text (always rejected)

        $input_constraints = [FREE_TEXT];
        $perlres = ['$.'];
    }
    elsif (ref($tree) eq CHAR_CLASS) {
        # single character class -> drop down

        $input_constraints = [
            $cc_to_input_constraint_cache{$tree}
            ||= cc_to_input_constraint($tree)
        ];
        $perlres = [_tree_to_regex($tree, 1)];
    }
    elsif (@{$$tree[1]} == 0) {
        # no top-level alternation

        $input_constraints = [['']];
        $perlres = [_tree_to_regex($tree, 1)];
    }
    elsif ($$tree[0]) {
        # starified regex -> free text

        $input_constraints = [FREE_TEXT];
        $perlres = [_tree_to_regex($tree, 1)];
    }
    elsif (@{$$tree[1]} == 1) {
        # single top-level alternation -> mixed results
        # example: ab*c(d|e)f

        $input_constraints = [];
        $perlres = [];

        my $i = 0;
        while ($i != @{$$tree[1][0]}) {
            # expand the longest run of char classes not reaching MAX_CHAR
            # into the list of all words it accepts
            my $beg = $i;
            my @expanded_words = ('');
            my $cc;
            while (
                $i != @{$$tree[1][0]}
                && ref($cc = $$tree[1][0][$i]) eq CHAR_CLASS
                && (!@$cc || $$cc[-1][1] != MAX_CHAR)
            ) {
                my $input_constraint
                = $cc_to_input_constraint_cache{$cc}
                ||= cc_to_input_constraint($cc)
                ;

                @expanded_words
                = map {
                    my $letter = $_;
                    map { $_ . $letter }
                    @expanded_words
                }
                @$input_constraint
                ;
                $i++;
            }
            if ($beg < $i && length($expanded_words[0])) {
                my $wrd_perlre = _tree_to_regex(
                    [
                        0
                      , [[ @{$$tree[1][0]}[$beg..$i-1] ]]
                    ]
                  , 1
                );
                push(@$input_constraints, \@expanded_words);
                push(@$perlres, $wrd_perlre);
            }
            if ($i < @{$$tree[1][0]}) {
                # the atom ending the run: a sub-tree or a char class
                # reaching MAX_CHAR
                my ($sub_input_constraints, $sub_perlres)
                = _tree_to_input_constraints($$tree[1][0][$i]);
                if (
                    @$sub_input_constraints
                    && (
                        $$sub_input_constraints[0] eq FREE_TEXT
                        || length($$sub_input_constraints[0][0])
                    )
                ) {
                    push(@$input_constraints, @$sub_input_constraints);
                    push(@$perlres, @$sub_perlres);
                }
                $i++;
            }
        }
    }
    else {
        # multiple top-level alternations

        if (
            grep { grep {
                ref($_) ne CHAR_CLASS
                || (@$_ && $$_[$#$_][1] == MAX_CHAR)
            } @$_ }
            @{$$tree[1]}
        ) {
            # some alternation contains a sub-tree -> mixed results
            # example: abd|ab*d
            # common pre/suf-fixes are factorized out
            # example: a(bd|b*)d

            my $fst_len = @{$$tree[1][0]};
            my ($pre_len, $suf_len) = (0, 0);
            for (1, 0) {
                # first pass: common prefix (indexes 0, 1, ...)
                # second pass: common suffix (indexes -1, -2, ...)
                my ($len_ref, @range)
                = $_
                ? (\$pre_len, (0..$fst_len-1))
                : (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
                ;
                for my $i (@range) {
                    if (
                        grep {
                            $i >= @$_
                            # the suffix must not overlap the common prefix
                            # in any alternation, else an atom of a short
                            # alternation would be counted twice
                            || ($i < 0 && -$i > scalar(@$_) - $pre_len)
                            || ref($$_[$i]) ne CHAR_CLASS
                            # char classes are compared by reference
                            || $$tree[1][0][$i] != $$_[$i]
                        }
                        @{$$tree[1]}[0..$#{$$tree[1]}]
                    ) {
                        last;
                    }
                    $$len_ref++;
                }
            }
            if ($pre_len) {
                my ($pre_input_constraints, $pre_perlres)
                = _tree_to_input_constraints(
                    [
                        0
                      , [[ @{$$tree[1][0]}[0..$pre_len-1] ]]
                    ]
                );
                push(@$input_constraints, @$pre_input_constraints);
                push(@$perlres, @$pre_perlres);
            }

            if (
                my @mid_alts
                = map { [ @$_[$pre_len..$#$_-$suf_len] ] }
                @{$$tree[1]}
            ) {
                push(@$input_constraints, FREE_TEXT);
                push(@$perlres, _tree_to_regex([ 0, \@mid_alts ] , 1));
            }

            if ($suf_len) {
                my ($suf_input_constraints, $suf_perlres)
                = _tree_to_input_constraints(
                    [
                        0
                      , [[
                            @{$$tree[1][0]}
                            [$fst_len-$suf_len..$fst_len-1]
                        ]]
                    ]
                );
                push(@$input_constraints, @$suf_input_constraints);
                push(@$perlres, @$suf_perlres);
            }
        }
        else {
            # each alternation contains only non negated char classes
            # -> drop down

            $perlres = [_tree_to_regex($tree, 1)];
            for my $word (@{$$tree[1]}) {
                my @expanded_words = ('');
                for my $input_constraint (
                    map {
                        $cc_to_input_constraint_cache{$_}
                        ||= cc_to_input_constraint($_);
                    }
                    @$word
                ) {
                    if (@$input_constraint == 1) {
                        # single letter: append it in place
                        for (@expanded_words) {
                            $_ .= $$input_constraint[0];
                        }
                    }
                    else {
                        @expanded_words
                        = map {
                            my $letter = $_;
                            map { $_ . $letter }
                            @expanded_words
                        }
                        @$input_constraint
                        ;
                    }
                }
                push(@{$$input_constraints[0]}, @expanded_words);
            }
        }
    }
    return ($input_constraints, $perlres);
}
}
3337
# Converts a char class $cc (list of intervals [$first, $last] of character
# codes) to an input constraint:
#   - [''] if $cc has no interval,
#   - FREE_TEXT if the last interval ends at MAX_CHAR,
#   - else a reference to the list of all characters of all intervals.
sub cc_to_input_constraint {
    my ($cc) = @_;
    if (!@$cc) {
        return [''];
    }
    if ($$cc[-1][1] == MAX_CHAR) {
        return FREE_TEXT;
    }

    my @letters;
    for my $interval (@$cc) {
        for my $code ($$interval[0]..$$interval[1]) {
            push(@letters, chr($code));
        }
    }
    return \@letters;
}
3353
3354
3355							##############################################################################
3356							# $ere
3357							##############################################################################
3358
3359							=back
3360
3361							=head2 Ere
3362
3363							An C<$ere> is a perl string.
3364
3365							The syntax an C<$ere> is assumed to follow is based on POSIX ERE
3366							(else the C<ere_to_*> routines will C<die>).
3367
3368							Unsupported POSIX features:
3369							back-references,
3370							equivalence classes C<[[=a=]]>,
3371							character class C<[[:digit:]]>,
3372							collating symbols C<[[.ch.]]>.
3373
3374							C<)> is always a special character. POSIX says that C<)> is a normal
3375							character if there is no matching C<(>.
3376
3377							There are no escape sequences such as C<\t> for tab or C<\n> for line feed.
3378							POSIX does not specify such escape sequences either.
3379
3380							C<\> before a non-special character is ignored
3381							(except in bracket expressions). POSIX does not allow it.
3382
3383							The empty string is legal in alternations (C<(\|a)> is equivalent to C<(a?)>).
3384							POSIX does not allow it.
3385							The C<(\|a)> form is generated by the C<*_to_regex> routines
3386							(avoiding quantifiers other than C<*>).
3387
3388							C<[a-l-z]> is interpreted as C<([a-l] \| - \| z)> (but it is discouraged to
3389							rely upon this implementation artefact). POSIX says that the interpretation
3390							of this construct is undefined.
3391
3392							In bracket expressions, C<\> is a normal character,
3393							thus C<]> as character must occur first, or second after a C<^>
3394							(POSIX compliant, but possibly surprising for perl programmers).
3395
3396							All Unicode characters supported by perl are allowed as literal characters.
3397
3398							=over 4
3399
=item C<ere_to_nfa($ere)>

Parses an C<$ere> to a C<$nfa>.

WARNING: the parsing routines, in particular C<ere_to_nfa>,
C<die> on syntax errors; thus the caller may want to eval-trap such errors.

=cut

# Arguments:
#   $ere            - the extended regular expression (string)
#   $has_anchor_ref - optional reference to a scalar, set to a true value
#                     iff an anchor other than the very first ^ or the very
#                     last $ has been parsed
# Dies (through parse_die) if $ere cannot be parsed up to its end.
sub ere_to_nfa {
    my ($ere, $has_anchor_ref) = @_;

    # optimize very first and very last anchors
    my $has_beg_anchor = $ere =~ s/^\^+//;

    # A trailing run of $ is an end anchor, except for its first $ when
    # that one is escaped, i.e. preceded by an odd number of backslashes:
    # 'a\$' ends with a literal dollar and must be left untouched, while
    # 'a\\$' ends with a literal backslash followed by an anchor.
    my $has_end_anchor = 0;
    if ($ere =~ / (\\*) (\$+) \z /xms) {
        my $anchor_count = length($2) - (length($1) % 2);
        if ($anchor_count) {
            substr($ere, -$anchor_count) = '';
            $has_end_anchor = 1;
        }
    }

    $$has_anchor_ref = 0;
    my @alternation_nfas;
    do {
        push(@alternation_nfas, parse_alternation(\$ere, $has_anchor_ref));
    } while ($ere =~ /\G \| /xmsgc);

    if ((pos($ere) || 0) != length($ere)) {
        parse_die("unexpected character", \$ere);
    }

    # builds a fresh one-state nfa looping on $cc_any (.*)
    my $any_string = sub { return [[1, [[$cc_any, 0]]]] };

    my $nfa;
    if (!$has_beg_anchor && !$has_end_anchor) {
        # a|b|c => ^.*(a|b|c).*$

        $nfa = nfa_concat(
            $any_string->()
          , @alternation_nfas == 1
            ? $alternation_nfas[0]
            : nfa_union(@alternation_nfas)
          , $any_string->()
        );
    }
    else {
        # the very first anchor applies to the first alternation only,
        # the very last one to the last alternation only;
        # the alternations in between are anchored on neither side
        for my $index (1..$#alternation_nfas-1) {
            $alternation_nfas[$index] = nfa_concat(
                $any_string->()
              , $alternation_nfas[$index]
              , $any_string->()
            );
        }
        if (!$has_beg_anchor || @alternation_nfas > 1) {
            $alternation_nfas[0] = nfa_concat(
                !$has_beg_anchor ? $any_string->() : ()
              , $alternation_nfas[0]
              , @alternation_nfas > 1 ? $any_string->() : ()
            );
        }
        if (!$has_end_anchor || @alternation_nfas > 1) {
            $alternation_nfas[-1] = nfa_concat(
                @alternation_nfas > 1 ? $any_string->() : ()
              , $alternation_nfas[-1]
              , !$has_end_anchor ? $any_string->() : ()
            );
        }
        $nfa
        = @alternation_nfas == 1
        ? $alternation_nfas[0]
        : nfa_union(@alternation_nfas)
        ;
    }

    return $$has_anchor_ref ? nfa_resolve_anchors($nfa) : $nfa;
}
3469
# Parses alternation|alternation|... starting at pos($$str_ref) and returns
# the nfa of the sole alternation, or else the union of the nfas of all
# alternations. Stops at the first character which is neither part of an
# alternation nor a '|'. $has_anchor_ref is handed over to
# parse_alternation unchanged.
sub _ere_to_nfa {
    my ($str_ref, $has_anchor_ref) = @_;

    my @branch_nfas = (parse_alternation($str_ref, $has_anchor_ref));
    while ($$str_ref =~ /\G \| /xmsgc) {
        push(@branch_nfas, parse_alternation($str_ref, $has_anchor_ref));
    }

    if (@branch_nfas == 1) {
        return $branch_nfas[0];
    }
    return nfa_union(@branch_nfas);
}
3484
# Parses the content of a bracket expression, starting right after the
# opening '[' at pos($$str_ref) and stopping before the closing ']'
# (which is left to the caller). Returns the char class built from the
# parsed intervals, negated if the content begins with '^'.
sub bracket_expression_to_cc {
    my ($str_ref) = @_;
    my $is_negated = $$str_ref =~ /\G \^/xmsgc;
    my @intervals;

    # anything is allowed as first char, in particular ']' and '-'
    if ($$str_ref =~ /\G (.) - ([^]]) /xmsgc) {
        @intervals = ([ord($1), ord($2)]);
    }
    elsif ($$str_ref =~ /\G (.) /xmsgc) {
        @intervals = ([ord($1), ord($1)]);
    }

    # remaining members: ranges first, then single characters
    while (1) {
        if ($$str_ref =~ /\G ([^]]) - ([^]]) /xmsgc) {
            push(@intervals, [ord($1), ord($2)]);
        }
        elsif ($$str_ref =~ /\G ([^]]) /xmsgc) {
            push(@intervals, [ord($1), ord($1)]);
        }
        else {
            last;
        }
    }

    if ($is_negated) {
        return cc_neg(interval_list_to_cc(\@intervals));
    }
    return interval_list_to_cc(\@intervals);
}
3517
# Parses a quantifier (*, +, ? or {min}, {min,}, {min,max}) at
# pos($$str_ref).
# Returns:
# - the empty list iff no quantification has been parsed
# - else a 2-tuple ($min, $max) where
#   either $max is the empty string (no upper bound)
#   or $min <= $max
# Dies (through parse_die) on a malformed {...} quantifier.
sub parse_quant {
    my ($str_ref) = @_;

    # single-character quantifiers
    return (0, '') if $$str_ref =~ /\G \* /xmsgc;
    return (1, '') if $$str_ref =~ /\G \+ /xmsgc;
    return (0, 1)  if $$str_ref =~ /\G \? /xmsgc;

    # anything else but an opening brace is not a quantifier
    return if $$str_ref !~ /\G \{ /xmsgc;

    my ($min, $max);
    if ($$str_ref =~ /\G ( [0-9]+ ) /xmsgc) {
        $min = $1;
        # without comma: exactly $min times; $max may be '' after a comma
        $max = $$str_ref =~ /\G , ([0-9]*) /xmsgc ? $1 : $min;
        if (length($max) && $min > $max) {
            parse_die("$min > $max", $str_ref);
        }
    }
    else {
        parse_die('number expected', $str_ref);
    }

    if ($$str_ref !~ /\G \} /xmsgc) {
        parse_die('} expected', $str_ref);
    }
    return ($min, $max);
}
3561
{
# maps a literal character to the result of char_to_cc
my %char_to_cc_cache;

# Parses one alternation (a concatenation of atoms, quantified atoms and
# parenthesized groups) starting at pos($$str_ref), and returns its nfa.
# Stops at the first character that cannot continue the alternation.
# Sets $$has_anchor_ref to a true value when a ^ or $ is parsed.
# Dies (through parse_die) on a missing ']' or ')'.
sub parse_alternation {
    my ($str_ref, $has_anchor_ref) = @_;
    my @factor_nfas;

    my $again = 1;
    while ($again) {
        $again = 0;

        # chain of states, one per unquantified atom: state $k has a single
        # transition, on the char class of the atom, to state $k + 1
        my $chain = [];
        my $next_state_index = 1;

        ATOM:
        while (1) {
            my @atom_ccs;
            if ($$str_ref =~ /\G ( $ERE_litteral + ) /xmsogc) {
                @atom_ccs = map {
                    $char_to_cc_cache{$_} ||= char_to_cc($_);
                }
                split('', $1);
            }
            elsif ($$str_ref =~ /\G ( \. + ) /xmsgc) {
                @atom_ccs = ($cc_any) x length($1);
            }
            elsif ($$str_ref =~ /\G ( \[ ) /xmsgc) {
                @atom_ccs = (bracket_expression_to_cc($str_ref));
                if ($$str_ref !~ /\G ] /xmsgc) {
                    parse_die('] expected', $str_ref);
                }
            }
            elsif ($$str_ref =~ /\G \\ (.) /xmsgc) {
                @atom_ccs = ($char_to_cc_cache{$1} ||= char_to_cc($1));
            }
            elsif ($$str_ref =~ /\G \^ /xmsgc) {
                @atom_ccs = ($cc_beg);
                $$has_anchor_ref ||= 1;
            }
            elsif ($$str_ref =~ /\G \$ /xmsgc) {
                @atom_ccs = ($cc_end);
                $$has_anchor_ref ||= 1;
            }
            else {
                last ATOM;
            }

            for my $cc (@atom_ccs) {
                push(@$chain, [ 0, [[ $cc, $next_state_index++ ]]]);
            }
        }

        if (@$chain) {
            if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
                # the quantifier applies to the last atom only
                my ($min, $max) = parse_quant($str_ref);
                my $quantified_cc = $$chain[-1][1][0][0];
                if (@$chain > 1) {
                    # the state of the quantified atom becomes the
                    # accepting state of the preceding atoms
                    @{$$chain[-1]} = (1, []);
                    push(@factor_nfas, $chain);
                }
                push(@factor_nfas, nfa_quant(
                    [ [0, [[$quantified_cc, 1 ]]], [1, []] ]
                  , $min, $max
                ));
                $again = 1;
            }
            else {
                # append the accepting state
                push(@$chain, [1, []]);
                push(@factor_nfas, $chain);
            }
        }

        if ($$str_ref =~ /\G \( /xmsgc) {
            my $group_nfa = _ere_to_nfa($str_ref, $has_anchor_ref);
            if ($$str_ref !~ /\G \) /xmsgc) {
                parse_die(') expected', $str_ref);
            }
            if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
                my ($min, $max) = parse_quant($str_ref);
                push(@factor_nfas, nfa_quant($group_nfa, $min, $max));
            }
            else {
                push(@factor_nfas, $group_nfa);
            }
            $again = 1;
        }
    }

    if (@factor_nfas > 1) {
        return nfa_concat(@factor_nfas);
    }
    if (@factor_nfas) {
        return $factor_nfas[0];
    }
    # nothing parsed: single accepting state without transition
    return [[1, []]];
}
}
3683
# Dies with a "malformed regex" message made of $msg, the current parsing
# position pos($$str_ref) (0 if not set) and the parsed string $$str_ref.
sub parse_die {
    my ($msg, $str_ref) = @_;
    my $position = pos($$str_ref) || 0;
    die("malformed regex: $msg at $position in $$str_ref");
}
3689
3690
3691							##############################################################################
3692							# Shorthands
3693							##############################################################################
3694
3695							=back
3696
3697							=head2 Shorthands
3698
3699							=over 4
3700
=item C<ere_to_tree($ere)>
 := C<nfa_to_tree(ere_to_nfa($ere))>

=cut

# Shorthand: parses $ere to an nfa and converts that nfa to a tree.
sub ere_to_tree {
    my $ere = $_[0];
    return nfa_to_tree(ere_to_nfa($ere));
}
3710
=item C<ere_to_regex($ere, $to_perlre)>
 := C<tree_to_regex(ere_to_tree($ere), $to_perlre)>

=cut

# Shorthand: converts $ere to a tree and that tree to a regex.
# $to_perlre defaults to 0 when it is not passed.
sub ere_to_regex {
    my @args = (@_, 0);
    my $ere = $args[0];
    my $to_perlre = $args[1];
    return tree_to_regex(ere_to_tree($ere), $to_perlre);
}
3720
=item C<nfa_to_regex($nfa, $to_perlre)>
 := C<tree_to_regex(nfa_to_tree($nfa), $to_perlre)>

=cut

# Shorthand: converts $nfa to a tree and that tree to a regex.
# $to_perlre defaults to 0 when it is not passed.
sub nfa_to_regex {
    my @args = (@_, 0);
    my $nfa = $args[0];
    my $to_perlre = $args[1];
    return tree_to_regex(nfa_to_tree($nfa), $to_perlre);
}
3730
=item C<ere_to_input_constraints($ere)>
 := C<tree_to_input_constraints(ere_to_tree($ere))>

=cut

# Shorthand: converts $ere to a tree and that tree to input constraints.
sub ere_to_input_constraints {
    my $ere = $_[0];
    return tree_to_input_constraints(ere_to_tree($ere));
}
3740
=item C<nfa_to_input_constraints($nfa)>
 := C<tree_to_input_constraints(nfa_to_tree($nfa))>

=cut

# Shorthand: converts $nfa to a tree and that tree to input constraints.
sub nfa_to_input_constraints {
    my $nfa = $_[0];
    return tree_to_input_constraints(nfa_to_tree($nfa));
}
3750
=item C<nfa_to_min_dfa($nfa)>
 := C<dfa_to_min_dfa(nfa_to_dfa($nfa))>

=cut

# Shorthand: converts $nfa to a dfa and minimizes that dfa.
sub nfa_to_min_dfa {
    my $nfa = $_[0];
    return dfa_to_min_dfa(nfa_to_dfa($nfa));
}
3760
3761							1;
3762
3763							=back
3764
3765							=head1 AUTHOR
3766
3767							Loïc Jonas Etienne
3768
3769							=head1 COPYRIGHT and LICENSE
3770
3771							Artistic License 2.0
3772							http://www.perlfoundation.org/artistic_license_2_0