File Coverage

blib/lib/Regexp/Assemble.pm

Criterion	Covered	Total	%
statement	1140	1155	98.7
branch	777	796	97.6
condition	164	177	92.6
subroutine	97	98	98.9
pod	49	49	100.0
total	2227	2275	97.8

line	stmt	bran	cond	sub	pod	time	code
1							# Regexp::Assemple.pm
2							#
3							# Copyright (c) 2004-2011 David Landgren
4							# All rights reserved
5
6							package Regexp::Assemble;
7
8	11			11		22265	use strict;
	11					12
	11					274
9	11			11		35	use warnings;
	11					15
	11					297
10
11	11			11		39	use constant DEBUG_ADD => 1;
	11					14
	11					869
12	11			11		39	use constant DEBUG_TAIL => 2;
	11					8
	11					409
13	11			11		36	use constant DEBUG_LEX => 4;
	11					15
	11					378
14	11			11		33	use constant DEBUG_TIME => 8;
	11					12
	11					441
15
16	11			11		36	use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
	11					14
	11					37377
17
18							# The following patterns were generated with examples/naive.
19
20							$Default_Lexer = qr/(?![[(\\]).(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?\|\\(?:[bABCEGLQUXZ]\|[lu].\|(?:[^\w]\|[aefnrtdDwWsS]\|c.\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}})\|N\{\w+\}\|[Pp](?:\{\w+\}\|.))(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?)\|\[.*?(?
21
22							$Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]\|c.\|[^\w\/{\|}-]\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}}))\|[^\$^])$/;
23
24							# The pattern to return when nothing has been added (and thus not match anything)
25
26							$Always_Fail = "^\\b\0";
27
28							our $VERSION = '0.37';
29
30							# ------------------------------------------------
31
32							sub new {
33	2194			2194	1	1045533	my $class = shift;
34	2194					3305	my %args = @_;
35
36	2194					1690	my $anc;
37	2194					2449	for $anc (qw(word line string)) {
38	6582	100				11015	if (exists $args{"anchor_$anc"}) {
39	135					183	my $val = delete $args{"anchor_$anc"};
40	135					260	for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41	270	100				604	$args{$anchor} = $val unless exists $args{$anchor};
42							}
43							}
44							}
45
46							# anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47	2194	100				3213	if (exists $args{anchor_string_absolute}) {
48	3					4	my $val = delete $args{anchor_string_absolute};
49	3					3	for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50	6	100				14	$args{$anchor} = $val unless exists $args{$anchor};
51							}
52							}
53
54	2194		100			32580	exists $args{$_} or $args{$_} = 0 for qw(
55							anchor_word_begin
56							anchor_word_end
57							anchor_line_begin
58							anchor_line_end
59							anchor_string_begin
60							anchor_string_end
61							anchor_string_end_absolute
62							debug
63							dup_warn
64							indent
65							lookahead
66							mutable
67							track
68							unroll_plus
69							);
70
71	2194		100			10019	exists $args{$_} or $args{$_} = 1 for qw(
72							fold_meta_pairs
73							reduce
74							chomp
75							);
76
77	2194					4183	@args{qw(re str path)} = (undef, undef, []);
78
79	2194		100			10301	$args{flags} \|\|= delete $args{modifiers} \|\| '';
			100
80	2194	100				3321	$args{lex} = $Current_Lexer if defined $Current_Lexer;
81
82	2194					2838	my $self = bless \%args, $class;
83
84	2194	100				3444	if ($self->_debug(DEBUG_TIME)) {
85	1					4	$self->_init_time_func();
86	1					5	$self->{_begin_time} = $self->{_time_func}->();
87							}
88							$self->{input_record_separator} = delete $self->{rs}
89	2194	100				3216	if exists $self->{rs};
90	2194	100				2884	exists $self->{file} and $self->add_file($self->{file});
91
92	2193					3714	return $self;
93							}
94
95							sub _init_time_func {
96	9			9		10	my $self = shift;
97	9	100				19	return if exists $self->{_time_func};
98
99							# attempt to improve accuracy
100	6	100				13	if (!defined($self->{_use_time_hires})) {
101	5					11	eval {require Time::HiRes};
	5					1251
102	5					2313	$self->{_use_time_hires} = $@;
103							}
104							$self->{_time_func} = length($self->{_use_time_hires}) > 0
105	4			4		10	? sub { time }
106	6	100				25	: \&Time::HiRes::time
107							;
108							}
109
110							sub clone {
111	55			55	1	257	my $self = shift;
112	55					41	my $clone;
113	55					187	my @attr = grep {$_ ne 'path'} keys %$self;
	1314					1117
114	55					94	@{$clone}{@attr} = @{$self}{@attr};
	55					317
	55					117
115	55					107	$clone->{path} = _path_clone($self->_path);
116	55					219	bless $clone, ref($self);
117							}
118
119							sub _fastlex {
120	884			884		654	my $self = shift;
121	884					612	my $record = shift;
122	884					610	my $len = 0;
123	884					784	my @path = ();
124	884					661	my $case = '';
125	884					544	my $qm = '';
126
127	884					749	my $debug = $self->{debug} & DEBUG_LEX;
128	884					626	my $unroll_plus = $self->{unroll_plus};
129
130	884					600	my $token;
131							my $qualifier;
132	884	100				1167	$debug and print "# _lex <$record>\n";
133	884					622	my $modifier = q{(?:[+?]\\??\|\\{(?:\\d+(?:,\d)?\|,\d+)\\}\\??)?};
134	884					1548	my $class_matcher = qr/\[(?:\[:[a-z]+:\]\|\\?.)*?\]/;
135	884					2454	my $paren_matcher = qr/\(.*?(?
136	884					2038	my $misc_matcher = qr/(?:(c)(.)\|(0)(\d{2}))($modifier)/;
137	884					1866	my $regular_matcher = qr/([^\\[(])($modifier)/;
138	884					1094	my $qm_matcher = qr/(\\?.)/;
139
140	884					644	my $matcher = $regular_matcher;
141							{
142	884	100				609	if ($record =~ /\G$matcher/gc) {
	5960	100				18509
		100
		100
143							# neither a \\ nor [ nor ( followed by a modifer
144	3344	100	100			7502	if ($1 eq '\\E') {
		100	66
145	12	100				27	$debug and print "# E\n";
146	12					15	$case = $qm = '';
147	12					9	$matcher = $regular_matcher;
148	12					13	redo;
149							}
150							elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151	5	100				11	$debug and print "# ignore \\L, \\U\n";
152	5					7	redo;
153							}
154	3327					2661	$token = $1;
155	3327	100				3597	$qualifier = defined $2 ? $2 : '';
156	3327	100				3588	$debug and print "# token <$token> <$qualifier>\n";
157	3327	100				2774	if ($qm) {
158	90					76	$token = quotemeta($token);
159	90					99	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
160							}
161							else {
162	3237					3044	$token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163							}
164	3327	100	100			4686	if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165	22	100				44	$1 and $qualifier .= $1;
166	22	100				40	$debug and print " unroll <$token><$token><$qualifier>\n";
167	22	100				59	$case and $token = $case eq 'L' ? lc($token) : uc($token);
		100
168	22					41	push @path, $token, "$token$qualifier";
169							}
170							else {
171	3305	100				3545	$debug and print " clean <$token>\n";
172	3305	100				5639	push @path,
		100
173							$case eq 'L' ? lc($token).$qualifier
174							: $case eq 'U' ? uc($token).$qualifier
175							: $token.$qualifier
176							;
177							}
178	3327					2442	redo;
179							}
180
181							elsif ($record =~ /\G\\/gc) {
182	1680	100				2084	$debug and print "# backslash\n";
183							# backslash
184	1680	100				9908	if ($record =~ /\G([sdwSDW])($modifier)/gc) {
		100
		100
		100
		100
		100
		100
		100
185	443					523	($token, $qualifier) = ($1, $2);
186	443	100				551	$debug and print "# meta <$token> <$qualifier>\n";
187	443	100	100			955	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
188							? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189							: "\\$token$qualifier";
190							}
191							elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192	9	100				27	$debug and print "# x $1\n";
193	9					29	$token = quotemeta(chr(hex($1)));
194	9					10	$qualifier = $2;
195	9	100				23	$debug and print "# cooked <$token>\n";
196	9					25	$token =~ s/^\\([^\w$()*+.?\[\\\]^\|{\/])$/$1/; # } balance
197	9	100				21	$debug and print "# giving <$token>\n";
198	9	100	100			53	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
199							? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200							: "$token$qualifier";
201							}
202							elsif ($record =~ /\GQ/gc) {
203	26	100				47	$debug and print "# Q\n";
204	26					28	$qm = 1;
205	26					26	$matcher = $qm_matcher;
206							}
207							elsif ($record =~ /\G([LU])/gc) {
208	15	100				40	$debug and print "# case $1\n";
209	15					26	$case = $1;
210							}
211							elsif ($record =~ /\GE/gc) {
212	6	100				16	$debug and print "# E\n";
213	6					15	$case = $qm = '';
214	6					6	$matcher = $regular_matcher;
215							}
216							elsif ($record =~ /\G([lu])(.)/gc) {
217	6	100				18	$debug and print "# case $1 to <$2>\n";
218	6	100				21	push @path, $1 eq 'l' ? lc($2) : uc($2);
219							}
220	30					37	elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221	6	50				14	if ($] < 5.007) {
222	0					0	my $len = 0;
223	0					0	$len += length($_) for @arg;
224	0	0				0	$debug and print "# pos ", pos($record), " fixup add $len\n";
225	0					0	pos($record) = pos($record) + $len;
226							}
227	6					6	my $directive = shift @arg;
228	6	100				36	if ($directive eq 'c') {
229	3	100				11	$debug and print "# ctrl <@arg>\n";
230	3					9	push @path, "\\c" . uc(shift @arg);
231							}
232							else { # elsif ($directive eq '0') {
233	3	100				10	$debug and print "# octal <@arg>\n";
234	3					5	my $ascii = oct(shift @arg);
235	3	100				11	push @path, ($ascii < 32)
236							? "\\c" . chr($ascii+64)
237							: chr($ascii)
238							;
239							}
240	6					9	$path[-1] .= join( '', @arg ); # if @arg;
241	6					10	redo;
242							}
243							elsif ($record =~ /\G(.)/gc) {
244	1165					1438	$token = $1;
245	1165					1716	$token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@\|/^]}{\\$token};
246	1165	100				1462	$debug and print "# meta <$token>\n";
247	1165					1339	push @path, $token;
248							}
249							else {
250	4	100				11	$debug and print "# ignore char at ", pos($record), " of <$record>\n";
251							}
252	1674					1767	redo;
253							}
254
255							elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256							# [class] followed by a modifer
257	39					64	my $class = $1;
258	39	50				68	my $qualifier = defined $2 ? $2 : '';
259	39	100				83	$debug and print "# class begin <$class> <$qualifier>\n";
260	39	100				98	if ($class =~ /\A\[\\?(.)]\Z/) {
261	12					20	$class = quotemeta $1;
262	12					21	$class =~ s{\A\\([!@%])\Z}{$1};
263	12	100				21	$debug and print "# class unwrap $class\n";
264							}
265	39	100				60	$debug and print "# class end <$class> <$qualifier>\n";
266	39	100	100			128	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
267							? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268							: "$class$qualifier";
269	39					52	redo;
270							}
271
272							elsif ($record =~ /\G($paren_matcher)/gc) {
273	13	100				23	$debug and print "# paren <$1>\n";
274							# (paren) followed by a modifer
275	13					24	push @path, $1;
276	13					16	redo;
277							}
278
279							}
280	884					2579	return \@path;
281							}
282
283							sub _lex {
284	211			211		224	my $self = shift;
285	211					176	my $record = shift;
286	211					146	my $len = 0;
287	211					189	my @path = ();
288	211					146	my $case = '';
289	211					152	my $qm = '';
290							my $re = defined $self->{lex} ? $self->{lex}
291	211	50				335	: defined $Current_Lexer ? $Current_Lexer
		100
292							: $Default_Lexer;
293	211					178	my $debug = $self->{debug} & DEBUG_LEX;
294	211	100				336	$debug and print "# _lex <$record>\n";
295	211					149	my ($token, $next_token, $diff, $token_len);
296	211					2320	while( $record =~ /($re)/g ) {
297	480					583	$token = $1;
298	480					331	$token_len = length($token);
299	480	100				789	$debug and print "# lexed <$token> len=$token_len\n";
300	480	100				664	if( pos($record) - $len > $token_len ) {
301	15					13	$next_token = $token;
302	15					20	$token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303	15	100				35	$debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304	15					42	$len += $diff;
305							}
306	480					324	$len += $token_len;
307							TOKEN: {
308	480	100				315	if( substr( $token, 0, 1 ) eq '\\' ) {
	495					671
309	226	100				610	if( $token =~ /^\\([ELQU])$/ ) {
		100
		100
310	51	100				104	if( $1 eq 'E' ) {
		100
311							$qm and $re = defined $self->{lex} ? $self->{lex}
312	12	50				33	: defined $Current_Lexer ? $Current_Lexer
		100
		100
313							: $Default_Lexer;
314	12					17	$case = $qm = '';
315							}
316							elsif( $1 eq 'Q' ) {
317	19					19	$qm = $1;
318							# switch to a more precise lexer to quotemeta individual characters
319	19					49	$re = qr/\\?./;
320							}
321							else {
322	20					22	$case = $1;
323							}
324	51	100				101	$debug and print "# state change qm=<$qm> case=<$case>\n";
325	51					358	goto NEXT_TOKEN;
326							}
327							elsif( $token =~ /^\\([lu])(.)$/ ) {
328	3	100				11	$debug and print "# apply case=<$1> to <$2>\n";
329	3	100				10	push @path, $1 eq 'l' ? lc($2) : uc($2);
330	3					22	goto NEXT_TOKEN;
331							}
332							elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333	41					91	$token = quotemeta(chr(hex($1)));
334	41	100				58	$debug and print "# cooked <$token>\n";
335	41					70	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
336	41	100				70	$debug and print "# giving <$token>\n";
337							}
338							else {
339	131					155	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
340	131	100				181	$debug and print "# backslashed <$token>\n";
341							}
342							}
343							else {
344	269	100				329	$case and $token = $case eq 'U' ? uc($token) : lc($token);
		100
345	269	100				352	$qm and $token = quotemeta($token);
346	269	100				362	$token = '\\/' if $token eq '/';
347							}
348							# undo quotemeta's brute-force escapades
349	441	100				519	$qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
350	441	100				577	$debug and print "# <$token> case=<$case> qm=<$qm>\n";
351	441					491	push @path, $token;
352
353							NEXT_TOKEN:
354	495	100				2529	if( defined $next_token ) {
355	15	100				30	$debug and print "# redo <$next_token>\n";
356	15					12	$token = $next_token;
357	15					12	$next_token = undef;
358	15					15	redo TOKEN;
359							}
360							}
361							}
362	211	100				286	if( $len < length($record) ) {
363							# NB: the remainder only arises in the case of degenerate lexer,
364							# and if \Q is operative, the lexer will have been switched to
365							# /\\?./, which means there can never be a remainder, so we
366							# don't have to bother about quotemeta. In other words:
367							# $qm will never be true in this block.
368	7					9	my $remain = substr($record,$len);
369	7	100				12	$case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
		100
370	7	100				22	$debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371	7					8	push @path, $remain;
372							}
373	211	100				283	$debug and print "# _lex out <@path>\n";
374	211					937	return \@path;
375							}
376
377							sub add {
378	1059			1059	1	8089	my $self = shift;
379	1059					740	my $record;
380	1059					1508	my $debug = $self->{debug} & DEBUG_LEX;
381	1059					1928	while( defined( $record = shift @_ )) {
382	2263	100				3261	CORE::chomp($record) if $self->{chomp};
383	2263	100	100			3484	next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384	2262	100				2831	$debug and print "# add <$record>\n";
385	2262					2022	$self->{stats_raw} += length $record;
386							my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387	2262	100				7483	? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
		100
388							: [split //, $record]
389							;
390	2262	100	100			4252	next if $self->{filter} and not $self->{filter}->(@$list);
391	2261					2930	$self->_insertr( $list );
392							}
393	1059					1674	return $self;
394							}
395
396							sub add_file {
397	13			13	1	14	my $self = shift;
398	13					8	my $rs;
399							my @file;
400	13	100				23	if (ref($_[0]) eq 'HASH') {
401	6					6	my $arg = shift;
402							$rs = $arg->{rs}
403							\|\| $arg->{input_record_separator}
404							\|\| $self->{input_record_separator}
405	6		66			22	\|\| $/;
406							@file = ref($arg->{file}) eq 'ARRAY'
407	4					7	? @{$arg->{file}}
408	6	100				11	: $arg->{file};
409							}
410							else {
411	7		66			19	$rs = $self->{input_record_separator} \|\| $/;
412	7					10	@file = @_;
413							}
414	13					29	local $/ = $rs;
415	13					9	my $file;
416	13					14	for $file (@file) {
417	15	100				514	open my $fh, '<', $file or do {
418	1					8	require Carp;
419	1					149	Carp::croak("cannot open $file for input: $!");
420							};
421	14					102	while (defined (my $rec = <$fh>)) {
422	55					73	$self->add($rec);
423							}
424	14					90	close $fh;
425							}
426	12					44	return $self;
427							}
428
429							sub insert {
430	3398			3398	1	6941	my $self = shift;
431	3398	100	100			5196	return if $self->{filter} and not $self->{filter}->(@_);
432	3397					5954	$self->_insertr( [@_] );
433	3397					5032	return $self;
434							}
435
436							sub _insertr {
437	7068			7068		4681	my $self = shift;
438	7068		100			15969	my $dup = $self->{stats_dup} \|\| 0;
439	7068					7865	$self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440	7068	100	100			13511	if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
		50
441	7063					5621	++$self->{stats_add};
442	7063	100				4528	$self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
	7063					26398
443							}
444							elsif( $self->{dup_warn} ) {
445	0	0				0	if( ref $self->{dup_warn} eq 'CODE' ) {
446	0					0	$self->{dup_warn}->($self, $_[0]);
447							}
448							else {
449	0					0	my $pattern = join( '', @{$_[0]} );
	0					0
450	0					0	require Carp;
451	0					0	Carp::carp("duplicate pattern added: /$pattern/");
452							}
453							}
454	7068					11456	$self->{str} = $self->{re} = undef;
455							}
456
457							sub lexstr {
458	2			2	1	7	return shift->_lex(shift);
459							}
460
461							sub pre_filter {
462	3			3	1	540	my $self = shift;
463	3					4	my $pre_filter = shift;
464	3	100	100			24	if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465	1					4	require Carp;
466	1					60	Carp::croak("pre_filter method not passed a coderef");
467							}
468	2					3	$self->{pre_filter} = $pre_filter;
469	2					4	return $self;
470							}
471
472
473							sub filter {
474	4			4	1	242	my $self = shift;
475	4					3	my $filter = shift;
476	4	100	100			21	if( defined $filter and ref($filter) ne 'CODE' ) {
477	1					8	require Carp;
478	1					117	Carp::croak("filter method not passed a coderef");
479							}
480	3					5	$self->{filter} = $filter;
481	3					5	return $self;
482							}
483
484							sub as_string {
485	800			800	1	982	my $self = shift;
486	800	100				1228	if( not defined $self->{str} ) {
487	798	100				928	if( $self->{track} ) {
488	8					9	$self->{m} = undef;
489	8					10	$self->{mcount} = 0;
490	8					11	$self->{mlist} = [];
491	8					14	$self->{str} = _re_path_track($self, $self->_path, '', '');
492							}
493							else {
494	790	100	100			3171	$self->_reduce unless ($self->{mutable} or not $self->{reduce});
495	790					812	my $arg = {@_};
496							$arg->{indent} = $self->{indent}
497	790	100	100			2526	if not exists $arg->{indent} and $self->{indent} > 0;
498	790	100	100			1807	if( exists $arg->{indent} and $arg->{indent} > 0 ) {
		100
499	42					38	$arg->{depth} = 0;
500	42					72	$self->{str} = _re_path_pretty($self, $self->_path, $arg);
501							}
502							elsif( $self->{lookahead} ) {
503	35					52	$self->{str} = _re_path_lookahead($self, $self->_path);
504							}
505							else {
506	713					781	$self->{str} = _re_path($self, $self->_path);
507							}
508							}
509	798	100				1369	if (not length $self->{str}) {
510							# explicitly fail to match anything if no pattern was generated
511	9					14	$self->{str} = $Always_Fail;
512							}
513							else {
514							my $begin =
515							$self->{anchor_word_begin} ? '\\b'
516							: $self->{anchor_line_begin} ? '^'
517	789	100				1707	: $self->{anchor_string_begin} ? '\A'
		100
		100
518							: ''
519							;
520							my $end =
521							$self->{anchor_word_end} ? '\\b'
522							: $self->{anchor_line_end} ? '$'
523							: $self->{anchor_string_end} ? '\Z'
524	789	100				1755	: $self->{anchor_string_end_absolute} ? '\z'
		100
		100
		100
525							: ''
526							;
527	789					1150	$self->{str} = "$begin$self->{str}$end";
528							}
529	798	100				1545	$self->{path} = [] unless $self->{mutable};
530							}
531	800					2850	return $self->{str};
532							}
533
534							sub re {
535	122			122	1	880	my $self = shift;
536	122	100				306	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
537	122					424	return $self->{re};
538							}
539
540							use overload '""' => sub {
541	2131			2131		380510	my $self = shift;
542	2131	100				11608	return $self->{re} if $self->{re};
543	489					648	$self->_build_re($self->as_string());
544	489					4033	return $self->{re};
545	11			11		11455	};
	11					8936
	11					102
546
547							sub _build_re {
548	618			618		469	my $self = shift;
549	618					443	my $str = shift;
550	618	100				722	if( $self->{track} ) {
551	11			11		788	use re 'eval';
	11					16
	11					89333
552							$self->{re} = length $self->{flags}
553	8	100				992	? qr/(?$self->{flags}:$str)/
554							: qr/$str/
555							;
556							}
557							else {
558							# how could I not repeat myself?
559							$self->{re} = length $self->{flags}
560	610	100				9172	? qr/(?$self->{flags}:$str)/
561							: qr/$str/
562							;
563							}
564							}
565
566							sub match {
567	29			29	1	3095	my $self = shift;
568	29					23	my $target = shift;
569	29	100				72	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
570	29					37	$self->{m} = undef;
571	29					42	$self->{mvar} = [];
572	29	100				647	if( not $target =~ /$self->{re}/ ) {
573	8					14	$self->{mbegin} = [];
574	8					19	$self->{mend} = [];
575	8					38	return undef;
576							}
577	21	50				45	$self->{m} = $^R if $] >= 5.009005;
578	21					84	$self->{mbegin} = _path_copy([@-]);
579	21					93	$self->{mend} = _path_copy([@+]);
580	21					37	my $n = 0;
581	21					40	for( my $n = 0; $n < @-; ++$n ) {
582	43	100	66			171	push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
	33					178
583							if defined $-[$n] and defined $+[$n];
584							}
585	21	100				40	if( $self->{track} ) {
586	20	50				111	return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587							}
588							else {
589	1					6	return 1;
590							}
591							}
592
593							sub source {
594	4			4	1	247	my $self = shift;
595	4	100				14	return unless $self->{track};
596	3	100				8	defined($_[0]) and return $self->{mlist}[$_[0]];
597	2	100				7	return unless defined $self->{m};
598	1					4	return $self->{mlist}[$self->{m}];
599							}
600
601							sub mbegin {
602	3			3	1	7	my $self = shift;
603	3	100				25	return exists $self->{mbegin} ? $self->{mbegin} : [];
604							}
605
606							sub mend {
607	3			3	1	20	my $self = shift;
608	3	100				18	return exists $self->{mend} ? $self->{mend} : [];
609							}
610
611							sub mvar {
612	19			19	1	24	my $self = shift;
613	19	100				46	return undef unless exists $self->{mvar};
614	18	100				97	return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615							}
616
617							sub capture {
618	5			5	1	12	my $self = shift;
619	5	100				13	if( $self->{mvar} ) {
620	4					5	my @capture = @{$self->{mvar}};
	4					12
621	4					5	shift @capture;
622	4					16	return @capture;
623							}
624	1					3	return ();
625							}
626
627							sub matched {
628	9			9	1	222	my $self = shift;
629	9	100				48	return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630							}
631
632							sub stats_add {
633	2			2	1	8	my $self = shift;
634	2		100			11	return $self->{stats_add} \|\| 0;
635							}
636
637							sub stats_dup {
638	2			2	1	2	my $self = shift;
639	2		100			12	return $self->{stats_dup} \|\| 0;
640							}
641
642							sub stats_raw {
643	2			2	1	3	my $self = shift;
644	2		100			12	return $self->{stats_raw} \|\| 0;
645							}
646
647							sub stats_cooked {
648	2			2	1	4	my $self = shift;
649	2		100			11	return $self->{stats_cooked} \|\| 0;
650							}
651
652							sub stats_length {
653	6			6	1	1665	my $self = shift;
654	6	100	100			39	return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655							}
656
657							sub dup_warn {
658	0			0	1	0	my $self = shift;
659	0	0				0	$self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660	0					0	return $self;
661							}
662
663							sub anchor_word_begin {
664	5			5	1	3	my $self = shift;
665	5	100				10	$self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666	5					8	return $self;
667							}
668
669							sub anchor_word_end {
670	4			4	1	4	my $self = shift;
671	4	100				7	$self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672	4					6	return $self;
673							}
674
675							sub anchor_word {
676	2			2	1	2	my $self = shift;
677	2					2	my $state = shift;
678	2					5	$self->anchor_word_begin($state)->anchor_word_end($state);
679	2					5	return $self;
680							}
681
682							sub anchor_line_begin {
683	4			4	1	3	my $self = shift;
684	4	100				8	$self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685	4					7	return $self;
686							}
687
688							sub anchor_line_end {
689	2			2	1	1	my $self = shift;
690	2	100				5	$self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691	2					1	return $self;
692							}
693
694							sub anchor_line {
695	2			2	1	3	my $self = shift;
696	2					1	my $state = shift;
697	2					4	$self->anchor_line_begin($state)->anchor_line_end($state);
698	2					7	return $self;
699							}
700
701							sub anchor_string_begin {
702	277			277	1	188	my $self = shift;
703	277	100				357	$self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704	277					321	return $self;
705							}
706
707							sub anchor_string_end {
708	276			276	1	166	my $self = shift;
709	276	100				315	$self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710	276					197	return $self;
711							}
712
713							sub anchor_string_end_absolute {
714	3			3	1	3	my $self = shift;
715	3	100				5	$self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716	3					4	return $self;
717							}
718
719							sub anchor_string {
720	274			274	1	193	my $self = shift;
721	274	100				367	my $state = defined($_[0]) ? $_[0] : 1;
722	274					417	$self->anchor_string_begin($state)->anchor_string_end($state);
723	274					578	return $self;
724							}
725
726							sub anchor_string_absolute {
727	2			2	1	1	my $self = shift;
728	2	100				5	my $state = defined($_[0]) ? $_[0] : 1;
729	2					3	$self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730	2					4	return $self;
731							}
732
733							sub debug {
734	605			605	1	1481	my $self = shift;
735	605	100				925	$self->{debug} = defined($_[0]) ? $_[0] : 0;
736	605	100				616	if ($self->_debug(DEBUG_TIME)) {
737							# hmm, debugging time was switched on after instantiation
738	4					28	$self->_init_time_func;
739	4					14	$self->{_begin_time} = $self->{_time_func}->();
740							}
741	605					495	return $self;
742							}
743
744							sub dump {
745	9			9	1	514	return _dump($_[0]->_path);
746							}
747
748							sub chomp {
749	22			22	1	498	my $self = shift;
750	22	100				32	$self->{chomp} = defined($_[0]) ? $_[0] : 1;
751	22					37	return $self;
752							}
753
754							sub fold_meta_pairs {
755	5			5	1	4	my $self = shift;
756	5	100				14	$self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757	5					7	return $self;
758							}
759
760							sub indent {
761	4			4	1	477	my $self = shift;
762	4	100				10	$self->{indent} = defined($_[0]) ? $_[0] : 0;
763	4					8	return $self;
764							}
765
766							sub lookahead {
767	22			22	1	19	my $self = shift;
768	22	100				45	$self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769	22					34	return $self;
770							}
771
772							sub flags {
773	24			24	1	956	my $self = shift;
774	24	100				47	$self->{flags} = defined($_[0]) ? $_[0] : '';
775	24					40	return $self;
776							}
777
778							sub modifiers {
779	4			4	1	936	my $self = shift;
780	4					9	return $self->flags(@_);
781							}
782
783							sub track {
784	5			5	1	922	my $self = shift;
785	5	100				11	$self->{track} = defined($_[0]) ? $_[0] : 1;
786	5					8	return $self;
787							}
788
789							sub unroll_plus {
790	2			2	1	513	my $self = shift;
791	2	100				6	$self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792	2					4	return $self;
793							}
794
795							sub lex {
796	1			1	1	1	my $self = shift;
797	1					39	$self->{lex} = qr($_[0]);
798	1					2	return $self;
799							}
800
801							sub reduce {
802	19			19	1	721	my $self = shift;
803	19	100				50	$self->{reduce} = defined($_[0]) ? $_[0] : 1;
804	19					31	return $self;
805							}
806
807							sub mutable {
808	5			5	1	737	my $self = shift;
809	5	100				16	$self->{mutable} = defined($_[0]) ? $_[0] : 1;
810	5					6	return $self;
811							}
812
813							sub reset {
814							# reinitialise the internal state of the object
815	19			19	1	1402	my $self = shift;
816	19					30	$self->{path} = [];
817	19					33	$self->{re} = undef;
818	19					19	$self->{str} = undef;
819	19					52	return $self;
820							}
821
822							sub Default_Lexer {
823	4	100		4	1	2230	if( $_[0] ) {
824	3	100				13	if( my $refname = ref($_[0]) ) {
825	1					8	require Carp;
826	1					138	Carp::croak("Cannot pass a $refname to Default_Lexer");
827							}
828	2					15	$Current_Lexer = $_[0];
829							}
830	3	100				11	return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831							}
832
833							# --- no user serviceable parts below ---
834
835							# -- debug helpers
836
837							sub _debug {
838	12297			12297		7986	my $self = shift;
839	12297	100				24416	return $self->{debug} & shift() ? 1 : 0;
840							}
841
842							# -- helpers
843
844							sub _path {
845							# access the path
846	10011			10011		15380	return $_[0]->{path};
847							}
848
849							# -- the heart of the matter
850
851							$have_Storable = do {
852							eval {
853							require Storable;
854							import Storable 'dclone';
855							};
856							$@ ? 0 : 1;
857							};
858
859							sub _path_clone {
860	55	100		55		1976	$have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861							}
862
863							sub _path_copy {
864	80			80		403	my $path = shift;
865	80					65	my $new = [];
866	80					136	for( my $p = 0; $p < @$path; ++$p ) {
867	201	100				302	if( ref($path->[$p]) eq 'HASH' ) {
		100
868	9					16	push @$new, _node_copy($path->[$p]);
869							}
870							elsif( ref($path->[$p]) eq 'ARRAY' ) {
871	3					3	push @$new, _path_copy($path->[$p]);
872							}
873							else {
874	189					319	push @$new, $path->[$p];
875							}
876							}
877	80					132	return $new;
878							}
879
880							sub _node_copy {
881	11			11		8	my $node = shift;
882	11					9	my $new = {};
883	11					29	while( my( $k, $v ) = each %$node ) {
884	27	100				44	$new->{$k} = defined($v)
885							? _path_copy($v)
886							: undef
887							;
888							}
889	11					20	return $new;
890							}
891
892							sub _insert_path {
893	7142			7142		5000	my $self = shift;
894	7142					4472	my $list = shift;
895	7142					4332	my $debug = shift;
896	7142					4589	my @in = @{shift()}; # create a new copy
	7142					11832
897	7142	100				9983	if( @$list == 0 ) { # special case the first time
898	2030	100	100			6861	if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
			66
			66
899	33					81	return [{'' => undef}];
900							}
901							else {
902	1997					3256	return \@in;
903							}
904							}
905	5112	100				6219	$debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
	151					207
	151					147
906	5112					3535	my $path = $list;
907	5112					3170	my $offset = 0;
908	5112					3106	my $token;
909	5112	100				6348	if( not @in ) {
910	2	100				6	if( ref($list->[0]) ne 'HASH' ) {
911	1					2	return [ { '' => undef, $list->[0] => $list } ];
912							}
913							else {
914	1					1	$list->[0]{''} = undef;
915	1					2	return $list;
916							}
917							}
918	5110					8098	while( defined( $token = shift @in )) {
919	17713	100				20516	if( ref($token) eq 'HASH' ) {
920	282	100				409	$debug and print "# p0=", _dump($path), "\n";
921	282					492	$path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922	282	100				407	$debug and print "# p1=", _dump($path), "\n";
923	282					274	last;
924							}
925	17431	100				21085	if( ref($path->[$offset]) eq 'HASH' ) {
926	3703	100				4250	$debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
	54					171
927	3703					2637	my $node = $path->[$offset];
928	3703	100				3967	if( exists( $node->{$token} )) {
929	2624	100				2721	if ($offset < $#$path) {
930							my $new = {
931							$token => [$token, @in],
932	1					3	_re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
	1					3
933							};
934	1					3	splice @$path, $offset, @$path-$offset, $new;
935	1					1	last;
936							}
937							else {
938	2623	100				2985	$debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
	31					55
939	2623					1893	$path = $node->{$token};
940	2623					1723	$offset = 0;
941	2623					2047	redo;
942							}
943							}
944							else {
945	1079	100				1303	$debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
	23					32
	23					32
	23					329
946	1079	100				1237	if( $offset == $#$path ) {
947	1072					2084	$node->{$token} = [ $token, @in ];
948							}
949							else {
950							my $new = {
951							_node_key($token) => [ $token, @in ],
952	7					13	_node_key($node) => [@{$path}[$offset..$#{$path}]],
	7					19
	7					10
953							};
954	7					18	splice( @$path, $offset, @$path - $offset, $new );
955	7	100				17	$debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
	1					2
	1					3
956							}
957	1079					1031	last;
958							}
959							}
960
961	13728	100				14830	if( $debug ) {
962	306					226	my $msg = '';
963	306					184	my $n;
964	306					447	for( $n = 0; $n < @$path; ++$n ) {
965	1093	100				1252	$msg .= ' ' if $n;
966							my $atom = ref($path->[$n]) eq 'HASH'
967	1093	100				1225	? '{'.join( ' ', keys(%{$path->[$n]})).'}'
	81					173
968							: $path->[$n]
969							;
970	1093	100				1951	$msg .= $n == $offset ? "<$atom>" : $atom;
971							}
972	306					701	print "# at path ($msg)\n";
973							}
974
975	13728	100				25093	if( $offset >= @$path ) {
		100
		100
976	732					1588	push @$path, { $token => [ $token, @in ], '' => undef };
977	732	100				1005	$debug and print "# added remaining @{[_dump($path)]}\n";
	21					32
978	732					599	last;
979							}
980							elsif( $token ne $path->[$offset] ) {
981	2099	100				2583	$debug and print "# token $token not present\n";
982							splice @$path, $offset, @$path-$offset, {
983							length $token
984							? ( _node_key($token) => [$token, @in])
985							: ( '' => undef )
986							,
987	2099	100				4018	$path->[$offset] => [@{$path}[$offset..$#{$path}]],
	2099					5812
	2099					2253
988							};
989	2099	100				3433	$debug and print "# path=@{[_dump($path)]}\n";
	79					95
990	2099					1792	last;
991							}
992							elsif( not @in ) {
993	917	100				1033	$debug and print "# last token to add\n";
994	917	100				1206	if( defined( $path->[$offset+1] )) {
995	912					665	++$offset;
996	912	100				964	if( ref($path->[$offset]) eq 'HASH' ) {
997	118	100				145	$debug and print "# add sentinel to node\n";
998	118					140	$path->[$offset]{''} = undef;
999							}
1000							else {
1001	794	100				892	$debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002							splice @$path, $offset, @$path-$offset, {
1003							'' => undef,
1004	794					919	$path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
	794					1758
	794					802
1005							};
1006							}
1007							}
1008							else {
1009							# already seen this pattern
1010	5					7	++$self->{stats_dup};
1011							}
1012	917					870	last;
1013							}
1014							# if we get here then @_ still contains a token
1015	9980					12880	++$offset;
1016							}
1017	5110					6727	$list;
1018							}
1019
1020							sub _insert_node {
1021	282			282		234	my $self = shift;
1022	282					208	my $path = shift;
1023	282					195	my $offset = shift;
1024	282					212	my $token = shift;
1025	282					204	my $debug = shift;
1026	282					257	my $path_end = [@{$path}[$offset..$#{$path}]];
	282					385
	282					309
1027							# NB: $path->[$offset] and $[path_end->[0] are equivalent
1028	282					552	my $token_key = _re_path($self, [$token]);
1029	282	100				534	$debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
	26					45
	26					50
1030	26					43	" at path=@{[_dump($path_end)]}\n";
1031	282	100				519	if( ref($path_end->[0]) eq 'HASH' ) {
1032	195	100				459	if( exists($path_end->[0]{$token_key}) ) {
		100
1033	25	100				48	if( @$path_end > 1 ) {
1034	2					6	my $path_key = _re_path($self, [$path_end->[0]]);
1035	2					10	my $new = {
1036							$path_key => [ @$path_end ],
1037							$token_key => [ $token, @_ ],
1038							};
1039	2	100				6	$debug and print "# +bifurcate new=@{[_dump($new)]}\n";
	1					3
1040	2					7	splice( @$path, $offset, @$path_end, $new );
1041							}
1042							else {
1043	23					28	my $old_path = $path_end->[0]{$token_key};
1044	23					29	my $new_path = [];
1045	23		100			92	while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046	30	100				80	$debug and print "# identical nodes in sub_path ",
		100
1047							ref($token) ? _dump($token) : $token, "\n";
1048	30					45	push @$new_path, shift(@$old_path);
1049	30					76	$token = shift @_;
1050							}
1051	23	50				112	if( @$new_path ) {
1052	23					22	my $new;
1053	23					26	my $token_key = $token;
1054	23	100				36	if( @_ ) {
1055	6					11	$new = {
1056							_re_path($self, $old_path) => $old_path,
1057							$token_key => [$token, @_],
1058							};
1059	6	100				14	$debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
	1					4
1060							}
1061							else {
1062	17	100				30	$debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
	5					7
1063	17	100				29	if( @$old_path ) {
1064	11					38	$new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065							}
1066							else {
1067	6					15	$new = { '' => undef, $token => [$token] };
1068							}
1069							}
1070	23					35	push @$new_path, $new;
1071							}
1072	23					38	$path_end->[0]{$token_key} = $new_path;
1073	23	100				37	$debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
	6					11
1074	23					66	splice( @$path, $offset, @$path_end, @$path_end );
1075							}
1076							}
1077							elsif( not _node_eq( $path_end->[0], $token )) {
1078	70	100				105	if( @$path_end > 1 ) {
1079	11					26	my $path_key = _re_path($self, [$path_end->[0]]);
1080	11					44	my $new = {
1081							$path_key => [ @$path_end ],
1082							$token_key => [ $token, @_ ],
1083							};
1084	11	100				27	$debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
	1					3
1085	11					27	splice( @$path, $offset, @$path_end, $new );
1086							}
1087							else {
1088	59	100				142	$debug and print "# next in path is node, trivial insert at $token_key\n";
1089	59					139	$path_end->[0]{$token_key} = [$token, @_];
1090	59					114	splice( @$path, $offset, @$path_end, @$path_end );
1091							}
1092							}
1093							else {
1094	100		100			271	while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095	131	100				283	$debug and print "# identical nodes @{[_dump([$token])]}\n";
	9					24
1096	131					114	shift @$path_end;
1097	131					134	$token = shift @_;
1098	131					252	++$offset;
1099							}
1100	100	100				140	if( @$path_end ) {
1101	57	100				82	$debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
	4					10
	4					7
1102	57					180	$path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103	57	100				102	$debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
	4					12
	4					6
1104	57					99	splice( @$path, $offset, @$path - $offset, @$path_end );
1105	57	100				97	$debug and print "# got final=@{[_dump($path)]}\n";
	4					8
1106							}
1107							else {
1108	43					69	$token_key = _node_key($token);
1109	43					124	my $new = {
1110							'' => undef,
1111							$token_key => [ $token, @_ ],
1112							};
1113	43	100				74	$debug and print "# convert opt @{[_dump($new)]}\n";
	3					6
1114	43					55	push @$path, $new;
1115							}
1116							}
1117							}
1118							else {
1119	87	100				126	if( @$path_end ) {
1120	74					262	my $new = {
1121							$path_end->[0] => [ @$path_end ],
1122							$token_key => [ $token, @_ ],
1123							};
1124	74	100				121	$debug and print "# atom->node @{[_dump($new)]}\n";
	5					10
1125	74					131	splice( @$path, $offset, @$path_end, $new );
1126	74	100				131	$debug and print "# out=@{[_dump($path)]}\n";
	5					8
1127							}
1128							else {
1129	13	100				31	$debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
	4					12
1130	13					42	push @$path, {
1131							'' => undef,
1132							$token_key => [ $token, @_ ],
1133							};
1134							}
1135							}
1136	282					509	$path;
1137							}
1138
1139							sub _reduce {
1140	810			810		654	my $self = shift;
1141	810					1000	my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142
1143	810	100				977	if ($self->_debug(DEBUG_TIME)) {
1144	4					7	$self->_init_time_func;
1145	4					11	my $now = $self->{_time_func}->();
1146	4	100				8	if (exists $self->{_begin_time}) {
1147	3					104	printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148							}
1149							else {
1150	1					41	printf "# load-epoch=%0.6f\n", $now;
1151							}
1152	4					14	$self->{_begin_time} = $self->{_time_func}->();
1153							}
1154
1155	810					974	my ($head, $tail) = _reduce_path( $self->_path, $context );
1156	810	100				1221	$context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157	810	100				957	if( !@$head ) {
1158	651					649	$self->{path} = $tail;
1159							}
1160							else {
1161							$self->{path} = [
1162	159					237	@{_unrev_path( $tail, $context )},
1163	159					123	@{_unrev_path( $head, $context )},
	159					187
1164							];
1165							}
1166
1167	810	100				1334	if ($self->_debug(DEBUG_TIME)) {
1168	4					9	my $now = $self->{_time_func}->();
1169	4	50				7	if (exists $self->{_begin_time}) {
1170	4					102	printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171							}
1172							else {
1173	0					0	printf "# reduce-epoch=%0.6f\n", $now;
1174							}
1175	4					12	$self->{_begin_time} = $self->{_time_func}->();
1176							}
1177
1178	810	100				1104	$context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179	810					1153	return $self;
1180							}
1181
1182							sub _remove_optional {
1183	1710	100		1710		2420	if( exists $_[0]->{''} ) {
1184	350					364	delete $_[0]->{''};
1185	350					350	return 1;
1186							}
1187	1360					1860	return 0;
1188							}
1189
1190							sub _reduce_path {
1191	810			810		749	my ($path, $ctx) = @_;
1192	810					1091	my $indent = ' ' x $ctx->{depth};
1193	810					646	my $debug = $ctx->{debug};
1194	810	100				1074	$debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195	810					636	my $new;
1196	810					714	my $head = [];
1197	810					621	my $tail = [];
1198	810					1474	while( defined( my $p = pop @$path )) {
1199	1159	100				1536	if( ref($p) eq 'HASH' ) {
1200	514					706	my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201	514	100				1102	$debug and print "#$indent\| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202	514	100				822	push @$head, @$node_head if scalar @$node_head;
1203	514	100				2700	push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204							}
1205							else {
1206	645	100				695	if( @$head ) {
1207	125	100				179	$debug and print "#$indent\| push $p leaves @{[_dump($path)]}\n";
	7					12
1208	125					271	push @$tail, $p;
1209							}
1210							else {
1211	520	100				613	$debug and print "#$indent\| unshift $p\n";
1212	520					1214	unshift @$tail, $p;
1213							}
1214							}
1215							}
1216	20					108	$debug and print "#$indent\| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217	810	100				1036	(ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
	18	100				73
1218							"\n";
1219	810	100	100			1955	if( @$tail > 1
			100
1220							and ref($tail->[0]) eq 'HASH'
1221	97					298	and keys %{$tail->[0]} == 2
1222							) {
1223	72					70	my $opt;
1224							my $fixed;
1225	72					61	while( my ($key, $path) = each %{$tail->[0]} ) {
	216					428
1226	144	100				204	$debug and print "#$indent\| scan k=$key p=@{[_dump($path)]}\n";
	14					15
1227	144	100				200	next unless $path;
1228	123	100	100			317	if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229	8					14	$opt = $path->[0];
1230							}
1231							else {
1232	115					125	$fixed = $path;
1233							}
1234							}
1235	72	100				145	if( exists $tail->[0]{''} ) {
1236	21					28	my $path = [@{$tail}[1..$#{$tail}]];
	21					37
	21					25
1237	21					28	$tail = $tail->[0];
1238	21					42	($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239	21					60	$tail = [$tail, @$path];
1240							}
1241							}
1242	810	100				1023	$debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243	810					1083	return ($head, $tail);
1244							}
1245
1246							sub _reduce_node {
1247	1080			1080		937	my ($node, $ctx) = @_;
1248	1080					1193	my $indent = ' ' x $ctx->{depth};
1249	1080					830	my $debug = $ctx->{debug};
1250	1080					1083	my $optional = _remove_optional($node);
1251	1080	100				1480	$debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
	57					66
1252	1080	100	100			1954	if( $optional and scalar keys %$node == 1 ) {
1253	70					78	my $path = (values %$node)[0];
1254	70	100				69	if( not grep { ref($_) eq 'HASH' } @$path ) {
	117					209
1255							# if we have removed an optional, and there is only one path
1256							# left then there is nothing left to compare. Because of the
1257							# optional it cannot participate in any further reductions.
1258							# (unless we test for equality among sub-trees).
1259	61					108	my $result = {
1260							'' => undef,
1261							$path->[0] => $path
1262							};
1263	61	100				90	$debug and print "#$indent\| fast fail @{[_dump($result)]}\n";
	2					4
1264	61					97	return [], $result;
1265							}
1266							}
1267
1268	1019					1073	my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269
1270	1019	100				1871	$debug and print "#$indent\|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
	55					75
	55					81
1271
1272							# We now perform tail reduction on each of the nodes in the reduce
1273							# hash. If we have only one key, we know we will have a successful
1274							# reduction (since everything that was inserted into the node based
1275							# on the value of the last token of each path all mapped to the same
1276							# value).
1277
1278	1019	100	100			4378	if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
			100
1279							# every path shares a common path
1280	506					626	my $path = (values %$reduce)[0];
1281	506					600	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282	506	100				919	$debug and print "#$indent\|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
	50					76
1283	506					1534	return( $common, $tail );
1284							}
1285
1286							# this node resulted in a list of paths, game over
1287	513					643	$ctx->{indent} = $indent;
1288	513					583	return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289							}
1290
1291							sub _reduce_fail {
1292	513			513		450	my( $reduce, $fail, $optional, $ctx ) = @_;
1293	513					400	my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
	513					659
1294	513					377	my %result;
1295	513	100				727	$result{''} = undef if $optional;
1296	513					324	my $p;
1297	513					705	for $p (keys %$reduce) {
1298	1181					861	my $path = $reduce->{$p};
1299	1181	100				1205	if( scalar @$path == 1 ) {
1300	1114					762	$path = $path->[0];
1301	1114	100				1340	$debug and print "#$indent\| -simple opt=$optional unrev @{[_dump($path)]}\n";
	7					10
1302	1114					1094	$path = _unrev_path($path, _descend($ctx) );
1303	1114					1634	$result{_node_key($path->[0])} = $path;
1304							}
1305							else {
1306	67	100				114	$debug and print "#$indent\| _do_reduce(@{[_dump($path)]})\n";
	1					2
1307	67					84	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308							$path = [
1309							(
1310							ref($tail) eq 'HASH'
1311							? _unrev_node($tail, _descend($ctx) )
1312							: _unrev_path($tail, _descend($ctx) )
1313							),
1314	67	100				197	@{_unrev_path($common, _descend($ctx) )}
	67					88
1315							];
1316	67	100				172	$debug and print "#$indent\| +reduced @{[_dump($path)]}\n";
	1					3
1317	67					83	$result{_node_key($path->[0])} = $path;
1318							}
1319							}
1320	513					443	my $f;
1321	513					573	for $f( @$fail ) {
1322	219	100				272	$debug and print "#$indent\| +fail @{[_dump($f)]}\n";
	3					6
1323	219					257	$result{$f->[0]} = $f;
1324							}
1325	513	100				665	$debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
	5					9
1326	513					1533	return ( [], \%result );
1327							}
1328
1329							sub _scan_node {
1330	1019			1019		816	my( $node, $ctx ) = @_;
1331	1019					1111	my $indent = ' ' x $ctx->{depth};
1332	1019					757	my $debug = $ctx->{debug};
1333
1334							# For all the paths in the node, reverse them. If the first token
1335							# of the path is a scalar, push it onto an array in a hash keyed by
1336							# the value of the scalar.
1337							#
1338							# If it is a node, call _reduce_node on this node beforehand. If we
1339							# get back a common head, all of the paths in the subnode shared a
1340							# common tail. We then store the common part and the remaining node
1341							# of paths (which is where the paths diverged from the end and install
1342							# this into the same hash. At this point both the common and the tail
1343							# are in reverse order, just as simple scalar paths are.
1344							#
1345							# On the other hand, if there were no common path returned then all
1346							# the paths of the sub-node diverge at the end character. In this
1347							# case the tail cannot participate in any further reductions and will
1348							# appear in forward order.
1349							#
1350							# certainly the hurgliest function in the whole file :(
1351
1352							# $debug = 1 if $depth >= 8;
1353	1019					730	my @fail;
1354							my %reduce;
1355
1356	0					0	my $n;
1357	1019					1585	for $n(
1358	2743					3943	map { substr($_, index($_, '#')+1) }
1359							sort
1360							map {
1361							join( '\|' =>
1362	9029					9533	scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
	2743					2824
1363							_node_offset($node->{$_}),
1364	2743					1858	scalar @{$node->{$_}},
	2743					7481
1365							)
1366							. "#$_"
1367							}
1368							keys %$node ) {
1369	2743					1773	my( $end, @path ) = reverse @{$node->{$n}};
	2743					4631
1370	2743	100				3429	if( ref($end) ne 'HASH' ) {
1371	2137	100				2597	$debug and print "# $indent\|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
	87					167
1372	2137					1375	push @{$reduce{$end}}, [ $end, @path ];
	2137					5573
1373							}
1374							else {
1375	606	100				840	$debug and print "# $indent\|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376	606					441	my $new_path;
1377							# deal with sing, singing => s(?:ing)?ing
1378	606	100	66			1501	if( keys %$end == 2 and exists $end->{''} ) {
1379	94					115	my ($key, $opt_path) = each %$end;
1380	94	100				164	($key, $opt_path) = each %$end if $key eq '';
1381	94					73	$opt_path = [reverse @{$opt_path}];
	94					134
1382	94	100				137	$debug and print "# $indent\| check=", _dump($opt_path), "\n";
1383	94					259	my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384	94					81	my $head = [];
1385	94					111	my $path = [@path];
1386	94					118	($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387	94	100				221	if( @$head ) {
1388	40					111	$new_path = [ @$head, $slide, @$path ];
1389							}
1390							}
1391	606	100				673	if( $new_path ) {
1392	40	100				58	$debug and print "# $indent\|_scan_node slid=", _dump($new_path), "\n";
1393	40					24	push @{$reduce{$new_path->[0]}}, $new_path;
	40					80
1394							}
1395							else {
1396	566					744	my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397	566	100				1044	if( not @$common ) {
1398	219	100				287	$debug and print "# $indent\| +failed $n\n";
1399	219					488	push @fail, [reverse(@path), $tail];
1400							}
1401							else {
1402	347					473	my $path = [@path];
1403	347	100				501	$debug and print "# $indent\|_scan_node ++recovered common=@{[_dump($common)]} tail=",
	34					45
1404	34					45	_dump($tail), " path=@{[_dump($path)]}\n";
1405	347	100	100			1311	if( ref($tail) eq 'HASH'
1406							and keys %$tail == 2
1407							) {
1408	287	100				462	if( exists $tail->{''} ) {
1409	121					184	($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410							}
1411							}
1412	347	100				296	push @{$reduce{$common->[0]}}, [
	347					1622
1413							@$common,
1414							(ref($tail) eq 'HASH' ? $tail : @$tail ),
1415							@$path
1416							];
1417							}
1418							}
1419							}
1420							}
1421	1019	100				1841	$debug and print
1422	55					140	"# $indent\|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
	55					162
1423	1019					1566	return( \@fail, \%reduce );
1424							}
1425
1426							sub _do_reduce {
1427	573			573		488	my ($path, $ctx) = @_;
1428	573					664	my $indent = ' ' x $ctx->{depth};
1429	573					442	my $debug = $ctx->{debug};
1430	573					1004	my $ra = Regexp::Assemble->new(chomp=>0);
1431	573					823	$ra->debug($debug);
1432	573	100				741	$debug and print "# $indent\| do @{[_dump($path)]}\n";
	51					77
1433	573					1189	$ra->_insertr( $_ ) for
1434							# When nodes come into the picture, we have to be careful
1435							# about how we insert the paths into the assembly.
1436							# Paths with nodes first, then closest node to front
1437							# then shortest path. Merely because if we can control
1438							# order in which paths containing nodes get inserted,
1439							# then we can make a couple of assumptions that simplify
1440							# the code in _insert_node.
1441							sort {
1442	6000					5442	scalar(grep {ref($_) eq 'HASH'} @$a)
1443	1113	50	100			1071	<=> scalar(grep {ref($_) eq 'HASH'} @$b)
	6578					6599
1444							\|\|
1445							_node_offset($b) <=> _node_offset($a)
1446							\|\|
1447							scalar @$a <=> scalar @$b
1448							}
1449							@$path
1450							;
1451	573					767	$path = $ra->_path;
1452	573					527	my $common = [];
1453	573					2467	push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454	573	100				902	my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455	573	100				752	$debug and print "# $indent\| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
	51					76
	51					61
1456	573					1913	return ($common, $tail);
1457							}
1458
1459							sub _node_offset {
1460							# return the offset that the first node is found, or -ve
1461							# optimised for speed
1462	4422			4422		2584	my $nr = @{$_[0]};
	4422					3550
1463	4422					2803	my $atom = -1;
1464	4422		100			26670	ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465	3458					4677	return -1;
1466							}
1467
1468							sub _slide_tail {
1469	240			240		17165	my $head = shift;
1470	240					181	my $tail = shift;
1471	240					179	my $path = shift;
1472	240					165	my $ctx = shift;
1473	240					291	my $indent = ' ' x $ctx->{depth};
1474	240					186	my $debug = $ctx->{debug};
1475	240	100				353	$debug and print "# $indent\| slide in h=", _dump($head),
1476							' t=', _dump($tail), ' p=', _dump($path), "\n";
1477	240					314	my $slide_path = (each %$tail)[-1];
1478	240	100				435	$slide_path = (each %$tail)[-1] unless defined $slide_path;
1479	240	100				365	$debug and print "# $indent\| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480	240		100			829	while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481	154	100				237	$debug and print "# $indent\| slide=tail=$slide_path->[0]\n";
1482	154					150	my $slide = shift @$path;
1483	154					114	shift @$slide_path;
1484	154					167	push @$slide_path, $slide;
1485	154					398	push @$head, $slide;
1486							}
1487	240	100				344	$debug and print "# $indent\| slide path ", _dump($slide_path), "\n";
1488	240					318	my $slide_node = {
1489							'' => undef,
1490							_node_key($slide_path->[0]) => $slide_path,
1491							};
1492	240	100				411	$debug and print "# $indent\| slide out h=", _dump($head),
1493							' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494	240					444	return ($head, $slide_node, $path);
1495							}
1496
1497							sub _unrev_path {
1498	2690			2690		1997	my ($path, $ctx) = @_;
1499	2690					2498	my $indent = ' ' x $ctx->{depth};
1500	2690					1887	my $debug = $ctx->{debug};
1501	2690					1549	my $new;
1502	2690	100				2187	if( not grep { ref($_) } @$path ) {
	6383					7591
1503	2196	100				2575	$debug and print "# ${indent}_unrev path fast ", _dump($path);
1504	2196					2665	$new = [reverse @$path];
1505	2196	100				2582	$debug and print "# -> ", _dump($new), "\n";
1506	2196					2467	return $new;
1507							}
1508	494	100				666	$debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509	494					822	while( defined( my $p = pop @$path )) {
1510	1483	100				3184	push @$new,
		100
1511							ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512							: ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513							: $p
1514							;
1515							}
1516	494	100				613	$debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517	494					534	return $new;
1518							}
1519
1520							sub _unrev_node {
1521	630			630		531	my ($node, $ctx ) = @_;
1522	630					630	my $indent = ' ' x $ctx->{depth};
1523	630					436	my $debug = $ctx->{debug};
1524	630					636	my $optional = _remove_optional($node);
1525	630	100				826	$debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526	630					421	my $new;
1527	630	100				845	$new->{''} = undef if $optional;
1528	630					403	my $n;
1529	630					909	for $n( keys %$node ) {
1530	1167					1213	my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531	1167					1751	$new->{_node_key($path->[0])} = $path;
1532							}
1533	630	100				885	$debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534	630					1766	return $new;
1535							}
1536
1537							sub _node_key {
1538	4653			4653		3856	my $node = shift;
1539	4653	100				5684	return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540	4642	100				10996	return $node unless ref($node) eq 'HASH';
1541	205					162	my $key = '';
1542	205					135	my $k;
1543	205					318	for $k( keys %$node ) {
1544	452	100				553	next if $k eq '';
1545	374	100	100			987	$key = $k if $key eq '' or $key gt $k;
1546							}
1547	205					456	return $key;
1548							}
1549
1550							sub _descend {
1551							# Take a context object, and increase the depth by one.
1552							# By creating a fresh hash each time, we don't have to
1553							# bother adding make-work code to decrease the depth
1554							# when we return from what we called.
1555	6190			6190		4064	my $ctx = shift;
1556	6190					15825	return {%$ctx, depth => $ctx->{depth}+1};
1557							}
1558
1559							#####################################################################
1560
1561							sub _make_class {
1562	650			650		536	my $self = shift;
1563	650					585	my %set = map { ($_,1) } @_;
	1675					2287
1564	650	100				1107	delete $set{'\\d'} if exists $set{'\\w'};
1565	650	100				856	delete $set{'\\D'} if exists $set{'\\W'};
1566							return '.' if exists $set{'.'}
1567							or ($self->{fold_meta_pairs} and (
1568							(exists $set{'\\d'} and exists $set{'\\D'})
1569							or (exists $set{'\\s'} and exists $set{'\\S'})
1570	650	100	66			3377	or (exists $set{'\\w'} and exists $set{'\\W'})
			66
			66
1571							))
1572							;
1573	632					655	for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574	3792	100				4570	if( exists $set{$meta} ) {
1575	28					155	my $re = qr/$meta/;
1576	28					21	my @delete;
1577	28		66			326	$_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578	28	100				79	delete @set{@delete} if @delete;
1579							}
1580							}
1581	632	100				937	return (keys %set)[0] if keys %set == 1;
1582	625					594	for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583	6875	100				8503	exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584							}
1585	625	100				811	my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
	20					36
1586	625	100				667	my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
	7					10
1587	625					1461	my $class = join( '' => sort keys %set );
1588	625	100	100			1240	$class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589	622					2828	return "[$dash$class$caret]";
1590							}
1591
1592							sub _re_sort {
1593	1009		100	1009		3481	return length $b <=> length $a \|\| $a cmp $b
1594							}
1595
1596							sub _combine {
1597	140			140		104	my $self = shift;
1598	140					107	my $type = shift;
1599							# print "c in = @{[_dump(\@_)]}\n";
1600							# my $combine =
1601							return '('
1602							. $type
1603	140					118	. do {
1604	140					91	my( @short, @long );
1605	140	100				160	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	377					1441
1606	140	100				228	if( @short == 1 ) {
		100
1607	31					59	@long = sort _re_sort @long, @short;
1608							}
1609							elsif( @short > 1 ) {
1610							# yucky but true
1611	77					96	my @combine = (_make_class($self, @short), sort _re_sort @long);
1612	77					112	@long = @combine;
1613							}
1614							else {
1615	32					60	@long = sort _re_sort @long;
1616							}
1617	140					326	join( '\|', @long );
1618							}
1619							. ')';
1620							# print "combine <$combine>\n";
1621							# $combine;
1622							}
1623
1624							sub _combine_new {
1625	1738			1738		1333	my $self = shift;
1626	1738					1098	my( @short, @long );
1627	1738	100				1881	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	3244					13702
1628	1738	100	100			5895	if( @short == 1 and @long == 0 ) {
		100	100
1629	365					1384	return $short[0];
1630							}
1631							elsif( @short > 1 and @short == @_ ) {
1632	494					642	return _make_class($self, @short);
1633							}
1634							else {
1635	879	100				3457	return '(?:'
1636							. join( '\|' =>
1637							@short > 1
1638							? ( _make_class($self, @short), sort _re_sort @long)
1639							: ( (sort _re_sort( @long )), @short )
1640							)
1641							. ')';
1642							}
1643							}
1644
1645							sub _re_path {
1646	4772			4772		3214	my $self = shift;
1647							# in shorter assemblies, _re_path() is the second hottest
1648							# routine. after insert(), so make it fast.
1649
1650	4772	100				6182	if ($self->{unroll_plus}) {
1651							# but we can't easily make this blockless
1652	72					55	my @arr = @{$_[0]};
	72					118
1653	72					52	my $str = '';
1654	72					55	my $skip = 0;
1655	72					104	for my $i (0..$#arr) {
1656	127	100	100			983	if (ref($arr[$i]) eq 'ARRAY') {
		100
		100
		100
1657	1					3	$str .= _re_path($self, $arr[$i]);
1658							}
1659							elsif (ref($arr[$i]) eq 'HASH') {
1660							$str .= exists $arr[$i]->{''}
1661							? _combine_new( $self,
1662	7					19	map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
	14					18
	7					15
1663							) . '?'
1664	28	100				41	: _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
	42					60
	21					44
1665							;
1666							}
1667							elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668	7	50				25	$str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669	7					14	++$skip;
1670							}
1671							elsif ($skip) {
1672	7					10	$skip = 0;
1673							}
1674							else {
1675	84					119	$str .= $arr[$i];
1676							}
1677							}
1678	72					166	return $str;
1679							}
1680
1681	4700	50				3620	return join( '', @_ ) unless grep { length ref $_ } @_;
	4700					7211
1682	4700					2932	my $p;
1683							return join '', map {
1684							ref($_) eq '' ? $_
1685	9506	100				16153	: ref($_) eq 'HASH' ? do {
		100
1686							# In the case of a node, see whether there's a '' which
1687							# indicates that the whole thing is optional and thus
1688							# requires a trailing ?
1689							# Unroll the two different paths to avoid the needless
1690							# grep when it isn't necessary.
1691	1710					1090	$p = $_;
1692							exists $_->{''}
1693							? _combine_new( $self,
1694	801					984	map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
	1508					1946
1695							) . '?'
1696	1710	100				3241	: _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
	2394					2710
1697							}
1698							: _re_path($self, $_) # ref($_) eq 'ARRAY'
1699	4700					2888	} @{$_[0]}
	4700					4167
1700							}
1701
1702							sub _lookahead {
1703	132			132		101	my $in = shift;
1704	132					90	my %head;
1705							my $path;
1706	132					171	for $path( keys %$in ) {
1707	328	100				431	next unless defined $in->{$path};
1708							# print "look $path: ", ref($in->{$path}[0]), ".\n";
1709	267	100				396	if( ref($in->{$path}[0]) eq 'HASH' ) {
		100
1710	15					15	my $next = 0;
1711	15		100			36	while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
	16					60
1712	11	100				20	if( exists $in->{$path}[$next]{''} ) {
1713	5					12	++$head{$in->{$path}[$next+1]};
1714							}
1715	11					22	++$next;
1716							}
1717	15					30	my $inner = _lookahead( $in->{$path}[0] );
1718	15					46	@head{ keys %$inner } = (values %$inner);
1719							}
1720							elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721	2					3	my $subpath = $in->{$path}[0];
1722	2					6	for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723	3	100				5	if( ref($subpath->[$sp]) eq 'HASH' ) {
1724	2					3	my $follow = _lookahead( $subpath->[$sp] );
1725	2					5	@head{ keys %$follow } = (values %$follow);
1726	2	100				7	last unless exists $subpath->[$sp]{''};
1727							}
1728							else {
1729	1					2	++$head{$subpath->[$sp]};
1730	1					2	last;
1731							}
1732							}
1733							}
1734							else {
1735	250					306	++$head{ $in->{$path}[0] };
1736							}
1737							}
1738							# print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739	132					184	return \%head;
1740							}
1741
1742							sub _re_path_lookahead {
1743	265			265		181	my $self = shift;
1744	265					165	my $in = shift;
1745							# print "_re_path_la in ", _dump($in), "\n";
1746	265					172	my $out = '';
1747	265					359	for( my $p = 0; $p < @$in; ++$p ) {
1748	573	100				674	if( ref($in->[$p]) eq '' ) {
		100
1749	462					332	$out .= $in->[$p];
1750	462					610	next;
1751							}
1752							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753	2					3	$out .= _re_path_lookahead($self, $in->[$p]);
1754	2					4	next;
1755							}
1756							# print "$p ", _dump($in->[$p]), "\n";
1757							my $path = [
1758	228					278	map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759	275					284	grep { $_ ne '' }
1760	109					84	keys %{$in->[$p]}
	109					182
1761							];
1762	109					176	my $ahead = _lookahead($in->[$p]);
1763	109					86	my $more = 0;
1764	109	100	100			257	if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765	12					11	my $next = 1;
1766	12					30	while( $p + $next < @$in ) {
1767	14	100				25	if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768	2					4	my $follow = _lookahead( $in->[$p+$next] );
1769	2					6	@{$ahead}{ keys %$follow } = (values %$follow);
	2					5
1770							}
1771							else {
1772	12					17	++$ahead->{$in->[$p+$next]};
1773	12					12	last;
1774							}
1775	2					4	++$next;
1776							}
1777	12					9	$more = 1;
1778							}
1779	109					99	my $nr_one = grep { /^$Single_Char$/ } @$path;
	228					891
1780	109					87	my $nr = @$path;
1781	109	100	100			216	if( $nr_one > 1 and $nr_one == $nr ) {
1782	18					29	$out .= _make_class($self, @$path);
1783	18	100				66	$out .= '?' if exists $in->[$p]{''};
1784							}
1785							else {
1786							my $zwla = keys(%$ahead) > 1
1787	91	100				188	? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
	191					152
	191					212
1788							: '';
1789	91	100				165	my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790							# print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791	91	100				117	if( exists $in->[$p]{''} ) {
1792	44	100				166	$out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793							}
1794							else {
1795	47					157	$out .= "$zwla$patt";
1796							}
1797							}
1798							}
1799	265					397	return $out;
1800							}
1801
1802							sub _re_path_track {
1803	33			33		24	my $self = shift;
1804	33					23	my $in = shift;
1805	33					25	my $normal = shift;
1806	33					23	my $augmented = shift;
1807	33					22	my $o;
1808	33					19	my $simple = '';
1809	33					28	my $augment = '';
1810	33					48	for( my $n = 0; $n < @$in; ++$n ) {
1811	114	100				126	if( ref($in->[$n]) eq '' ) {
1812	104					73	$o = $in->[$n];
1813	104					68	$simple .= $o;
1814	104					65	$augment .= $o;
1815	104	100	100			555	if( (
			66
			100
1816							$n < @$in - 1
1817							and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818							)
1819							or $n == @$in - 1
1820							) {
1821	24					20	push @{$self->{mlist}}, $normal . $simple ;
	24					42
1822	24	50				64	$augment .= $] < 5.009005
1823							? "(?{\$self->{m}=$self->{mcount}})"
1824							: "(?{$self->{mcount}})"
1825							;
1826	24					40	++$self->{mcount};
1827							}
1828							}
1829							else {
1830							my $path = [
1831	25					64	map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832	26					39	grep { $_ ne '' }
1833	10					9	keys %{$in->[$n]}
	10					23
1834							];
1835	10					50	$o = '(?:' . join( '\|' => sort _re_sort @$path ) . ')';
1836	10	100				17	$o .= '?' if exists $in->[$n]{''};
1837	10					11	$simple .= $o;
1838	10					24	$augment .= $o;
1839							}
1840							}
1841	33					65	return $augment;
1842							}
1843
1844							sub _re_path_pretty {
1845	411			411		267	my $self = shift;
1846	411					253	my $in = shift;
1847	411					252	my $arg = shift;
1848	411					454	my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849	411					384	my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850	411					251	my $out = '';
1851	411					262	$arg->{depth}++;
1852	411					291	my $prev_was_paren = 0;
1853	411					530	for( my $p = 0; $p < @$in; ++$p ) {
1854	1084	100				1129	if( ref($in->[$p]) eq '' ) {
		100
1855	910	100				953	$out .= "\n$pre" if $prev_was_paren;
1856	910					636	$out .= $in->[$p];
1857	910					1137	$prev_was_paren = 0;
1858							}
1859							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860	3					6	$out .= _re_path($self, $in->[$p]);
1861							}
1862							else {
1863							my $path = [
1864	369					448	map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865	419					409	grep { $_ ne '' }
1866	171					128	keys %{$in->[$p]}
	171					276
1867							];
1868	171					174	my $nr = @$path;
1869	171					110	my( @short, @long );
1870	171	100				180	push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
	369					1549
1871	171	100				202	if( @short == $nr ) {
1872	37	100				64	$out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873	37	100				122	$out .= '?' if exists $in->[$p]{''};
1874							}
1875							else {
1876	134	100				200	$out .= "\n" if length $out;
1877	134	100				163	$out .= $pre if $p;
1878	134					138	$out .= "(?:\n$indent";
1879	134	100				134	if( @short < 2 ) {
1880	133					93	my $r = 0;
1881							$out .= join( "\n$indent\|" => map {
1882	133	100				278	$r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
	298					412
1883	298					410	$_
1884							}
1885							sort _re_sort @$path
1886							);
1887							}
1888							else {
1889	1					6	$out .= join( "\n$indent\|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890							}
1891	134					163	$out .= "\n$pre)";
1892	134	100				153	if( exists $in->[$p]{''} ) {
1893	37					46	$out .= "\n$pre?";
1894	37					82	$prev_was_paren = 0;
1895							}
1896							else {
1897	97					237	$prev_was_paren = 1;
1898							}
1899							}
1900							}
1901							}
1902	411					283	$arg->{depth}--;
1903	411					653	return $out;
1904							}
1905
1906							sub _node_eq {
1907	425	100	66	425		950	return 0 if not defined $_[0] or not defined $_[1];
1908	422	100				703	return 0 if ref $_[0] ne ref $_[1];
1909							# Now that we have determined that the reference of each
1910							# argument are the same, we only have to test the first
1911							# one, which gives us a nice micro-optimisation.
1912	381	100				501	if( ref($_[0]) eq 'HASH' ) {
		100
1913	305					388	keys %{$_[0]} == keys %{$_[1]}
	305					585
1914							and
1915							# does this short-circuit to avoid _re_path() cost more than it saves?
1916	305	100	100			199	join( '\|' => sort keys %{$_[0]}) eq join( '\|' => sort keys %{$_[1]})
	272					563
	272					1114
1917							and
1918							_re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919							}
1920							elsif( ref($_[0]) eq 'ARRAY' ) {
1921	9	100				6	scalar @{$_[0]} == scalar @{$_[1]}
	9					10
	9					25
1922							and
1923							_re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924							}
1925							else {
1926	67					183	$_[0] eq $_[1];
1927							}
1928							}
1929
1930							sub _pretty_dump {
1931	7			7		19	return sprintf "\\x%02x", ord(shift);
1932							}
1933
1934							sub _dump {
1935	5579			5579		4217	my $path = shift;
1936	5579	100				7376	return _dump_node($path) if ref($path) eq 'HASH';
1937	5057					3507	my $dump = '[';
1938	5057					2715	my $d;
1939	5057					3209	my $nr = 0;
1940	5057					4163	for $d( @$path ) {
1941	11036	100				12707	$dump .= ' ' if $nr++;
1942	11036	100				16790	if( ref($d) eq 'HASH' ) {
		100
		100
1943	1340					1324	$dump .= _dump_node($d);
1944							}
1945							elsif( ref($d) eq 'ARRAY' ) {
1946	242					242	$dump .= _dump($d);
1947							}
1948							elsif( defined $d ) {
1949							# D::C indicates the second test is redundant
1950							# $dump .= ( $d =~ /\s/ or not length $d )
1951	9453	100				18142	$dump .= (
		100
1952							$d =~ /\s/ ? qq{'$d'} :
1953							$d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954							$d
1955							);
1956							}
1957							else {
1958	1					2	$dump .= '*';
1959							}
1960							}
1961	5057					16911	return $dump . ']';
1962							}
1963
1964							sub _dump_node {
1965	1862			1862		1307	my $node = shift;
1966	1862					1290	my $dump = '{';
1967	1862					1153	my $nr = 0;
1968	1862					1126	my $n;
1969	1862					3372	for $n (sort keys %$node) {
1970	3899	100				4746	$dump .= ' ' if $nr++;
1971							# Devel::Cover shows this to test to be redundant
1972							# $dump .= ( $n eq '' and not defined $node->{$n} )
1973							$dump .= $n eq ''
1974							? '*'
1975							: ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976	3899	100				8440	. "=>" . _dump($node->{$n})
		100
1977							;
1978							}
1979	1862					6620	return $dump . '}';
1980							}
1981
1982							=pod
1983
1984							=head1 NAME
1985
1986							Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987
1988							=head1 SYNOPSIS
1989
1990							use Regexp::Assemble;
1991
1992							my $ra = Regexp::Assemble->new;
1993							$ra->add( 'ab+c' );
1994							$ra->add( 'ab+-' );
1995							$ra->add( 'a\w\d+' );
1996							$ra->add( 'a\d+' );
1997							print $ra->re; # prints a(?:\w?\d+\|b+[-c])
1998
1999							=head1 DESCRIPTION
2000
2001							Regexp::Assemble takes an arbitrary number of regular expressions
2002							and assembles them into a single regular expression (or RE) that
2003							matches all that the individual REs match.
2004
2005							As a result, instead of having a large list of expressions to loop
2006							over, a target string only needs to be tested against one expression.
2007							This is interesting when you have several thousand patterns to deal
2008							with. Serious effort is made to produce the smallest pattern possible.
2009
2010							It is also possible to track the original patterns, so that you can
2011							determine which, among the source patterns that form the assembled
2012							pattern, was the one that caused the match to occur.
2013
2014							You should realise that large numbers of alternations are processed
2015							in perl's regular expression engine in O(n) time, not O(1). If you
2016							are still having performance problems, you should look at using a
2017							trie. Note that Perl's own regular expression engine will implement
2018							trie optimisations in perl 5.10 (they are already available in
2019							perl 5.9.3 if you want to try them out). C will
2020							do the right thing when it knows it's running on a trie'd perl.
2021							(At least in some version after this one).
2022
2023							Some more examples of usage appear in the accompanying README. If
2024							that file is not easy to access locally, you can find it on a web
2025							repository such as
2026							L or
2027							L.
2028
2029							See also L.
2030
2031							=head1 Methods
2032
2033							=head2 add(LIST)
2034
2035							Takes a string, breaks it apart into a set of tokens (respecting
2036							meta characters) and inserts the resulting list into the C
2037							object. It uses a naive regular expression to lex the string
2038							that may be fooled complex expressions (specifically, it will
2039							fail to lex nested parenthetical expressions such as
2040							C correctly). If this is the case, the end of
2041							the string will not be tokenised correctly and returned as one
2042							long string.
2043
2044							On the one hand, this may indicate that the patterns you are
2045							trying to feed the C object are too complex. Simpler
2046							patterns might allow the algorithm to work more effectively and
2047							perform more reductions in the resulting pattern.
2048
2049							On the other hand, you can supply your own pattern to perform the
2050							lexing if you need. The test suite contains an example of a lexer
2051							pattern that will match one level of nested parentheses.
2052
2053							Note that there is an internal optimisation that will bypass a
2054							much of the lexing process. If a string contains no C<\>
2055							(backslash), C<[> (open square bracket), C<(> (open paren),
2056							C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057							curly), a character split will be performed directly.
2058
2059							A list of strings may be supplied, thus you can pass it a file
2060							handle of a file opened for reading:
2061
2062							$re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063							$re->add( );
2064
2065							If the file is very large, it may be more efficient to use a
2066							C loop, to read the file line-by-line:
2067
2068							$re->add($_) while ;
2069
2070							The C method will chomp the lines automatically. If you
2071							do not want this to occur (you want to keep the record
2072							separator), then disable Cing.
2073
2074							$re->chomp(0);
2075							$re->add($_) while ;
2076
2077							This method is chainable.
2078
2079							=head2 add_file(FILENAME [...])
2080
2081							Takes a list of file names. Each file is opened and read
2082							line by line. Each line is added to the assembly.
2083
2084							$r->add_file( 'file.1', 'file.2' );
2085
2086							If a file cannot be opened, the method will croak. If you cannot
2087							afford to let this happen then you should wrap the call in a C
2088							block.
2089
2090							Chomping happens automatically unless you the C method
2091							to disable it. By default, input lines are read according to the
2092							value of the C attribute (if defined), and
2093							will otherwise fall back to the current setting of the system C<$/>
2094							variable. The record separator may also be specified on each
2095							call to C. Internally, the routine Cises the
2096							value of C<$/> to whatever is required, for the duration of the
2097							call.
2098
2099							An alternate calling mechanism using a hash reference is
2100							available. The recognised keys are:
2101
2102							=over 4
2103
2104							=item file
2105
2106							Reference to a list of file names, or the name of a single
2107							file.
2108
2109							$r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110							$r->add_file({file => 'file.n'});
2111
2112							=item input_record_separator
2113
2114							If present, indicates what constitutes a line
2115
2116							$r->add_file({file => 'data.txt', input_record_separator => ':' });
2117
2118							=item rs
2119
2120							An alias for input_record_separator (mnemonic: same as the
2121							English variable names).
2122
2123							=back
2124
2125							$r->add_file( {
2126							file => [ 'pattern.txt', 'more.txt' ],
2127							input_record_separator => "\r\n",
2128							});
2129
2130							=head2 clone()
2131
2132							Clones the contents of a Regexp::Assemble object and creates a new
2133							object (in other words it performs a deep copy).
2134
2135							If the Storable module is installed, its dclone method will be used,
2136							otherwise the cloning will be performed using a pure perl approach.
2137
2138							You can use this method to take a snapshot of the patterns that have
2139							been added so far to an object, and generate an assembly from the
2140							clone. Additional patterns may to be added to the original object
2141							afterwards.
2142
2143							my $re = $main->clone->re();
2144							$main->add( 'another-pattern-\\d+' );
2145
2146							=head2 insert(LIST)
2147
2148							Takes a list of tokens representing a regular expression and
2149							stores them in the object. Note: you should not pass it a bare
2150							regular expression, such as C. You must pass it as
2151							a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152
2153							This method is chainable, I:
2154
2155							my $ra = Regexp::Assemble->new
2156							->insert( qw[ a b+ c? d* e ] )
2157							->insert( qw[ a c+ d+ e* f ] );
2158
2159							Lexing complex patterns with metacharacters and so on can consume
2160							a significant proportion of the overall time to build an assembly.
2161							If you have the information available in a tokenised form, calling
2162							C directly can be a big win.
2163
2164							=head2 lexstr
2165
2166							Use the C method if you are curious to see how a pattern
2167							gets tokenised. It takes a scalar on input, representing a pattern,
2168							and returns a reference to an array, containing the tokenised
2169							pattern. You can recover the original pattern by performing a
2170							C:
2171
2172							my @token = $re->lexstr($pattern);
2173							my $new_pattern = join( '', @token );
2174
2175							If the original pattern contains unnecessary backslashes, or C<\x4b>
2176							escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177							may not be identical.
2178
2179							Call C does not add the pattern to the object, it is merely
2180							for exploratory purposes. It will, however, update various statistical
2181							counters.
2182
2183							=head2 pre_filter(CODE)
2184
2185							Allows you to install a callback to check that the pattern being
2186							loaded contains valid input. It receives the pattern as a whole to
2187							be added, before it been tokenised by the lexer. It may to return
2188							0 or C to indicate that the pattern should not be added, any
2189							true value indicates that the contents are fine.
2190
2191							A filter to strip out trailing comments (marked by #):
2192
2193							$re->pre_filter( sub { $_[0] =~ s/\s#.$//; 1 } );
2194
2195							A filter to ignore blank lines:
2196
2197							$re->pre_filter( sub { length(shift) } );
2198
2199							If you want to remove the filter, pass C as a parameter.
2200
2201							$ra->pre_filter(undef);
2202
2203							This method is chainable.
2204
2205							=head2 filter(CODE)
2206
2207							Allows you to install a callback to check that the pattern being
2208							loaded contains valid input. It receives a list on input, after it
2209							has been tokenised by the lexer. It may to return 0 or undef to
2210							indicate that the pattern should not be added, any true value
2211							indicates that the contents are fine.
2212
2213							If you know that all patterns you expect to assemble contain
2214							a restricted set of of tokens (e.g. no spaces), you could do
2215							the following:
2216
2217							$ra->filter(sub { not grep { / / } @_ });
2218
2219							or
2220
2221							sub only_spaces_and_digits {
2222							not grep { ![\d ] } @_
2223							}
2224							$ra->filter( \&only_spaces_and_digits );
2225
2226							These two examples will silently ignore faulty patterns, If you
2227							want the user to be made aware of the problem you should raise an
2228							error (via C or C), log an error message, whatever is
2229							best. If you want to remove a filter, pass C as a parameter.
2230
2231							$ra->filter(undef);
2232
2233							This method is chainable.
2234
2235							=head2 as_string
2236
2237							Assemble the expression and return it as a string. You may want to do
2238							this if you are writing the pattern to a file. The following arguments
2239							can be passed to control the aspect of the resulting pattern:
2240
2241							B, the number of spaces used to indent nested grouping of
2242							a pattern. Use this to produce a pretty-printed pattern (for some
2243							definition of "pretty"). The resulting output is rather verbose. The
2244							reason is to ensure that the metacharacters C<(?:> and C<)> always
2245							occur on otherwise empty lines. This allows you grep the result for an
2246							even more synthetic view of the pattern:
2247
2248							egrep -v '^ *[()]'
2249
2250							The result of the above is quite readable. Remember to backslash the
2251							spaces appearing in your own patterns if you wish to use an indented
2252							pattern in an C construct. Indenting is ignored if tracking
2253							is enabled.
2254
2255							The B argument takes precedence over the C
2256							method/attribute of the object.
2257
2258							Calling this
2259							method will drain the internal data structure. Large numbers of patterns
2260							can eat a significant amount of memory, and this lets perl recover the
2261							memory used for other purposes.
2262
2263							If you want to reduce the pattern I continue to add new patterns,
2264							clone the object and reduce the clone, leaving the original object intact.
2265
2266							=head2 re
2267
2268							Assembles the pattern and return it as a compiled RE, using the
2269							C operator.
2270
2271							As with C, calling this method will reset the internal data
2272							structures to free the memory used in assembling the RE.
2273
2274							The B attribute, documented in the C method, can be
2275							used here (it will be ignored if tracking is enabled).
2276
2277							With method chaining, it is possible to produce a RE without having
2278							a temporary C object lying around, I:
2279
2280							my $re = Regexp::Assemble->new
2281							->add( q[ab+cd+e] )
2282							->add( q[ac\\d+e] )
2283							->add( q[c\\d+e] )
2284							->re;
2285
2286							The C<$re> variable now contains a Regexp object that can be used
2287							directly:
2288
2289							while( <> ) {
2290							/$re/ and print "Something in [$_] matched\n";
2291							)
2292
2293							The C method is called when the object is used in string context
2294							(hence, within an C operator), so by and large you do not even
2295							need to save the RE in a separate variable. The following will work
2296							as expected:
2297
2298							my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299							while( ) {
2300							if( /($re)/ ) {
2301							print "Here be giants: $1\n";
2302							}
2303							}
2304
2305							This approach does not work with tracked patterns. The
2306							C and C methods must be used instead, see below.
2307
2308							=head2 match(SCALAR)
2309
2310							The following information applies to Perl 5.8 and below. See
2311							the section that follows for information on Perl 5.10.
2312
2313							If pattern tracking is in use, you must C in order
2314							to make things work correctly. At a minimum, this will make your
2315							code look like this:
2316
2317							my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318							if( $did_match ) {
2319							print "matched ", $ra->matched, "\n";
2320							}
2321
2322							(The main reason is that the C<$^R> variable is currently broken
2323							and an ugly workaround that runs some Perl code during the match
2324							is required, in order to simulate what C<$^R> should be doing. See
2325							Perl bug #32840 for more information if you are curious. The README
2326							also contains more information). This bug has been fixed in 5.10.
2327
2328							The important thing to note is that with C, THERE
2329							ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330							is this: if you do not have strict control over the patterns being
2331							fed to C when tracking is enabled, and someone
2332							slips you a pattern such as C and you
2333							attempt to match a string against the resulting pattern, you will
2334							know Fear and Loathing.
2335
2336							What is more, the C<$^R> workaround means that that tracking does
2337							not work if you perform a bare C pattern match as shown
2338							above. You have to instead call the C method, in order to
2339							supply the necessary context to take care of the tracking housekeeping
2340							details.
2341
2342							if( defined( my $match = $ra->match($_)) ) {
2343							print " $_ matched by $match\n";
2344							}
2345
2346							In the case of a successful match, the original matched pattern
2347							is returned directly. The matched pattern will also be available
2348							through the C method.
2349
2350							(Except that the above is not true for 5.6.0: the C method
2351							returns true or undef, and the C method always returns
2352							undef).
2353
2354							If you are capturing parts of the pattern I C
2355							you will want to get at the captures. See the C, C,
2356							C and C methods. If you are not using captures
2357							then you may safely ignore this section.
2358
2359							In 5.10, since the bug concerning C<$^R> has been resolved, there
2360							is no need to use C and the assembled pattern does
2361							not require any Perl code to be executed during the match.
2362
2363							=head2 new()
2364
2365							Creates a new C object. The following optional
2366							key/value parameters may be employed. All keys have a corresponding
2367							method that can be used to change the behaviour later on. As a
2368							general rule, especially if you're just starting out, you don't
2369							have to bother with any of these.
2370
2371							B, a family of optional attributes that allow anchors
2372							(C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373
2374							B, sets the C flags to add to the assembled regular
2375							expression. Warning: no error checking is done, you should ensure
2376							that the flags you pass are understood by the version of Perl you
2377							are using. B exists as an alias, for users familiar
2378							with L.
2379
2380							B, controls whether the pattern should be chomped before being
2381							lexed. Handy if you are reading patterns from a file. By default,
2382							Cing is performed (this behaviour changed as of version 0.24,
2383							prior versions did not chomp automatically).
2384							See also the C attribute and the C method.
2385
2386							B, slurp the contents of the specified file and add them
2387							to the assembly. Multiple files may be processed by using a list.
2388
2389							my $r = Regexp::Assemble->new(file => 're.list');
2390
2391							my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392
2393							If you really don't want chomping to occur, you will have to set
2394							the C attribute to 0 (zero). You may also want to look at
2395							the C attribute, as well.
2396
2397							B, controls what constitutes a record
2398							separator when using the C attribute or the C
2399							method. May be abbreviated to B. See the C<$/> variable in
2400							L.
2401
2402							B, controls whether the pattern should contain zero-width
2403							lookahead assertions (For instance: (?=[abc])(?:bob\|alice\|charles).
2404							This is not activated by default, because in many circumstances the
2405							cost of processing the assertion itself outweighs the benefit of
2406							its faculty for short-circuiting a match that will fail. This is
2407							sensitive to the probability of a match succeeding, so if you're
2408							worried about performance you'll have to benchmark a sample population
2409							of targets to see which way the benefits lie.
2410
2411							B, controls whether you want know which of the initial
2412							patterns was the one that matched. See the C method for
2413							more details. Note for version 5.8 of Perl and below, in this mode
2414							of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415							this entails. Perl 5.10 does not suffer from any such restriction.
2416
2417							B, the number of spaces used to indent nested grouping of
2418							a pattern. Use this to produce a pretty-printed pattern. See the
2419							C method for a more detailed explanation.
2420
2421							B, allows you to add a callback to enable sanity checks
2422							on the pattern being loaded. This callback is triggered before the
2423							pattern is split apart by the lexer. In other words, it operates
2424							on the entire pattern. If you are loading patterns from a file,
2425							this would be an appropriate place to remove comments.
2426
2427							B, allows you to add a callback to enable sanity checks on
2428							the pattern being loaded. This callback is triggered after the
2429							pattern has been split apart by the lexer.
2430
2431							B, controls whether to unroll, for example, C into
2432							C, C, which may allow additional reductions in the
2433							resulting assembled pattern.
2434
2435							B, controls whether tail reduction occurs or not. If set,
2436							patterns like C will be reduced to C.
2437							That is, the end of the pattern in each part of the b... and d...
2438							alternations is identical, and hence is hoisted out of the alternation
2439							and placed after it. On by default. Turn it off if you're really
2440							pressed for short assembly times.
2441
2442							B, specifies the pattern used to lex the input lines into
2443							tokens. You could replace the default pattern by a more sophisticated
2444							version that matches arbitrarily nested parentheses, for example.
2445
2446							B, controls whether copious amounts of output is produced
2447							during the loading stage or the reducing stage of assembly.
2448
2449							my $ra = Regexp::Assemble->new;
2450							my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451
2452							B, controls whether new patterns can be added to the object
2453							after the assembled pattern is generated. DEPRECATED.
2454
2455							This method/attribute will be removed in a future release. It doesn't
2456							really serve any purpose, and may be more effectively replaced by
2457							cloning an existing C object and spinning out a
2458							pattern from that instead.
2459
2460							=head2 source()
2461
2462							When using tracked mode, after a successful match is made, returns
2463							the original source pattern that caused the match. In Perl 5.10,
2464							the C<$^R> variable can be used to as an index to fetch the correct
2465							pattern from the object.
2466
2467							If no successful match has been performed, or the object is not in
2468							tracked mode, this method returns C.
2469
2470							my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471
2472							for my $w (qw(this food is rather barren)) {
2473							if ($w =~ /$r/) {
2474							print "$w matched by ", $r->source($^R), $/;
2475							}
2476							else {
2477							print "$w no match\n";
2478							}
2479							}
2480
2481							=head2 mbegin()
2482
2483							This method returns a copy of C<@-> at the moment of the
2484							last match. You should ordinarily not need to bother with
2485							this, C should be able to supply all your needs.
2486
2487							=head2 mend()
2488
2489							This method returns a copy of C<@+> at the moment of the
2490							last match.
2491
2492							=head2 mvar(NUMBER)
2493
2494							The C method returns the captures of the last match.
2495							C corresponds to $1, C to $2, and so on.
2496							C happens to return the target string matched,
2497							as a byproduct of walking down the C<@-> and C<@+> arrays
2498							after the match.
2499
2500							If called without a parameter, C will return a
2501							reference to an array containing all captures.
2502
2503							=head2 capture
2504
2505							The C method returns the the captures of the last
2506							match as an array. Unlink C, this method does not
2507							include the matched string. It is equivalent to getting an
2508							array back that contains C<$1, $2, $3, ...>.
2509
2510							If no captures were found in the match, an empty array is
2511							returned, rather than C. You are therefore guaranteed
2512							to be able to use C<< for my $c ($re->capture) { ... >>
2513							without have to check whether anything was captured.
2514
2515							=head2 matched()
2516
2517							If pattern tracking has been set, via the C attribute,
2518							or through the C method, this method will return the
2519							original pattern of the last successful match. Returns undef
2520							match has yet been performed, or tracking has not been enabled.
2521
2522							See below in the NOTES section for additional subtleties of
2523							which you should be aware of when tracking patterns.
2524
2525							Note that this method is not available in 5.6.0, due to
2526							limitations in the implementation of C<(?{...})> at the time.
2527
2528							=head2 Statistics/Reporting routines
2529
2530							=head2 stats_add
2531
2532							Returns the number of patterns added to the assembly (whether
2533							by C or C). Duplicate patterns are not included
2534							in this total.
2535
2536							=head2 stats_dup
2537
2538							Returns the number of duplicate patterns added to the assembly.
2539							If non-zero, this may be a sign that something is wrong with
2540							your data (or at the least, some needless redundancy). This may
2541							occur when you have two patterns (for instance, C and
2542							C) which map to the same result.
2543
2544							=head2 stats_raw()
2545
2546							Returns the raw number of bytes in the patterns added to the
2547							assembly. This includes both original and duplicate patterns.
2548							For instance, adding the two patterns C and C will
2549							count as 4 bytes.
2550
2551							=head2 stats_cooked()
2552
2553							Return the true number of bytes added to the assembly. This
2554							will not include duplicate patterns. Furthermore, it may differ
2555							from the raw bytes due to quotemeta treatment. For instance,
2556							C will count as 7 (not 8) bytes, because C<\,> will
2557							be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558							after the quotemeta directives are processed, C will be
2559							stored, for a total of 4 bytes.
2560
2561							=head2 stats_length()
2562
2563							Returns the length of the resulting assembled expression.
2564							Until C or C have been called, the length
2565							will be 0 (since the assembly will have not yet been
2566							performed). The length includes only the pattern, not the
2567							additional (C<(?-xism...>) fluff added by the compilation.
2568
2569							=head2 dup_warn(NUMBER\|CODEREF)
2570
2571							Turns warnings about duplicate patterns on or off. By
2572							default, no warnings are emitted. If the method is
2573							called with no parameters, or a true parameter,
2574							the object will carp about patterns it has
2575							already seen. To turn off the warnings, use 0 as a
2576							parameter.
2577
2578							$r->dup_warn();
2579
2580							The method may also be passed a code block. In this case
2581							the code will be executed and it will receive a reference
2582							to the object in question, and the lexed pattern.
2583
2584							$r->dup_warn(
2585							sub {
2586							my $self = shift;
2587							print $self->stats_add, " patterns added at line $.\n",
2588							join( '', @_ ), " added previously\n";
2589							}
2590							)
2591
2592							=head2 Anchor routines
2593
2594							Suppose you wish to assemble a series of patterns that all begin
2595							with C<^> and end with C<$> (anchor pattern to the beginning and
2596							end of line). Rather than add the anchors to each and every pattern
2597							(and possibly forget to do so when a new entry is added), you may
2598							specify the anchors in the object, and they will appear in the
2599							resulting pattern, and you no longer need to (or should) put them
2600							in your source patterns. For example, the two following snippets
2601							will produce identical patterns:
2602
2603							$r->add(qw(^this ^that ^them))->as_string;
2604
2605							$r->add(qw(this that them))->anchor_line_begin->as_string;
2606
2607							# both techniques will produce ^th(?:at\|em\|is)
2608
2609							All anchors are possible word (C<\b>) boundaries, line
2610							boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611							and C<\Z> (or C<\z> if you absolutely need it)).
2612
2613							The shortcut C> implies both
2614							C_begin> C_end>
2615							is also available. If different anchors are specified
2616							the most specific anchor wins. For instance, if both
2617							C and C are
2618							specified, C takes precedence.
2619
2620							All the anchor methods are chainable.
2621
2622							=head2 anchor_word_begin
2623
2624							The resulting pattern will be prefixed with a C<\b>
2625							word boundary assertion when the value is true. Set
2626							to 0 to disable.
2627
2628							$r->add('pre')->anchor_word_begin->as_string;
2629							# produces '\bpre'
2630
2631							=head2 anchor_word_end
2632
2633							The resulting pattern will be suffixed with a C<\b>
2634							word boundary assertion when the value is true. Set
2635							to 0 to disable.
2636
2637							$r->add(qw(ing tion))
2638							->anchor_word_end
2639							->as_string; # produces '(?:tion\|ing)\b'
2640
2641							=head2 anchor_word
2642
2643							The resulting pattern will be have C<\b>
2644							word boundary assertions at the beginning and end
2645							of the pattern when the value is true. Set
2646							to 0 to disable.
2647
2648							$r->add(qw(cat carrot)
2649							->anchor_word(1)
2650							->as_string; # produces '\bca(?:rro)t\b'
2651
2652							=head2 anchor_line_begin
2653
2654							The resulting pattern will be prefixed with a C<^>
2655							line boundary assertion when the value is true. Set
2656							to 0 to disable.
2657
2658							$r->anchor_line_begin;
2659							# or
2660							$r->anchor_line_begin(1);
2661
2662							=head2 anchor_line_end
2663
2664							The resulting pattern will be suffixed with a C<$>
2665							line boundary assertion when the value is true. Set
2666							to 0 to disable.
2667
2668							# turn it off
2669							$r->anchor_line_end(0);
2670
2671							=head2 anchor_line
2672
2673							The resulting pattern will be have the C<^> and C<$>
2674							line boundary assertions at the beginning and end
2675							of the pattern, respectively, when the value is true. Set
2676							to 0 to disable.
2677
2678							$r->add(qw(cat carrot)
2679							->anchor_line
2680							->as_string; # produces '^ca(?:rro)t$'
2681
2682							=head2 anchor_string_begin
2683
2684							The resulting pattern will be prefixed with a C<\A>
2685							string boundary assertion when the value is true. Set
2686							to 0 to disable.
2687
2688							$r->anchor_string_begin(1);
2689
2690							=head2 anchor_string_end
2691
2692							The resulting pattern will be suffixed with a C<\Z>
2693							string boundary assertion when the value is true. Set
2694							to 0 to disable.
2695
2696							# disable the string boundary end anchor
2697							$r->anchor_string_end(0);
2698
2699							=head2 anchor_string_end_absolute
2700
2701							The resulting pattern will be suffixed with a C<\z>
2702							string boundary assertion when the value is true. Set
2703							to 0 to disable.
2704
2705							# disable the string boundary absolute end anchor
2706							$r->anchor_string_end_absolute(0);
2707
2708							If you don't understand the difference between
2709							C<\Z> and C<\z>, the former will probably do what
2710							you want.
2711
2712							=head2 anchor_string
2713
2714							The resulting pattern will be have the C<\A> and C<\Z>
2715							string boundary assertions at the beginning and end
2716							of the pattern, respectively, when the value is true. Set
2717							to 0 to disable.
2718
2719							$r->add(qw(cat carrot)
2720							->anchor_string
2721							->as_string; # produces '\Aca(?:rro)t\Z'
2722
2723							=head2 anchor_string_absolute
2724
2725							The resulting pattern will be have the C<\A> and C<\z>
2726							string boundary assertions at the beginning and end
2727							of the pattern, respectively, when the value is true. Set
2728							to 0 to disable.
2729
2730							$r->add(qw(cat carrot)
2731							->anchor_string_absolute
2732							->as_string; # produces '\Aca(?:rro)t\z'
2733
2734							=head2 debug(NUMBER)
2735
2736							Turns debugging on or off. Statements are printed
2737							to the currently selected file handle (STDOUT by default).
2738							If you are already using this handle, you will have to
2739							arrange to select an output handle to a file of your own
2740							choosing, before call the C, C or C)
2741							functions, otherwise it will scribble all over your
2742							carefully formatted output.
2743
2744							=over 4
2745
2746							=item * 0
2747
2748							Off. Turns off all debugging output.
2749
2750							=item * 1
2751
2752							Add. Trace the addition of patterns.
2753
2754							=item * 2
2755
2756							Reduce. Trace the process of reduction and assembly.
2757
2758							=item * 4
2759
2760							Lex. Trace the lexing of the input patterns into its constituent
2761							tokens.
2762
2763							=item * 8
2764
2765							Time. Print to STDOUT the time taken to load all the patterns. This is
2766							nothing more than the difference between the time the object was
2767							instantiated and the time reduction was initiated.
2768
2769							# load=
2770
2771							Any lengthy computation performed in the client code will be reflected
2772							in this value. Another line will be printed after reduction is
2773							complete.
2774
2775							# reduce=
2776
2777							The above output lines will be changed to C and
2778							C if the internal state of the object is corrupted
2779							and the initial timestamp is lost.
2780
2781							The code attempts to load L in order to report fractional
2782							seconds. If this is not successful, the elapsed time is displayed
2783							in whole seconds.
2784
2785							=back
2786
2787							Values can be added (or or'ed together) to trace everything
2788
2789							$r->debug(7)->add( '\\d+abc' );
2790
2791							Calling C with no arguments turns debugging off.
2792
2793							=head2 dump()
2794
2795							Produces a synthetic view of the internal data structure. How
2796							to interpret the results is left as an exercise to the reader.
2797
2798							print $r->dump;
2799
2800							=head2 chomp(0\|1)
2801
2802							Turns chomping on or off.
2803
2804							IMPORTANT: As of version 0.24, chomping is now on by default as it
2805							makes C Just Work. The only time you may run into trouble
2806							is with C. So don't do that, or else explicitly turn
2807							off chomping.
2808
2809							To avoid incorporating (spurious)
2810							record separators (such as "\n" on Unix) when reading from a file,
2811							C Cs its input. If you don't want this to happen,
2812							call C with a false value.
2813
2814							$re->chomp(0); # really want the record separators
2815							$re->add();
2816
2817							=head2 fold_meta_pairs(NUMBER)
2818
2819							Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820							are folded into a C<.> (dot). Folding happens by default (for
2821							reasons of backwards compatibility, even though it is wrong when
2822							the C expression modifier is active).
2823
2824							Call this method with a false value to prevent this behaviour (which
2825							is only a problem when dealing with C<\n> if the C expression
2826							modifier is also set).
2827
2828							$re->add( '\\w', '\\W' );
2829							my $clone = $re->clone;
2830
2831							$clone->fold_meta_pairs(0);
2832							print $clone->as_string; # prints '.'
2833							print $re->as_string; # print '[\W\w]'
2834
2835							=head2 indent(NUMBER)
2836
2837							Sets the level of indent for pretty-printing nested groups
2838							within a pattern. See the C method for more details.
2839							When called without a parameter, no indenting is performed.
2840
2841							$re->indent( 4 );
2842							print $re->as_string;
2843
2844							=head2 lookahead(0\|1)
2845
2846							Turns on zero-width lookahead assertions. This is usually
2847							beneficial when you expect that the pattern will usually fail.
2848							If you expect that the pattern will usually match you will
2849							probably be worse off.
2850
2851							=head2 flags(STRING)
2852
2853							Sets the flags that govern how the pattern behaves (for
2854							versions of Perl up to 5.9 or so, these are C). By
2855							default no flags are enabled.
2856
2857							=head2 modifiers(STRING)
2858
2859							An alias of the C method, for users familiar with
2860							C.
2861
2862							=head2 track(0\|1)
2863
2864							Turns tracking on or off. When this attribute is enabled,
2865							additional housekeeping information is inserted into the
2866							assembled expression using C<({...}> embedded code
2867							constructs. This provides the necessary information to
2868							determine which, of the original patterns added, was the
2869							one that caused the match.
2870
2871							$re->track( 1 );
2872							if( $target =~ /$re/ ) {
2873							print "$target matched by ", $re->matched, "\n";
2874							}
2875
2876							Note that when this functionality is enabled, no
2877							reduction is performed and no character classes are
2878							generated. In other words, C is not
2879							reduced down to C<(?:br\|t)ag> and C is not
2880							reduced to C.
2881
2882							=head2 unroll_plus(0\|1)
2883
2884							Turns the unrolling of plus metacharacters on or off. When
2885							a pattern is broken up, C becomes C, C (and
2886							C becomes C, C. This may allow the freed C
2887							to assemble with other patterns. Not enabled by default.
2888
2889							=head2 lex(SCALAR)
2890
2891							Change the pattern used to break a string apart into tokens.
2892							You can examine the C script as a starting point.
2893
2894							=head2 reduce(0\|1)
2895
2896							Turns pattern reduction on or off. A reduced pattern may
2897							be considerably shorter than an unreduced pattern. Consider
2898							C I C. An unreduced
2899							pattern will be very similar to those produced by
2900							C. Reduction is on by default. Turning
2901							it off speeds assembly (but assembly is pretty fast -- it's
2902							the breaking up of the initial patterns in the lexing stage
2903							that can consume a non-negligible amount of time).
2904
2905							=head2 mutable(0\|1)
2906
2907							This method has been marked as DEPRECATED. It will be removed
2908							in a future release. See the C method for a technique
2909							to replace its functionality.
2910
2911							=head2 reset()
2912
2913							Empties out the patterns that have been Ced or C-ed
2914							into the object. Does not modify the state of controller attributes
2915							such as C, C, C and the like.
2916
2917							=head2 Default_Lexer
2918
2919							B the C function is a class method, not
2920							an object method. It is a fatal error to call it as an object
2921							method.
2922
2923							The C method lets you replace the default pattern
2924							used for all subsequently created C objects. It
2925							will not have any effect on existing objects. (It is also possible
2926							to override the lexer pattern used on a per-object basis).
2927
2928							The parameter should be an ordinary scalar, not a compiled
2929							pattern. If the pattern fails to match all parts of the string,
2930							the missing parts will be returned as single chunks. Therefore
2931							the following pattern is legal (albeit rather cork-brained):
2932
2933							Regexp::Assemble::Default_Lexer( '\\d' );
2934
2935							The above pattern will split up input strings digit by digit, and
2936							all non-digit characters as single chunks.
2937
2938							=head1 DIAGNOSTICS
2939
2940							"Cannot pass a C to Default_Lexer"
2941
2942							You tried to replace the default lexer pattern with an object
2943							instead of a scalar. Solution: You probably tried to call
2944							C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945							C.
2946
2947							"filter method not passed a coderef"
2948
2949							"pre_filter method not passed a coderef"
2950
2951							A reference to a subroutine (anonymous or otherwise) was expected.
2952							Solution: read the documentation for the C method.
2953
2954							"duplicate pattern added: /.../"
2955
2956							The C attribute is active, and a duplicate pattern was
2957							added (well duh!). Solution: clean your data.
2958
2959							"cannot open [file] for input: [reason]"
2960
2961							The C method was unable to open the specified file for
2962							whatever reason. Solution: make sure the file exists and the script
2963							has the required privileges to read it.
2964
2965							=head1 NOTES
2966
2967							This module has been tested successfully with a range of versions
2968							of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969
2970							The expressions produced by this module can be used with the PCRE
2971							library.
2972
2973							Remember to "double up" your backslashes if the patterns are
2974							hard-coded as constants in your program. That is, you should
2975							literally C rather than C. It
2976							usually will work either way, but it's good practice to do so.
2977
2978							Where possible, supply the simplest tokens possible. Don't add
2979							C when C will do. The reason is that
2980							if you also add C the resulting assembly changes
2981							dramatically: C I
2982							C. Since R::A doesn't perform enough analysis,
2983							it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984							the divergence after the first C<-d\d+>.
2985
2986							Furthermore, when the string 'X-123000P' is matched against the
2987							first assembly, the regexp engine will have to backtrack over each
2988							alternation (the one that ends in Y B the one that ends in Z)
2989							before determining that there is no match. No such backtracking
2990							occurs in the second pattern: as soon as the engine encounters the
2991							'P' in the target string, neither of the alternations at that point
2992							(C<-\d+Y> or C) could succeed and so the match fails.
2993
2994							C does, however, know how to build character
2995							classes. Given C, C and C, it will assemble these
2996							into C. When C<-> (dash) appears as a candidate for a
2997							character class it will be the first character in the class. When
2998							C<^> (circumflex) appears as a candidate for a character class it
2999							will be the last character in the class.
3000
3001							It also knows about meta-characters than can "absorb" regular
3002							characters. For instance, given C and C, it knows that
3003							C<5> can be represented by C<\d> and so the assembly is just C.
3004							The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005							and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006							C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007							if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008
3009							C deals correctly with C's propensity
3010							to backslash many characters that have no need to be. Backslashes on
3011							non-metacharacters will be removed. Similarly, in character classes,
3012							a number of characters lose their magic and so no longer need to be
3013							backslashed within a character class. Two common examples are C<.>
3014							(dot) and C<$>. Such characters will lose their backslash.
3015
3016							At the same time, it will also process C<\Q...\E> sequences. When
3017							such a sequence is encountered, the inner section is extracted and
3018							C is applied to the section. The resulting quoted text
3019							is then used in place of the original unquoted text, and the C<\Q>
3020							and C<\E> metacharacters are thrown away. Similar processing occurs
3021							with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022							effects when using a dispatch table. In this case, you will need
3023							to know exactly what the module makes of your input. Use the C
3024							method to find out what's going on:
3025
3026							$pattern = join( '', @{$re->lexstr($pattern)} );
3027
3028							If all the digits 0..9 appear in a character class, C
3029							will replace them by C<\d>. I'd do it for letters as well, but
3030							thinking about accented characters and other glyphs hurts my head.
3031
3032							In an alternation, the longest paths are chosen first (for example,
3033							C). When two paths have the same length, the path
3034							with the most subpaths will appear first. This aims to put the
3035							"busiest" paths to the front of the alternation. For example, the
3036							list C, C, C, C and C will produce the
3037							pattern C<(?:f(?:ew\|ig\|un)\|b(?:ad\|it))>. See F for a
3038							real-world example of how alternations are sorted. Once you have
3039							looked at that, everything should be crystal clear.
3040
3041							When tracking is in use, no reduction is performed. nor are
3042							character classes formed. The reason is that it is
3043							too difficult to determine the original pattern afterwards. Consider the
3044							two patterns C and C. These should be reduced to
3045							C. The final character matches one of two possibilities.
3046							To resolve whether it matched an C<'e'> or C<'m'> would require
3047							keeping track of the fact that the pattern finished up in a character
3048							class, which would the require a whole lot more work to figure out
3049							which character of the class matched. Without character classes
3050							it becomes much easier. Instead, C is produced, which
3051							lets us find out more simply where we ended up.
3052
3053							Similarly, C and C should form C<(?:dog\|sea)food>.
3054							When the pattern is being assembled, the tracking decision needs
3055							to be made at the end of the grouping, but the tail of the pattern
3056							has not yet been visited. Deferring things to make this work correctly
3057							is a vast hassle. In this case, the pattern becomes merely
3058							C<(?:dogfood\|seafood>. Tracked patterns will therefore be bulkier than
3059							simple patterns.
3060
3061							There is an open bug on this issue:
3062
3063							L
3064
3065							If this bug is ever resolved, tracking would become much easier to
3066							deal with (none of the C hassle would be required - you could
3067							just match like a regular RE and it would Just Work).
3068
3069							=head1 SEE ALSO
3070
3071							=over 4
3072
3073							=item L
3074
3075							General information about Perl's regular expressions.
3076
3077							=item L
3078
3079							Specific information about C.
3080
3081							=item Regex::PreSuf
3082
3083							C takes a string and chops it itself into tokens of
3084							length 1. Since it can't deal with tokens of more than one character,
3085							it can't deal with meta-characters and thus no regular expressions.
3086							Which is the main reason why I wrote this module.
3087
3088							=item Regexp::Optimizer
3089
3090							C produces regular expressions that are similar to
3091							those produced by R::A with reductions switched off. It's biggest
3092							drawback is that it is exponentially slower than Regexp::Assemble on
3093							very large sets of patterns.
3094
3095							=item Regexp::Parser
3096
3097							Fine grained analysis of regular expressions.
3098
3099							=item Regexp::Trie
3100
3101							Funnily enough, this was my working name for C
3102							during its development. I changed the name because I thought it
3103							was too obscure. Anyway, C does much the same as
3104							C and C except that it runs
3105							much faster (according to the author). It does not recognise
3106							meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107
3108							=item Text::Trie
3109
3110							C is well worth investigating. Tries can outperform very
3111							bushy (read: many alternations) patterns.
3112
3113							=item Tree::Trie
3114
3115							C is another module that builds tries. The algorithm that
3116							C uses appears to be quite similar to the
3117							algorithm described therein, except that C solves its
3118							end-marker problem without having to rewrite the leaves.
3119
3120							=back
3121
3122							=head1 See Also
3123
3124							For alternatives to this module, consider one of:
3125
3126							=over 4
3127
3128							=item o L
3129
3130							=item o L
3131
3132							=item o L
3133
3134							=back
3135
3136							=head1 LIMITATIONS
3137
3138							Some mildly complex cases are not handled well. See examples/failure.01.pl
3139							and L.
3140
3141							See also L for a discussion
3142							of some of the issues arising with the use of a huge number of alterations. Thanx to
3143							Slaven Rezic for the details of trie 'v' non-trie operations within Perl which influence
3144							regexp handling of alternations.
3145
3146							does not attempt to find common substrings. For
3147							instance, it will not collapse C down to C.
3148							If there's a module out there that performs this sort of string
3149							analysis I'd like to know about it. But keep in mind that the
3150							algorithms that do this are very expensive: quadratic or worse.
3151
3152							C does not interpret meta-character modifiers.
3153							For instance, if the following two patterns are
3154							given: C and C, it will not determine that C<\d> can be
3155							matched by C<\d+>. Instead, it will produce C. Along
3156							a similar line of reasoning, it will not determine that C and
3157							C is equivalent to C (It will produce C
3158							instead).
3159
3160							You cannot remove a pattern that has been added to an object. You'll
3161							just have to start over again. Adding a pattern is difficult enough,
3162							I'd need a solid argument to convince me to add a C method.
3163							If you need to do this you should read the documentation for the
3164							C method.
3165
3166							C does not (yet)? employ the C<(?E...)>
3167							construct.
3168
3169							The module does not produce POSIX-style regular expressions. This
3170							would be quite easy to add, if there was a demand for it.
3171
3172							=head1 BUGS
3173
3174							Patterns that generate look-ahead assertions sometimes produce
3175							incorrect patterns in certain obscure corner cases. If you
3176							suspect that this is occurring in your pattern, disable
3177							lookaheads.
3178
3179							Tracking doesn't really work at all with 5.6.0. It works better
3180							in subsequent 5.6 releases. For maximum reliability, the use of
3181							a 5.8 release is strongly recommended. Tracking barely works with
3182							5.005_04. Of note, using C<\d>-style meta-characters invariably
3183							causes panics. Tracking really comes into its own in Perl 5.10.
3184
3185							If you feed C patterns with nested parentheses,
3186							there is a chance that the resulting pattern will be uncompilable
3187							due to mismatched parentheses (not enough closing parentheses). This
3188							is normal, so long as the default lexer pattern is used. If you want
3189							to find out which pattern among a list of 3000 patterns are to blame
3190							(speaking from experience here), the F script offers
3191							a strategy for pinpointing the pattern at fault. While you may not
3192							be able to use the script directly, the general approach is easy to
3193							implement.
3194
3195							The algorithm used to assemble the regular expressions makes extensive
3196							use of mutually-recursive functions (that is, A calls B, B calls
3197							A, ...) For deeply similar expressions, it may be possible to provoke
3198							"Deep recursion" warnings.
3199
3200							The module has been tested extensively, and has an extensive test
3201							suite (that achieves close to 100% statement coverage), but you
3202							never know... A bug may manifest itself in two ways: creating a
3203							pattern that cannot be compiled, such as C, or a pattern
3204							that compiles correctly but that either matches things it shouldn't,
3205							or doesn't match things it should. It is assumed that Such problems
3206							will occur when the reduction algorithm encounters some sort of
3207							edge case. A temporary work-around is to disable reductions:
3208
3209							my $pattern = $assembler->reduce(0)->re;
3210
3211							A discussion about implementation details and where bugs might lurk
3212							appears in the README file. If this file is not available locally,
3213							you should be able to find a copy on the Web at your nearest CPAN
3214							mirror.
3215
3216							Seriously, though, a number of people have been using this module to
3217							create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3218							be working according to spec. Thus, I don't think there are any serious
3219							bugs remaining.
3220
3221							If you are feeling brave, extensive debugging traces are available to
3222							figure out where assembly goes wrong.
3223
3224							Please report all bugs at
3225							L
3226
3227							Make sure you include the output from the following two commands:
3228
3229							perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3230							perl -V
3231
3232							There is a mailing list for the discussion of C.
3233							Subscription details are available at
3234							L.
3235
3236							=head1 ACKNOWLEDGEMENTS
3237
3238							This module grew out of work I did building access maps for Postfix,
3239							a modern SMTP mail transfer agent. See L
3240							for more information. I used Perl to build large regular expressions
3241							for blocking dynamic/residential IP addresses to cut down on spam
3242							and viruses. Once I had the code running for this, it was easy to
3243							start adding stuff to block really blatant spam subject lines, bogus
3244							HELO strings, spammer mailer-ids and more...
3245
3246							I presented the work at the French Perl Workshop in 2004, and the
3247							thing most people asked was whether the underlying mechanism for
3248							assembling the REs was available as a module. At that time it was
3249							nothing more that a twisty maze of scripts, all different. The
3250							interest shown indicated that a module was called for. I'd like to
3251							thank the people who showed interest. Hey, it's going to make I
3252							messy scripts smaller, in any case.
3253
3254							Thomas Drugeon was a valuable sounding board for trying out
3255							early ideas. Jean Forget and Philippe Blayo looked over an early
3256							version. H.Merijn Brandt stopped over in Paris one evening, and
3257							discussed things over a few beers.
3258
3259							Nicholas Clark pointed out that while what this module does
3260							(?:c\|sh)ould be done in perl's core, as per the 2004 TODO, he
3261							encouraged me to continue with the development of this module. In
3262							any event, this module allows one to gauge the difficulty of
3263							undertaking the endeavour in C. I'd rather gouge my eyes out with
3264							a blunt pencil.
3265
3266							Paul Johnson settled the question as to whether this module should
3267							live in the Regex:: namespace, or Regexp:: namespace. If you're
3268							not convinced, try running the following one-liner:
3269
3270							perl -le 'print ref qr//'
3271
3272							Philippe Bruhat found a couple of corner cases where this module
3273							could produce incorrect results. Such feedback is invaluable,
3274							and only improves the module's quality.
3275
3276							=head1 Machine-Readable Change Log
3277
3278							The file Changes was converted into Changelog.ini by L.
3279
3280							=head1 AUTHOR
3281
3282							David Landgren
3283
3284							Copyright (C) 2004-2011. All rights reserved.
3285
3286							http://www.landgren.net/perl/
3287
3288							If you use this module, I'd love to hear about what you're using
3289							it for. If you want to be informed of updates, send me a note.
3290
3291							Ron Savage is co-maint of the module, starting with V 0.36.
3292
3293							=head1 Repository
3294
3295							L
3296
3297							=head1 TODO
3298
3299							1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3300							produces (?:conten[dt]\|resend[dt]) but it is possible to produce
3301							(?:cont\|res)en[dt] if one can spot the common tail nodes (and walk back
3302							the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3303
3304							To do this requires a certain amount of restructuring of the code.
3305							Currently, the algorithm uses a two-phase approach. In the first
3306							phase, the trie is traversed and reductions are performed. In the
3307							second phase, the reduced trie is traversed and the pattern is
3308							emitted.
3309
3310							What has to occur is that the reduction and emission have to occur
3311							together. As a node is completed, it is replaced by its string
3312							representation. This then allows child nodes to be compared for
3313							equality with a simple 'eq'. Since there is only a single traversal,
3314							the overall generation time might drop, even though the context
3315							baggage required to delve through the tree will be more expensive
3316							to carry along (a hash rather than a couple of scalars).
3317
3318							Actually, a simpler approach is to take on a secret sentinel
3319							atom at the end of every pattern, which gives the reduction
3320							algorithm sufficient traction to create a perfect trie.
3321
3322							I'm rewriting the reduction code using this technique.
3323
3324							2. Investigate how (?>foo) works. Can it be applied?
3325
3326							5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3327
3328							6. Store callbacks per tracked pattern.
3329
3330							12. utf-8... hmmmm...
3331
3332							14. Adding qr//'ed patterns. For example, consider
3333							$r->add ( qr/^abc/i )
3334							->add( qr/^abd/ )
3335							->add( qr/^ab e/x );
3336							this should admit abc abC aBc aBC abd abe as matches
3337
3338							16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3339							can't happen. (Possibly carp if it does appear during traversal)?
3340
3341							17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3342							along the lines of (.{4))(\1+) would let the regexp engine
3343							itself be brought to bear on the matter, which is a rather
3344							appealing idea. Consider
3345
3346							while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3347
3348							as a starting point.
3349
3350							19. The reduction code has become unbelievably baroque. Adding code
3351							to handle (sting,singing,sing) => s(?:(?:ing)?\|t)ing was far
3352							too difficult. Adding more stuff just breaks existing behaviour.
3353							And fixing the ^abcd$ ... bug broke stuff all over again.
3354							Now that the corner cases are more clearly identified, a full
3355							rewrite of the reduction code is needed. And would admit the
3356							possibility of implementing items 1 and 17.
3357
3358							20. Handle debug unrev with a separate bit
3359
3360							23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3361							regexp
3362
3363							24. Lookahead assertions contain serious bugs (as shown by
3364							assembling powersets. Need to save more context during reduction,
3365							which in turn will simplify the preparation of the lookahead
3366							classes. See also 19.
3367
3368							26. _lex() swamps the overall run-time. It stems from the decision
3369							to use a single regexp to pull apart any pattern. A suite of
3370							simpler regexp to pick of parens, char classes, quantifiers
3371							and bare tokens may be faster. (This has been implemented as
3372							_fastlex(), but it's only marginally faster. Perhaps split-by-
3373							char and lex a la C?
3374
3375							27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3376
3377							28. We don't reroll unrolled a a* to a+ in indented or tracked
3378							output
3379
3380							29. Use (MARK n) in blead for tracked patterns, and use (FAIL) for
3381							the unmatchable pattern.
3382
3383							=head1 LICENSE
3384
3385							This library is free software; you can redistribute it and/or modify
3386							it under the same terms as Perl itself.
3387
3388							=cut
3389
3390							# Return a +ve value to tell Perl the module is ready to go.
3391
3392							'The Lusty Decadent Delights of Imperial Pompeii';