File Coverage

blib/lib/Regexp/Assemble.pm

Criterion	Covered	Total	%
statement	1149	1155	99.4
branch	782	796	98.2
condition	164	177	92.6
subroutine	98	98	100.0
pod	49	49	100.0
total	2242	2275	98.5

line	stmt	bran	cond	sub	pod	time	code
1							# Regexp::Assemple.pm
2							#
3							# Copyright (c) 2004-2011 David Landgren
4							# All rights reserved
5
6							package Regexp::Assemble;
7
8	11			11		338167	use strict;
	11					19
	11					359
9	11			11		50	use warnings;
	11					16
	11					386
10
11	11			11		48	use constant DEBUG_ADD => 1;
	11					17
	11					905
12	11			11		53	use constant DEBUG_TAIL => 2;
	11					14
	11					580
13	11			11		54	use constant DEBUG_LEX => 4;
	11					15
	11					600
14	11			11		62	use constant DEBUG_TIME => 8;
	11					39
	11					663
15
16	11			11		57	use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
	11					15
	11					52151
17
18							# The following patterns were generated with examples/naive.
19
20							$Default_Lexer = qr/(?![[(\\]).(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?\|\\(?:[bABCEGLQUXZ]\|[lu].\|(?:[^\w]\|[aefnrtdDwWsS]\|c.\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}})\|N\{\w+\}\|[Pp](?:\{\w+\}\|.))(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?)\|\[.*?(?
21
22							$Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]\|c.\|[^\w\/{\|}-]\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}}))\|[^\$^])$/;
23
24							# The pattern to return when nothing has been added (and thus not match anything)
25
26							$Always_Fail = "^\\b\0";
27
28							our $VERSION = '0.36';
29
30							# ------------------------------------------------
31
32							sub new {
33	2197			2197	1	1878970	my $class = shift;
34	2197					5102	my %args = @_;
35
36	2197					2196	my $anc;
37	2197					3799	for $anc (qw(word line string)) {
38	6591	100				15952	if (exists $args{"anchor_$anc"}) {
39	135					267	my $val = delete $args{"anchor_$anc"};
40	135					397	for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41	270	100				928	$args{$anchor} = $val unless exists $args{$anchor};
42							}
43							}
44							}
45
46							# anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47	2197	100				5147	if (exists $args{anchor_string_absolute}) {
48	3					5	my $val = delete $args{anchor_string_absolute};
49	3					4	for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50	6	100				16	$args{$anchor} = $val unless exists $args{$anchor};
51							}
52							}
53
54	2197		100			49118	exists $args{$_} or $args{$_} = 0 for qw(
55							anchor_word_begin
56							anchor_word_end
57							anchor_line_begin
58							anchor_line_end
59							anchor_string_begin
60							anchor_string_end
61							anchor_string_end_absolute
62							debug
63							dup_warn
64							indent
65							lookahead
66							mutable
67							track
68							unroll_plus
69							);
70
71	2197		100			14315	exists $args{$_} or $args{$_} = 1 for qw(
72							fold_meta_pairs
73							reduce
74							chomp
75							);
76
77	2197					6180	@args{qw(re str path)} = (undef, undef, []);
78
79	2197		100			13933	$args{flags} \|\|= delete $args{modifiers} \|\| '';
			100
80	2197	100				4428	$args{lex} = $Current_Lexer if defined $Current_Lexer;
81
82	2197					3852	my $self = bless \%args, $class;
83
84	2197	100				5412	if ($self->_debug(DEBUG_TIME)) {
85	1					5	$self->_init_time_func();
86	1					5	$self->{_begin_time} = $self->{_time_func}->();
87							}
88							$self->{input_record_separator} = delete $self->{rs}
89	2197	100				4659	if exists $self->{rs};
90	2197	100				3947	exists $self->{file} and $self->add_file($self->{file});
91
92	2196					5466	return $self;
93							}
94
95							sub _init_time_func {
96	9			9		20	my $self = shift;
97	9	100				22	return if exists $self->{_time_func};
98
99							# attempt to improve accuracy
100	6	100				16	if (!defined($self->{_use_time_hires})) {
101	5					14	eval {require Time::HiRes};
	5					1223
102	5					2751	$self->{_use_time_hires} = $@;
103							}
104							$self->{_time_func} = length($self->{_use_time_hires}) > 0
105	4			4		10	? sub { time }
106	6	100				41	: \&Time::HiRes::time
107							;
108							}
109
110							sub clone {
111	55			55	1	518	my $self = shift;
112	55					70	my $clone;
113	55					354	my @attr = grep {$_ ne 'path'} keys %$self;
	1314					1927
114	55					145	@{$clone}{@attr} = @{$self}{@attr};
	55					640
	55					159
115	55					177	$clone->{path} = _path_clone($self->_path);
116	55					600	bless $clone, ref($self);
117							}
118
119							sub _fastlex {
120	884			884		821	my $self = shift;
121	884					776	my $record = shift;
122	884					700	my $len = 0;
123	884					1022	my @path = ();
124	884					806	my $case = '';
125	884					671	my $qm = '';
126
127	884					941	my $debug = $self->{debug} & DEBUG_LEX;
128	884					788	my $unroll_plus = $self->{unroll_plus};
129
130	884					737	my $token;
131							my $qualifier;
132	884	100				3118	$debug and print "# _lex <$record>\n";
133	884					880	my $modifier = q{(?:[+?]\\??\|\\{(?:\\d+(?:,\d)?\|,\d+)\\}\\??)?};
134	884					2410	my $class_matcher = qr/\[(?:\[:[a-z]+:\]\|\\?.)*?\]/;
135	884					3362	my $paren_matcher = qr/\(.*?(?
136	884					2553	my $misc_matcher = qr/(?:(c)(.)\|(0)(\d{2}))($modifier)/;
137	884					2373	my $regular_matcher = qr/([^\\[(])($modifier)/;
138	884					1432	my $qm_matcher = qr/(\\?.)/;
139
140	884					805	my $matcher = $regular_matcher;
141							{
142	884	100				675	if ($record =~ /\G$matcher/gc) {
	5960	100				24296
		100
		100
143							# neither a \\ nor [ nor ( followed by a modifer
144	3344	100	100			9624	if ($1 eq '\\E') {
		100	66
145	12	100				118	$debug and print "# E\n";
146	12					18	$case = $qm = '';
147	12					13	$matcher = $regular_matcher;
148	12					14	redo;
149							}
150							elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151	5	100				13	$debug and print "# ignore \\L, \\U\n";
152	5					14	redo;
153							}
154	3327					3049	$token = $1;
155	3327	100				4401	$qualifier = defined $2 ? $2 : '';
156	3327	100				7522	$debug and print "# token <$token> <$qualifier>\n";
157	3327	100				3555	if ($qm) {
158	90					95	$token = quotemeta($token);
159	90					134	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
160							}
161							else {
162	3237					3770	$token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163							}
164	3327	100	100			5559	if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165	22	100				56	$1 and $qualifier .= $1;
166	22	100				885	$debug and print " unroll <$token><$token><$qualifier>\n";
167	22	100				53	$case and $token = $case eq 'L' ? lc($token) : uc($token);
		100
168	22					86	push @path, $token, "$token$qualifier";
169							}
170							else {
171	3305	100				5888	$debug and print " clean <$token>\n";
172	3305	100				6703	push @path,
		100
173							$case eq 'L' ? lc($token).$qualifier
174							: $case eq 'U' ? uc($token).$qualifier
175							: $token.$qualifier
176							;
177							}
178	3327					2892	redo;
179							}
180
181							elsif ($record =~ /\G\\/gc) {
182	1680	100				3447	$debug and print "# backslash\n";
183							# backslash
184	1680	100				13206	if ($record =~ /\G([sdwSDW])($modifier)/gc) {
		100
		100
		100
		100
		100
		100
		100
185	443					616	($token, $qualifier) = ($1, $2);
186	443	100				871	$debug and print "# meta <$token> <$qualifier>\n";
187	443	100	100			1094	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
188							? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189							: "\\$token$qualifier";
190							}
191							elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192	9	100				294	$debug and print "# x $1\n";
193	9					37	$token = quotemeta(chr(hex($1)));
194	9					13	$qualifier = $2;
195	9	100				284	$debug and print "# cooked <$token>\n";
196	9					31	$token =~ s/^\\([^\w$()*+.?\[\\\]^\|{\/])$/$1/; # } balance
197	9	100				307	$debug and print "# giving <$token>\n";
198	9	100	100			67	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
199							? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200							: "$token$qualifier";
201							}
202							elsif ($record =~ /\GQ/gc) {
203	26	100				232	$debug and print "# Q\n";
204	26					30	$qm = 1;
205	26					31	$matcher = $qm_matcher;
206							}
207							elsif ($record =~ /\G([LU])/gc) {
208	15	100				413	$debug and print "# case $1\n";
209	15					33	$case = $1;
210							}
211							elsif ($record =~ /\GE/gc) {
212	6	100				17	$debug and print "# E\n";
213	6					9	$case = $qm = '';
214	6					10	$matcher = $regular_matcher;
215							}
216							elsif ($record =~ /\G([lu])(.)/gc) {
217	6	100				18	$debug and print "# case $1 to <$2>\n";
218	6	100				35	push @path, $1 eq 'l' ? lc($2) : uc($2);
219							}
220	30					40	elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221	6	50				12	if ($] < 5.007) {
222	0					0	my $len = 0;
223	0					0	$len += length($_) for @arg;
224	0	0				0	$debug and print "# pos ", pos($record), " fixup add $len\n";
225	0					0	pos($record) = pos($record) + $len;
226							}
227	6					8	my $directive = shift @arg;
228	6	100				30	if ($directive eq 'c') {
229	3	100				10	$debug and print "# ctrl <@arg>\n";
230	3					8	push @path, "\\c" . uc(shift @arg);
231							}
232							else { # elsif ($directive eq '0') {
233	3	100				10	$debug and print "# octal <@arg>\n";
234	3					10	my $ascii = oct(shift @arg);
235	3	100				11	push @path, ($ascii < 32)
236							? "\\c" . chr($ascii+64)
237							: chr($ascii)
238							;
239							}
240	6					9	$path[-1] .= join( '', @arg ); # if @arg;
241	6					10	redo;
242							}
243							elsif ($record =~ /\G(.)/gc) {
244	1165					1712	$token = $1;
245	1165					2273	$token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@\|/^]}{\\$token};
246	1165	100				1698	$debug and print "# meta <$token>\n";
247	1165					1741	push @path, $token;
248							}
249							else {
250	4	100				19	$debug and print "# ignore char at ", pos($record), " of <$record>\n";
251							}
252	1674					2203	redo;
253							}
254
255							elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256							# [class] followed by a modifer
257	39					65	my $class = $1;
258	39	50				82	my $qualifier = defined $2 ? $2 : '';
259	39	100				375	$debug and print "# class begin <$class> <$qualifier>\n";
260	39	100				115	if ($class =~ /\A\[\\?(.)]\Z/) {
261	12					25	$class = quotemeta $1;
262	12					23	$class =~ s{\A\\([!@%])\Z}{$1};
263	12	100				143	$debug and print "# class unwrap $class\n";
264							}
265	39	100				353	$debug and print "# class end <$class> <$qualifier>\n";
266	39	100	100			178	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
267							? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268							: "$class$qualifier";
269	39					58	redo;
270							}
271
272							elsif ($record =~ /\G($paren_matcher)/gc) {
273	13	100				35	$debug and print "# paren <$1>\n";
274							# (paren) followed by a modifer
275	13					34	push @path, $1;
276	13					22	redo;
277							}
278
279							}
280	884					3622	return \@path;
281							}
282
283							sub _lex {
284	211			211		359	my $self = shift;
285	211					220	my $record = shift;
286	211					205	my $len = 0;
287	211					282	my @path = ();
288	211					214	my $case = '';
289	211					201	my $qm = '';
290							my $re = defined $self->{lex} ? $self->{lex}
291	211	50				556	: defined $Current_Lexer ? $Current_Lexer
		100
292							: $Default_Lexer;
293	211					258	my $debug = $self->{debug} & DEBUG_LEX;
294	211	100				2018	$debug and print "# _lex <$record>\n";
295	211					196	my ($token, $next_token, $diff, $token_len);
296	211					3693	while( $record =~ /($re)/g ) {
297	480					939	$token = $1;
298	480					646	$token_len = length($token);
299	480	100				8723	$debug and print "# lexed <$token> len=$token_len\n";
300	480	100				1066	if( pos($record) - $len > $token_len ) {
301	15					19	$next_token = $token;
302	15					32	$token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303	15	100				814	$debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304	15					23	$len += $diff;
305							}
306	480					450	$len += $token_len;
307							TOKEN: {
308	480	100				406	if( substr( $token, 0, 1 ) eq '\\' ) {
	495					980
309	226	100				1132	if( $token =~ /^\\([ELQU])$/ ) {
		100
		100
310	51	100				195	if( $1 eq 'E' ) {
		100
311							$qm and $re = defined $self->{lex} ? $self->{lex}
312	12	50				47	: defined $Current_Lexer ? $Current_Lexer
		100
		100
313							: $Default_Lexer;
314	12					27	$case = $qm = '';
315							}
316							elsif( $1 eq 'Q' ) {
317	19					28	$qm = $1;
318							# switch to a more precise lexer to quotemeta individual characters
319	19					71	$re = qr/\\?./;
320							}
321							else {
322	20					101	$case = $1;
323							}
324	51	100				1687	$debug and print "# state change qm=<$qm> case=<$case>\n";
325	51					716	goto NEXT_TOKEN;
326							}
327							elsif( $token =~ /^\\([lu])(.)$/ ) {
328	3	100				100	$debug and print "# apply case=<$1> to <$2>\n";
329	3	100				19	push @path, $1 eq 'l' ? lc($2) : uc($2);
330	3					44	goto NEXT_TOKEN;
331							}
332							elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333	41					113	$token = quotemeta(chr(hex($1)));
334	41	100				229	$debug and print "# cooked <$token>\n";
335	41					84	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
336	41	100				165	$debug and print "# giving <$token>\n";
337							}
338							else {
339	131					229	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
340	131	100				662	$debug and print "# backslashed <$token>\n";
341							}
342							}
343							else {
344	269	100				480	$case and $token = $case eq 'U' ? uc($token) : lc($token);
		100
345	269	100				434	$qm and $token = quotemeta($token);
346	269	100				466	$token = '\\/' if $token eq '/';
347							}
348							# undo quotemeta's brute-force escapades
349	441	100				736	$qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
350	441	100				7576	$debug and print "# <$token> case=<$case> qm=<$qm>\n";
351	441					801	push @path, $token;
352
353							NEXT_TOKEN:
354	495	100				4271	if( defined $next_token ) {
355	15	100				937	$debug and print "# redo <$next_token>\n";
356	15					27	$token = $next_token;
357	15					16	$next_token = undef;
358	15					20	redo TOKEN;
359							}
360							}
361							}
362	211	100				378	if( $len < length($record) ) {
363							# NB: the remainder only arises in the case of degenerate lexer,
364							# and if \Q is operative, the lexer will have been switched to
365							# /\\?./, which means there can never be a remainder, so we
366							# don't have to bother about quotemeta. In other words:
367							# $qm will never be true in this block.
368	7					16	my $remain = substr($record,$len);
369	7	100				22	$case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
		100
370	7	100				630	$debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371	7					26	push @path, $remain;
372							}
373	211	100				2056	$debug and print "# _lex out <@path>\n";
374	211					1540	return \@path;
375							}
376
377							sub add {
378	1069			1069	1	13649	my $self = shift;
379	1069					928	my $record;
380	1069					1391	my $debug = $self->{debug} & DEBUG_LEX;
381	1069					2449	while( defined( $record = shift @_ )) {
382	2279	100				4411	CORE::chomp($record) if $self->{chomp};
383	2279	100	100			4812	next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384	2278	100				6925	$debug and print "# add <$record>\n";
385	2278					2692	$self->{stats_raw} += length $record;
386							my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387	2278	100				14642	? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
		100
388							: [split //, $record]
389							;
390	2278	100	100			5794	next if $self->{filter} and not $self->{filter}->(@$list);
391	2277					4091	$self->_insertr( $list );
392							}
393	1069					2305	return $self;
394							}
395
396							sub add_file {
397	13			13	1	19	my $self = shift;
398	13					14	my $rs;
399							my @file;
400	13	100				37	if (ref($_[0]) eq 'HASH') {
401	6					10	my $arg = shift;
402							$rs = $arg->{rs}
403							\|\| $arg->{input_record_separator}
404							\|\| $self->{input_record_separator}
405	6		66			40	\|\| $/;
406							@file = ref($arg->{file}) eq 'ARRAY'
407	4					13	? @{$arg->{file}}
408	6	100				21	: $arg->{file};
409							}
410							else {
411	7		66			29	$rs = $self->{input_record_separator} \|\| $/;
412	7					18	@file = @_;
413							}
414	13					52	local $/ = $rs;
415	13					14	my $file;
416	13					24	for $file (@file) {
417	15	100				1531	open my $fh, '<', $file or do {
418	1					13	require Carp;
419	1					247	Carp::croak("cannot open $file for input: $!");
420							};
421	14					23739	while (defined (my $rec = <$fh>)) {
422	55					132	$self->add($rec);
423							}
424	14					171	close $fh;
425							}
426	12					74	return $self;
427							}
428
429							sub insert {
430	3398			3398	1	12413	my $self = shift;
431	3398	100	100			7647	return if $self->{filter} and not $self->{filter}->(@_);
432	3397					21858	$self->_insertr( [@_] );
433	3397					7273	return $self;
434							}
435
436							sub _insertr {
437	7084			7084		6984	my $self = shift;
438	7084		100			20848	my $dup = $self->{stats_dup} \|\| 0;
439	7084					11252	$self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440	7084	100	100			18935	if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
		100
441	7073					8313	++$self->{stats_add};
442	7073	100				5888	$self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
	7073					37982
443							}
444							elsif( $self->{dup_warn} ) {
445	4	100				10	if( ref $self->{dup_warn} eq 'CODE' ) {
446	2					7	$self->{dup_warn}->($self, $_[0]);
447							}
448							else {
449	2					2	my $pattern = join( '', @{$_[0]} );
	2					5
450	2					11	require Carp;
451	2					36	Carp::carp("duplicate pattern added: /$pattern/");
452							}
453							}
454	7084					18152	$self->{str} = $self->{re} = undef;
455							}
456
457							sub lexstr {
458	2			2	1	13	return shift->_lex(shift);
459							}
460
461							sub pre_filter {
462	3			3	1	623	my $self = shift;
463	3					7	my $pre_filter = shift;
464	3	100	100			19	if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465	1					5	require Carp;
466	1					69	Carp::croak("pre_filter method not passed a coderef");
467							}
468	2					2	$self->{pre_filter} = $pre_filter;
469	2					6	return $self;
470							}
471
472
473							sub filter {
474	4			4	1	307	my $self = shift;
475	4					7	my $filter = shift;
476	4	100	100			22	if( defined $filter and ref($filter) ne 'CODE' ) {
477	1					9	require Carp;
478	1					166	Carp::croak("filter method not passed a coderef");
479							}
480	3					7	$self->{filter} = $filter;
481	3					9	return $self;
482							}
483
484							sub as_string {
485	800			800	1	1388	my $self = shift;
486	800	100				1513	if( not defined $self->{str} ) {
487	798	100				1356	if( $self->{track} ) {
488	8					22	$self->{m} = undef;
489	8					23	$self->{mcount} = 0;
490	8					23	$self->{mlist} = [];
491	8					27	$self->{str} = _re_path_track($self, $self->_path, '', '');
492							}
493							else {
494	790	100	100			4327	$self->_reduce unless ($self->{mutable} or not $self->{reduce});
495	790					1217	my $arg = {@_};
496							$arg->{indent} = $self->{indent}
497	790	100	100			3689	if not exists $arg->{indent} and $self->{indent} > 0;
498	790	100	100			2595	if( exists $arg->{indent} and $arg->{indent} > 0 ) {
		100
499	42					81	$arg->{depth} = 0;
500	42					101	$self->{str} = _re_path_pretty($self, $self->_path, $arg);
501							}
502							elsif( $self->{lookahead} ) {
503	35					90	$self->{str} = _re_path_lookahead($self, $self->_path);
504							}
505							else {
506	713					1356	$self->{str} = _re_path($self, $self->_path);
507							}
508							}
509	798	100				1835	if (not length $self->{str}) {
510							# explicitly fail to match anything if no pattern was generated
511	9					19	$self->{str} = $Always_Fail;
512							}
513							else {
514							my $begin =
515							$self->{anchor_word_begin} ? '\\b'
516							: $self->{anchor_line_begin} ? '^'
517	789	100				2495	: $self->{anchor_string_begin} ? '\A'
		100
		100
518							: ''
519							;
520							my $end =
521							$self->{anchor_word_end} ? '\\b'
522							: $self->{anchor_line_end} ? '$'
523							: $self->{anchor_string_end} ? '\Z'
524	789	100				2439	: $self->{anchor_string_end_absolute} ? '\z'
		100
		100
		100
525							: ''
526							;
527	789					1520	$self->{str} = "$begin$self->{str}$end";
528							}
529	798	100				2024	$self->{path} = [] unless $self->{mutable};
530							}
531	800					4493	return $self->{str};
532							}
533
534							sub re {
535	122			122	1	1345	my $self = shift;
536	122	100				486	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
537	122					644	return $self->{re};
538							}
539
540							use overload '""' => sub {
541	2131			2131		692647	my $self = shift;
542	2131	100				17310	return $self->{re} if $self->{re};
543	489					972	$self->_build_re($self->as_string());
544	489					6245	return $self->{re};
545	11			11		16230	};
	11					12073
	11					128
546
547							sub _build_re {
548	618			618		658	my $self = shift;
549	618					610	my $str = shift;
550	618	100				1161	if( $self->{track} ) {
551	11			11		1059	use re 'eval';
	11					18
	11					128533
552							$self->{re} = length $self->{flags}
553	8	100				1571	? qr/(?$self->{flags}:$str)/
554							: qr/$str/
555							;
556							}
557							else {
558							# how could I not repeat myself?
559							$self->{re} = length $self->{flags}
560	610	100				13474	? qr/(?$self->{flags}:$str)/
561							: qr/$str/
562							;
563							}
564							}
565
566							sub match {
567	29			29	1	6342	my $self = shift;
568	29					47	my $target = shift;
569	29	100				140	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
570	29					64	$self->{m} = undef;
571	29					61	$self->{mvar} = [];
572	29	100				1014	if( not $target =~ /$self->{re}/ ) {
573	8					23	$self->{mbegin} = [];
574	8					26	$self->{mend} = [];
575	8					57	return undef;
576							}
577	21	50				92	$self->{m} = $^R if $] >= 5.009005;
578	21					148	$self->{mbegin} = _path_copy([@-]);
579	21					132	$self->{mend} = _path_copy([@+]);
580	21					52	my $n = 0;
581	21					81	for( my $n = 0; $n < @-; ++$n ) {
582	43	100	66			257	push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
	33					281
583							if defined $-[$n] and defined $+[$n];
584							}
585	21	100				63	if( $self->{track} ) {
586	20	50				190	return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587							}
588							else {
589	1					56	return 1;
590							}
591							}
592
593							sub source {
594	4			4	1	387	my $self = shift;
595	4	100				20	return unless $self->{track};
596	3	100				17	defined($_[0]) and return $self->{mlist}[$_[0]];
597	2	100				15	return unless defined $self->{m};
598	1					6	return $self->{mlist}[$self->{m}];
599							}
600
601							sub mbegin {
602	3			3	1	12	my $self = shift;
603	3	100				30	return exists $self->{mbegin} ? $self->{mbegin} : [];
604							}
605
606							sub mend {
607	3			3	1	7	my $self = shift;
608	3	100				58	return exists $self->{mend} ? $self->{mend} : [];
609							}
610
611							sub mvar {
612	19			19	1	35	my $self = shift;
613	19	100				68	return undef unless exists $self->{mvar};
614	18	100				134	return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615							}
616
617							sub capture {
618	5			5	1	20	my $self = shift;
619	5	100				19	if( $self->{mvar} ) {
620	4					6	my @capture = @{$self->{mvar}};
	4					15
621	4					8	shift @capture;
622	4					20	return @capture;
623							}
624	1					8	return ();
625							}
626
627							sub matched {
628	9			9	1	476	my $self = shift;
629	9	100				82	return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630							}
631
632							sub stats_add {
633	2			2	1	6	my $self = shift;
634	2		100			10	return $self->{stats_add} \|\| 0;
635							}
636
637							sub stats_dup {
638	2			2	1	3	my $self = shift;
639	2		100			11	return $self->{stats_dup} \|\| 0;
640							}
641
642							sub stats_raw {
643	2			2	1	4	my $self = shift;
644	2		100			12	return $self->{stats_raw} \|\| 0;
645							}
646
647							sub stats_cooked {
648	2			2	1	5	my $self = shift;
649	2		100			11	return $self->{stats_cooked} \|\| 0;
650							}
651
652							sub stats_length {
653	6			6	1	2618	my $self = shift;
654	6	100	100			41	return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655							}
656
657							sub dup_warn {
658	5			5	1	1250	my $self = shift;
659	5	100				13	$self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660	5					11	return $self;
661							}
662
663							sub anchor_word_begin {
664	5			5	1	6	my $self = shift;
665	5	100				16	$self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666	5					17	return $self;
667							}
668
669							sub anchor_word_end {
670	4			4	1	3	my $self = shift;
671	4	100				6	$self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672	4					9	return $self;
673							}
674
675							sub anchor_word {
676	2			2	1	3	my $self = shift;
677	2					2	my $state = shift;
678	2					6	$self->anchor_word_begin($state)->anchor_word_end($state);
679	2					6	return $self;
680							}
681
682							sub anchor_line_begin {
683	4			4	1	4	my $self = shift;
684	4	100				12	$self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685	4					7	return $self;
686							}
687
688							sub anchor_line_end {
689	2			2	1	2	my $self = shift;
690	2	100				6	$self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691	2					2	return $self;
692							}
693
694							sub anchor_line {
695	2			2	1	3	my $self = shift;
696	2					2	my $state = shift;
697	2					5	$self->anchor_line_begin($state)->anchor_line_end($state);
698	2					5	return $self;
699							}
700
701							sub anchor_string_begin {
702	277			277	1	199	my $self = shift;
703	277	100				453	$self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704	277					411	return $self;
705							}
706
707							sub anchor_string_end {
708	276			276	1	203	my $self = shift;
709	276	100				421	$self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710	276					215	return $self;
711							}
712
713							sub anchor_string_end_absolute {
714	3			3	1	4	my $self = shift;
715	3	100				8	$self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716	3					8	return $self;
717							}
718
719							sub anchor_string {
720	274			274	1	248	my $self = shift;
721	274	100				406	my $state = defined($_[0]) ? $_[0] : 1;
722	274					387	$self->anchor_string_begin($state)->anchor_string_end($state);
723	274					685	return $self;
724							}
725
726							sub anchor_string_absolute {
727	2			2	1	3	my $self = shift;
728	2	100				7	my $state = defined($_[0]) ? $_[0] : 1;
729	2					6	$self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730	2					6	return $self;
731							}
732
733							sub debug {
734	605			605	1	2217	my $self = shift;
735	605	100				1241	$self->{debug} = defined($_[0]) ? $_[0] : 0;
736	605	100				911	if ($self->_debug(DEBUG_TIME)) {
737							# hmm, debugging time was switched on after instantiation
738	4					12	$self->_init_time_func;
739	4					14	$self->{_begin_time} = $self->{_time_func}->();
740							}
741	605					774	return $self;
742							}
743
744							sub dump {
745	9			9	1	1429	return _dump($_[0]->_path);
746							}
747
748							sub chomp {
749	22			22	1	584	my $self = shift;
750	22	100				56	$self->{chomp} = defined($_[0]) ? $_[0] : 1;
751	22					48	return $self;
752							}
753
754							sub fold_meta_pairs {
755	5			5	1	7	my $self = shift;
756	5	100				14	$self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757	5					10	return $self;
758							}
759
760							sub indent {
761	4			4	1	593	my $self = shift;
762	4	100				11	$self->{indent} = defined($_[0]) ? $_[0] : 0;
763	4					8	return $self;
764							}
765
766							sub lookahead {
767	22			22	1	32	my $self = shift;
768	22	100				82	$self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769	22					65	return $self;
770							}
771
772							sub flags {
773	24			24	1	1190	my $self = shift;
774	24	100				85	$self->{flags} = defined($_[0]) ? $_[0] : '';
775	24					84	return $self;
776							}
777
778							sub modifiers {
779	4			4	1	1130	my $self = shift;
780	4					9	return $self->flags(@_);
781							}
782
783							sub track {
784	5			5	1	1181	my $self = shift;
785	5	100				20	$self->{track} = defined($_[0]) ? $_[0] : 1;
786	5					12	return $self;
787							}
788
789							sub unroll_plus {
790	2			2	1	840	my $self = shift;
791	2	100				8	$self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792	2					8	return $self;
793							}
794
795							sub lex {
796	1			1	1	3	my $self = shift;
797	1					52	$self->{lex} = qr($_[0]);
798	1					6	return $self;
799							}
800
801							sub reduce {
802	19			19	1	975	my $self = shift;
803	19	100				74	$self->{reduce} = defined($_[0]) ? $_[0] : 1;
804	19					70	return $self;
805							}
806
807							sub mutable {
808	5			5	1	905	my $self = shift;
809	5	100				21	$self->{mutable} = defined($_[0]) ? $_[0] : 1;
810	5					9	return $self;
811							}
812
813							sub reset {
814							# reinitialise the internal state of the object
815	19			19	1	2451	my $self = shift;
816	19					45	$self->{path} = [];
817	19					48	$self->{re} = undef;
818	19					30	$self->{str} = undef;
819	19					82	return $self;
820							}
821
822							sub Default_Lexer {
823	4	100		4	1	3135	if( $_[0] ) {
824	3	100				13	if( my $refname = ref($_[0]) ) {
825	1					11	require Carp;
826	1					162	Carp::croak("Cannot pass a $refname to Default_Lexer");
827							}
828	2					5	$Current_Lexer = $_[0];
829							}
830	3	100				15	return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831							}
832
833							# --- no user serviceable parts below ---
834
835							# -- debug helpers
836
837							sub _debug {
838	12316			12316		10213	my $self = shift;
839	12316	100				37393	return $self->{debug} & shift() ? 1 : 0;
840							}
841
842							# -- helpers
843
844							sub _path {
845							# access the path
846	10027			10027		23461	return $_[0]->{path};
847							}
848
849							# -- the heart of the matter
850
851							$have_Storable = do {
852							eval {
853							require Storable;
854							import Storable 'dclone';
855							};
856							$@ ? 0 : 1;
857							};
858
859							sub _path_clone {
860	55	100		55		3533	$have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861							}
862
863							sub _path_copy {
864	80			80		661	my $path = shift;
865	80					88	my $new = [];
866	80					197	for( my $p = 0; $p < @$path; ++$p ) {
867	201	100				400	if( ref($path->[$p]) eq 'HASH' ) {
		100
868	9					16	push @$new, _node_copy($path->[$p]);
869							}
870							elsif( ref($path->[$p]) eq 'ARRAY' ) {
871	3					6	push @$new, _path_copy($path->[$p]);
872							}
873							else {
874	189					438	push @$new, $path->[$p];
875							}
876							}
877	80					189	return $new;
878							}
879
880							sub _node_copy {
881	11			11		11	my $node = shift;
882	11					12	my $new = {};
883	11					33	while( my( $k, $v ) = each %$node ) {
884	27	100				49	$new->{$k} = defined($v)
885							? _path_copy($v)
886							: undef
887							;
888							}
889	11					26	return $new;
890							}
891
892							sub _insert_path {
893	7158			7158		6708	my $self = shift;
894	7158					5836	my $list = shift;
895	7158					6522	my $debug = shift;
896	7158					5917	my @in = @{shift()}; # create a new copy
	7158					17541
897	7158	100				13612	if( @$list == 0 ) { # special case the first time
898	2033	100	100			9831	if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
			66
			66
899	33					221	return [{'' => undef}];
900							}
901							else {
902	2000					4686	return \@in;
903							}
904							}
905	5125	100				9268	$debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
	151					235
	151					176
906	5125					4905	my $path = $list;
907	5125					4172	my $offset = 0;
908	5125					4019	my $token;
909	5125	100				8533	if( not @in ) {
910	2	100				6	if( ref($list->[0]) ne 'HASH' ) {
911	1					4	return [ { '' => undef, $list->[0] => $list } ];
912							}
913							else {
914	1					2	$list->[0]{''} = undef;
915	1					2	return $list;
916							}
917							}
918	5123					10827	while( defined( $token = shift @in )) {
919	17747	100				34171	if( ref($token) eq 'HASH' ) {
920	282	100				551	$debug and print "# p0=", _dump($path), "\n";
921	282					777	$path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922	282	100				540	$debug and print "# p1=", _dump($path), "\n";
923	282					354	last;
924							}
925	17465	100				27747	if( ref($path->[$offset]) eq 'HASH' ) {
926	3714	100				5657	$debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
	54					220
927	3714					3558	my $node = $path->[$offset];
928	3714	100				5625	if( exists( $node->{$token} )) {
929	2632	100				4425	if ($offset < $#$path) {
930							my $new = {
931							$token => [$token, @in],
932	1					5	_re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
	1					3
933							};
934	1					4	splice @$path, $offset, @$path-$offset, $new;
935	1					2	last;
936							}
937							else {
938	2631	100				4083	$debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
	31					64
939	2631					2892	$path = $node->{$token};
940	2631					2190	$offset = 0;
941	2631					2992	redo;
942							}
943							}
944							else {
945	1082	100				1797	$debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
	23					52
	23					45
	23					1101
946	1082	100				1793	if( $offset == $#$path ) {
947	1075					3063	$node->{$token} = [ $token, @in ];
948							}
949							else {
950							my $new = {
951							_node_key($token) => [ $token, @in ],
952	7					20	_node_key($node) => [@{$path}[$offset..$#{$path}]],
	7					26
	7					14
953							};
954	7					27	splice( @$path, $offset, @$path - $offset, $new );
955	7	100				21	$debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
	1					2
	1					2
956							}
957	1082					1410	last;
958							}
959							}
960
961	13751	100				18804	if( $debug ) {
962	306					289	my $msg = '';
963	306					223	my $n;
964	306					533	for( $n = 0; $n < @$path; ++$n ) {
965	1093	100				1385	$msg .= ' ' if $n;
966							my $atom = ref($path->[$n]) eq 'HASH'
967	1093	100				1396	? '{'.join( ' ', keys(%{$path->[$n]})).'}'
	81					209
968							: $path->[$n]
969							;
970	1093	100				2432	$msg .= $n == $offset ? "<$atom>" : $atom;
971							}
972	306					12480	print "# at path ($msg)\n";
973							}
974
975	13751	100				33357	if( $offset >= @$path ) {
		100
		100
976	732					3047	push @$path, { $token => [ $token, @in ], '' => undef };
977	732	100				1680	$debug and print "# added remaining @{[_dump($path)]}\n";
	21					48
978	732					953	last;
979							}
980							elsif( $token ne $path->[$offset] ) {
981	2103	100				6803	$debug and print "# token $token not present\n";
982							splice @$path, $offset, @$path-$offset, {
983							length $token
984							? ( _node_key($token) => [$token, @in])
985							: ( '' => undef )
986							,
987	2103	100				5961	$path->[$offset] => [@{$path}[$offset..$#{$path}]],
	2103					8690
	2103					3045
988							};
989	2103	100				4418	$debug and print "# path=@{[_dump($path)]}\n";
	79					119
990	2103					2404	last;
991							}
992							elsif( not @in ) {
993	923	100				1952	$debug and print "# last token to add\n";
994	923	100				2171	if( defined( $path->[$offset+1] )) {
995	912					971	++$offset;
996	912	100				1655	if( ref($path->[$offset]) eq 'HASH' ) {
997	118	100				322	$debug and print "# add sentinel to node\n";
998	118					216	$path->[$offset]{''} = undef;
999							}
1000							else {
1001	794	100				1397	$debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002							splice @$path, $offset, @$path-$offset, {
1003							'' => undef,
1004	794					2166	$path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
	794					3758
	794					1262
1005							};
1006							}
1007							}
1008							else {
1009							# already seen this pattern
1010	11					22	++$self->{stats_dup};
1011							}
1012	923					1370	last;
1013							}
1014							# if we get here then @_ still contains a token
1015	9993					16401	++$offset;
1016							}
1017	5123					11451	$list;
1018							}
1019
1020							sub _insert_node {
1021	282			282		310	my $self = shift;
1022	282					283	my $path = shift;
1023	282					263	my $offset = shift;
1024	282					306	my $token = shift;
1025	282					270	my $debug = shift;
1026	282					379	my $path_end = [@{$path}[$offset..$#{$path}]];
	282					609
	282					420
1027							# NB: $path->[$offset] and $[path_end->[0] are equivalent
1028	282					751	my $token_key = _re_path($self, [$token]);
1029	282	100				726	$debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
	26					47
	26					52
1030	26					48	" at path=@{[_dump($path_end)]}\n";
1031	282	100				720	if( ref($path_end->[0]) eq 'HASH' ) {
1032	195	100				684	if( exists($path_end->[0]{$token_key}) ) {
		100
1033	25	100				70	if( @$path_end > 1 ) {
1034	2					9	my $path_key = _re_path($self, [$path_end->[0]]);
1035	2					12	my $new = {
1036							$path_key => [ @$path_end ],
1037							$token_key => [ $token, @_ ],
1038							};
1039	2	100				8	$debug and print "# +bifurcate new=@{[_dump($new)]}\n";
	1					3
1040	2					39	splice( @$path, $offset, @$path_end, $new );
1041							}
1042							else {
1043	23					41	my $old_path = $path_end->[0]{$token_key};
1044	23					36	my $new_path = [];
1045	23		100			115	while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046	30	100				111	$debug and print "# identical nodes in sub_path ",
		100
1047							ref($token) ? _dump($token) : $token, "\n";
1048	30					58	push @$new_path, shift(@$old_path);
1049	30					97	$token = shift @_;
1050							}
1051	23	50				72	if( @$new_path ) {
1052	23					74	my $new;
1053	23					29	my $token_key = $token;
1054	23	100				46	if( @_ ) {
1055	6					13	$new = {
1056							_re_path($self, $old_path) => $old_path,
1057							$token_key => [$token, @_],
1058							};
1059	6	100				16	$debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
	1					2
1060							}
1061							else {
1062	17	100				43	$debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
	5					9
1063	17	100				46	if( @$old_path ) {
1064	11					67	$new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065							}
1066							else {
1067	6					19	$new = { '' => undef, $token => [$token] };
1068							}
1069							}
1070	23					47	push @$new_path, $new;
1071							}
1072	23					36	$path_end->[0]{$token_key} = $new_path;
1073	23	100				59	$debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
	6					9
1074	23					68	splice( @$path, $offset, @$path_end, @$path_end );
1075							}
1076							}
1077							elsif( not _node_eq( $path_end->[0], $token )) {
1078	70	100				149	if( @$path_end > 1 ) {
1079	11					36	my $path_key = _re_path($self, [$path_end->[0]]);
1080	11					74	my $new = {
1081							$path_key => [ @$path_end ],
1082							$token_key => [ $token, @_ ],
1083							};
1084	11	100				69	$debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
	1					3
1085	11					44	splice( @$path, $offset, @$path_end, $new );
1086							}
1087							else {
1088	59	100				121	$debug and print "# next in path is node, trivial insert at $token_key\n";
1089	59					212	$path_end->[0]{$token_key} = [$token, @_];
1090	59					157	splice( @$path, $offset, @$path_end, @$path_end );
1091							}
1092							}
1093							else {
1094	100		100			392	while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095	131	100				419	$debug and print "# identical nodes @{[_dump([$token])]}\n";
	9					21
1096	131					187	shift @$path_end;
1097	131					254	$token = shift @_;
1098	131					350	++$offset;
1099							}
1100	100	100				202	if( @$path_end ) {
1101	57	100				130	$debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
	4					9
	4					8
1102	57					318	$path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103	57	100				149	$debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
	4					15
	4					8
1104	57					175	splice( @$path, $offset, @$path - $offset, @$path_end );
1105	57	100				150	$debug and print "# got final=@{[_dump($path)]}\n";
	4					9
1106							}
1107							else {
1108	43					113	$token_key = _node_key($token);
1109	43					212	my $new = {
1110							'' => undef,
1111							$token_key => [ $token, @_ ],
1112							};
1113	43	100				129	$debug and print "# convert opt @{[_dump($new)]}\n";
	3					5
1114	43					101	push @$path, $new;
1115							}
1116							}
1117							}
1118							else {
1119	87	100				172	if( @$path_end ) {
1120	74					356	my $new = {
1121							$path_end->[0] => [ @$path_end ],
1122							$token_key => [ $token, @_ ],
1123							};
1124	74	100				165	$debug and print "# atom->node @{[_dump($new)]}\n";
	5					8
1125	74					205	splice( @$path, $offset, @$path_end, $new );
1126	74	100				180	$debug and print "# out=@{[_dump($path)]}\n";
	5					9
1127							}
1128							else {
1129	13	100				34	$debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
	4					10
1130	13					55	push @$path, {
1131							'' => undef,
1132							$token_key => [ $token, @_ ],
1133							};
1134							}
1135							}
1136	282					952	$path;
1137							}
1138
1139							sub _reduce {
1140	810			810		994	my $self = shift;
1141	810					1384	my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142
1143	810	100				1563	if ($self->_debug(DEBUG_TIME)) {
1144	4					6	$self->_init_time_func;
1145	4					10	my $now = $self->{_time_func}->();
1146	4	100				14	if (exists $self->{_begin_time}) {
1147	3					302	printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148							}
1149							else {
1150	1					44	printf "# load-epoch=%0.6f\n", $now;
1151							}
1152	4					17	$self->{_begin_time} = $self->{_time_func}->();
1153							}
1154
1155	810					1316	my ($head, $tail) = _reduce_path( $self->_path, $context );
1156	810	100				1632	$context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157	810	100				1356	if( !@$head ) {
1158	651					991	$self->{path} = $tail;
1159							}
1160							else {
1161							$self->{path} = [
1162	159					345	@{_unrev_path( $tail, $context )},
1163	159					174	@{_unrev_path( $head, $context )},
	159					249
1164							];
1165							}
1166
1167	810	100				1961	if ($self->_debug(DEBUG_TIME)) {
1168	4					10	my $now = $self->{_time_func}->();
1169	4	50				9	if (exists $self->{_begin_time}) {
1170	4					329	printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171							}
1172							else {
1173	0					0	printf "# reduce-epoch=%0.6f\n", $now;
1174							}
1175	4					16	$self->{_begin_time} = $self->{_time_func}->();
1176							}
1177
1178	810	100				1695	$context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179	810					1566	return $self;
1180							}
1181
1182							sub _remove_optional {
1183	1710	100		1710		3395	if( exists $_[0]->{''} ) {
1184	350					633	delete $_[0]->{''};
1185	350					545	return 1;
1186							}
1187	1360					1598	return 0;
1188							}
1189
1190							sub _reduce_path {
1191	810			810		925	my ($path, $ctx) = @_;
1192	810					1504	my $indent = ' ' x $ctx->{depth};
1193	810					823	my $debug = $ctx->{debug};
1194	810	100				1420	$debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195	810					700	my $new;
1196	810					960	my $head = [];
1197	810					1010	my $tail = [];
1198	810					1897	while( defined( my $p = pop @$path )) {
1199	1159	100				1946	if( ref($p) eq 'HASH' ) {
1200	514					937	my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201	514	100				1496	$debug and print "#$indent\| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202	514	100				1233	push @$head, @$node_head if scalar @$node_head;
1203	514	100				4140	push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204							}
1205							else {
1206	645	100				943	if( @$head ) {
1207	125	100				257	$debug and print "#$indent\| push $p leaves @{[_dump($path)]}\n";
	7					38
1208	125					383	push @$tail, $p;
1209							}
1210							else {
1211	520	100				1135	$debug and print "#$indent\| unshift $p\n";
1212	520					1573	unshift @$tail, $p;
1213							}
1214							}
1215							}
1216	20					278	$debug and print "#$indent\| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217	810	100				1511	(ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
	18	100				894
1218							"\n";
1219	810	100	100			2566	if( @$tail > 1
			100
1220							and ref($tail->[0]) eq 'HASH'
1221	97					437	and keys %{$tail->[0]} == 2
1222							) {
1223	72					81	my $opt;
1224							my $fixed;
1225	72					107	while( my ($key, $path) = each %{$tail->[0]} ) {
	216					586
1226	144	100				262	$debug and print "#$indent\| scan k=$key p=@{[_dump($path)]}\n";
	14					29
1227	144	100				287	next unless $path;
1228	123	100	100			415	if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229	8					21	$opt = $path->[0];
1230							}
1231							else {
1232	115					175	$fixed = $path;
1233							}
1234							}
1235	72	100				197	if( exists $tail->[0]{''} ) {
1236	21					45	my $path = [@{$tail}[1..$#{$tail}]];
	21					67
	21					39
1237	21					41	$tail = $tail->[0];
1238	21					63	($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239	21					83	$tail = [$tail, @$path];
1240							}
1241							}
1242	810	100				1396	$debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243	810					1473	return ($head, $tail);
1244							}
1245
1246							sub _reduce_node {
1247	1080			1080		1338	my ($node, $ctx) = @_;
1248	1080					2204	my $indent = ' ' x $ctx->{depth};
1249	1080					1100	my $debug = $ctx->{debug};
1250	1080					1513	my $optional = _remove_optional($node);
1251	1080	100				1935	$debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
	57					101
1252	1080	100	100			2500	if( $optional and scalar keys %$node == 1 ) {
1253	70					128	my $path = (values %$node)[0];
1254	70	100				105	if( not grep { ref($_) eq 'HASH' } @$path ) {
	117					311
1255							# if we have removed an optional, and there is only one path
1256							# left then there is nothing left to compare. Because of the
1257							# optional it cannot participate in any further reductions.
1258							# (unless we test for equality among sub-trees).
1259	61					197	my $result = {
1260							'' => undef,
1261							$path->[0] => $path
1262							};
1263	61	100				121	$debug and print "#$indent\| fast fail @{[_dump($result)]}\n";
	2					5
1264	61					155	return [], $result;
1265							}
1266							}
1267
1268	1019					1416	my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269
1270	1019	100				2711	$debug and print "#$indent\|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
	55					100
	55					91
1271
1272							# We now perform tail reduction on each of the nodes in the reduce
1273							# hash. If we have only one key, we know we will have a successful
1274							# reduction (since everything that was inserted into the node based
1275							# on the value of the last token of each path all mapped to the same
1276							# value).
1277
1278	1019	100	100			5796	if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
			100
1279							# every path shares a common path
1280	506					825	my $path = (values %$reduce)[0];
1281	506					906	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282	506	100				1373	$debug and print "#$indent\|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
	50					105
1283	506					2350	return( $common, $tail );
1284							}
1285
1286							# this node resulted in a list of paths, game over
1287	513					1027	$ctx->{indent} = $indent;
1288	513					970	return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289							}
1290
1291							sub _reduce_fail {
1292	513			513		628	my( $reduce, $fail, $optional, $ctx ) = @_;
1293	513					555	my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
	513					935
1294	513					533	my %result;
1295	513	100				892	$result{''} = undef if $optional;
1296	513					472	my $p;
1297	513					1024	for $p (keys %$reduce) {
1298	1181					1157	my $path = $reduce->{$p};
1299	1181	100				1649	if( scalar @$path == 1 ) {
1300	1114					1137	$path = $path->[0];
1301	1114	100				1633	$debug and print "#$indent\| -simple opt=$optional unrev @{[_dump($path)]}\n";
	7					12
1302	1114					1383	$path = _unrev_path($path, _descend($ctx) );
1303	1114					2347	$result{_node_key($path->[0])} = $path;
1304							}
1305							else {
1306	67	100				283	$debug and print "#$indent\| _do_reduce(@{[_dump($path)]})\n";
	1					3
1307	67					143	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308							$path = [
1309							(
1310							ref($tail) eq 'HASH'
1311							? _unrev_node($tail, _descend($ctx) )
1312							: _unrev_path($tail, _descend($ctx) )
1313							),
1314	67	100				330	@{_unrev_path($common, _descend($ctx) )}
	67					131
1315							];
1316	67	100				266	$debug and print "#$indent\| +reduced @{[_dump($path)]}\n";
	1					3
1317	67					286	$result{_node_key($path->[0])} = $path;
1318							}
1319							}
1320	513					542	my $f;
1321	513					786	for $f( @$fail ) {
1322	219	100				369	$debug and print "#$indent\| +fail @{[_dump($f)]}\n";
	3					6
1323	219					383	$result{$f->[0]} = $f;
1324							}
1325	513	100				846	$debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
	5					11
1326	513					2252	return ( [], \%result );
1327							}
1328
1329							sub _scan_node {
1330	1019			1019		1144	my( $node, $ctx ) = @_;
1331	1019					1590	my $indent = ' ' x $ctx->{depth};
1332	1019					1142	my $debug = $ctx->{debug};
1333
1334							# For all the paths in the node, reverse them. If the first token
1335							# of the path is a scalar, push it onto an array in a hash keyed by
1336							# the value of the scalar.
1337							#
1338							# If it is a node, call _reduce_node on this node beforehand. If we
1339							# get back a common head, all of the paths in the subnode shared a
1340							# common tail. We then store the common part and the remaining node
1341							# of paths (which is where the paths diverged from the end and install
1342							# this into the same hash. At this point both the common and the tail
1343							# are in reverse order, just as simple scalar paths are.
1344							#
1345							# On the other hand, if there were no common path returned then all
1346							# the paths of the sub-node diverge at the end character. In this
1347							# case the tail cannot participate in any further reductions and will
1348							# appear in forward order.
1349							#
1350							# certainly the hurgliest function in the whole file :(
1351
1352							# $debug = 1 if $depth >= 8;
1353	1019					908	my @fail;
1354							my %reduce;
1355
1356	0					0	my $n;
1357	1019					2186	for $n(
1358	2743					5801	map { substr($_, index($_, '#')+1) }
1359							sort
1360							map {
1361							join( '\|' =>
1362	9029					13768	scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
	2743					4056
1363							_node_offset($node->{$_}),
1364	2743					2315	scalar @{$node->{$_}},
	2743					10173
1365							)
1366							. "#$_"
1367							}
1368							keys %$node ) {
1369	2743					2164	my( $end, @path ) = reverse @{$node->{$n}};
	2743					6082
1370	2743	100				4069	if( ref($end) ne 'HASH' ) {
1371	2137	100				3262	$debug and print "# $indent\|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
	87					177
1372	2137					1658	push @{$reduce{$end}}, [ $end, @path ];
	2137					7966
1373							}
1374							else {
1375	606	100				1208	$debug and print "# $indent\|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376	606					627	my $new_path;
1377							# deal with sing, singing => s(?:ing)?ing
1378	606	100	66			2278	if( keys %$end == 2 and exists $end->{''} ) {
1379	94					180	my ($key, $opt_path) = each %$end;
1380	94	100				248	($key, $opt_path) = each %$end if $key eq '';
1381	94					94	$opt_path = [reverse @{$opt_path}];
	94					229
1382	94	100				196	$debug and print "# $indent\| check=", _dump($opt_path), "\n";
1383	94					328	my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384	94					169	my $head = [];
1385	94					187	my $path = [@path];
1386	94					207	($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387	94	100				331	if( @$head ) {
1388	40					184	$new_path = [ @$head, $slide, @$path ];
1389							}
1390							}
1391	606	100				874	if( $new_path ) {
1392	40	100				84	$debug and print "# $indent\|_scan_node slid=", _dump($new_path), "\n";
1393	40					42	push @{$reduce{$new_path->[0]}}, $new_path;
	40					173
1394							}
1395							else {
1396	566					1003	my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397	566	100				1634	if( not @$common ) {
1398	219	100				586	$debug and print "# $indent\| +failed $n\n";
1399	219					839	push @fail, [reverse(@path), $tail];
1400							}
1401							else {
1402	347					782	my $path = [@path];
1403	347	100				705	$debug and print "# $indent\|_scan_node ++recovered common=@{[_dump($common)]} tail=",
	34					67
1404	34					46	_dump($tail), " path=@{[_dump($path)]}\n";
1405	347	100	100			1864	if( ref($tail) eq 'HASH'
1406							and keys %$tail == 2
1407							) {
1408	287	100				632	if( exists $tail->{''} ) {
1409	121					280	($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410							}
1411							}
1412	347	100				432	push @{$reduce{$common->[0]}}, [
	347					2496
1413							@$common,
1414							(ref($tail) eq 'HASH' ? $tail : @$tail ),
1415							@$path
1416							];
1417							}
1418							}
1419							}
1420							}
1421	1019	100				2588	$debug and print
1422	55					179	"# $indent\|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
	55					2347
1423	1019					2340	return( \@fail, \%reduce );
1424							}
1425
1426							sub _do_reduce {
1427	573			573		679	my ($path, $ctx) = @_;
1428	573					954	my $indent = ' ' x $ctx->{depth};
1429	573					652	my $debug = $ctx->{debug};
1430	573					1495	my $ra = Regexp::Assemble->new(chomp=>0);
1431	573					1279	$ra->debug($debug);
1432	573	100				1026	$debug and print "# $indent\| do @{[_dump($path)]}\n";
	51					80
1433	573					1778	$ra->_insertr( $_ ) for
1434							# When nodes come into the picture, we have to be careful
1435							# about how we insert the paths into the assembly.
1436							# Paths with nodes first, then closest node to front
1437							# then shortest path. Merely because if we can control
1438							# order in which paths containing nodes get inserted,
1439							# then we can make a couple of assumptions that simplify
1440							# the code in _insert_node.
1441							sort {
1442	6000					7831	scalar(grep {ref($_) eq 'HASH'} @$a)
1443	1113	50	100			1441	<=> scalar(grep {ref($_) eq 'HASH'} @$b)
	6578					9782
1444							\|\|
1445							_node_offset($b) <=> _node_offset($a)
1446							\|\|
1447							scalar @$a <=> scalar @$b
1448							}
1449							@$path
1450							;
1451	573					1188	$path = $ra->_path;
1452	573					712	my $common = [];
1453	573					3553	push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454	573	100				1173	my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455	573	100				980	$debug and print "# $indent\| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
	51					89
	51					68
1456	573					3129	return ($common, $tail);
1457							}
1458
1459							sub _node_offset {
1460							# return the offset that the first node is found, or -ve
1461							# optimised for speed
1462	4422			4422		3172	my $nr = @{$_[0]};
	4422					5067
1463	4422					3803	my $atom = -1;
1464	4422		100			33844	ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465	3458					6136	return -1;
1466							}
1467
1468							sub _slide_tail {
1469	240			240		17552	my $head = shift;
1470	240					242	my $tail = shift;
1471	240					224	my $path = shift;
1472	240					245	my $ctx = shift;
1473	240					434	my $indent = ' ' x $ctx->{depth};
1474	240					279	my $debug = $ctx->{debug};
1475	240	100				444	$debug and print "# $indent\| slide in h=", _dump($head),
1476							' t=', _dump($tail), ' p=', _dump($path), "\n";
1477	240					486	my $slide_path = (each %$tail)[-1];
1478	240	100				585	$slide_path = (each %$tail)[-1] unless defined $slide_path;
1479	240	100				461	$debug and print "# $indent\| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480	240		100			1147	while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481	154	100				478	$debug and print "# $indent\| slide=tail=$slide_path->[0]\n";
1482	154					195	my $slide = shift @$path;
1483	154					165	shift @$slide_path;
1484	154					241	push @$slide_path, $slide;
1485	154					570	push @$head, $slide;
1486							}
1487	240	100				520	$debug and print "# $indent\| slide path ", _dump($slide_path), "\n";
1488	240					501	my $slide_node = {
1489							'' => undef,
1490							_node_key($slide_path->[0]) => $slide_path,
1491							};
1492	240	100				571	$debug and print "# $indent\| slide out h=", _dump($head),
1493							' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494	240					702	return ($head, $slide_node, $path);
1495							}
1496
1497							sub _unrev_path {
1498	2690			2690		2747	my ($path, $ctx) = @_;
1499	2690					3576	my $indent = ' ' x $ctx->{depth};
1500	2690					2415	my $debug = $ctx->{debug};
1501	2690					2003	my $new;
1502	2690	100				2869	if( not grep { ref($_) } @$path ) {
	6383					10476
1503	2196	100				3404	$debug and print "# ${indent}_unrev path fast ", _dump($path);
1504	2196					3934	$new = [reverse @$path];
1505	2196	100				3520	$debug and print "# -> ", _dump($new), "\n";
1506	2196					3500	return $new;
1507							}
1508	494	100				860	$debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509	494					1147	while( defined( my $p = pop @$path )) {
1510	1483	100				4520	push @$new,
		100
1511							ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512							: ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513							: $p
1514							;
1515							}
1516	494	100				907	$debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517	494					755	return $new;
1518							}
1519
1520							sub _unrev_node {
1521	630			630		727	my ($node, $ctx ) = @_;
1522	630					1011	my $indent = ' ' x $ctx->{depth};
1523	630					591	my $debug = $ctx->{debug};
1524	630					866	my $optional = _remove_optional($node);
1525	630	100				1147	$debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526	630					643	my $new;
1527	630	100				1103	$new->{''} = undef if $optional;
1528	630					464	my $n;
1529	630					1455	for $n( keys %$node ) {
1530	1167					1780	my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531	1167					2645	$new->{_node_key($path->[0])} = $path;
1532							}
1533	630	100				1142	$debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534	630					2628	return $new;
1535							}
1536
1537							sub _node_key {
1538	4657			4657		5056	my $node = shift;
1539	4657	100				7581	return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540	4646	100				15824	return $node unless ref($node) eq 'HASH';
1541	205					243	my $key = '';
1542	205					213	my $k;
1543	205					490	for $k( keys %$node ) {
1544	452	100				749	next if $k eq '';
1545	374	100	100			1311	$key = $k if $key eq '' or $key gt $k;
1546							}
1547	205					633	return $key;
1548							}
1549
1550							sub _descend {
1551							# Take a context object, and increase the depth by one.
1552							# By creating a fresh hash each time, we don't have to
1553							# bother adding make-work code to decrease the depth
1554							# when we return from what we called.
1555	6190			6190		5261	my $ctx = shift;
1556	6190					24211	return {%$ctx, depth => $ctx->{depth}+1};
1557							}
1558
1559							#####################################################################
1560
1561							sub _make_class {
1562	650			650		708	my $self = shift;
1563	650					1053	my %set = map { ($_,1) } @_;
	1675					3230
1564	650	100				1481	delete $set{'\\d'} if exists $set{'\\w'};
1565	650	100				1185	delete $set{'\\D'} if exists $set{'\\W'};
1566							return '.' if exists $set{'.'}
1567							or ($self->{fold_meta_pairs} and (
1568							(exists $set{'\\d'} and exists $set{'\\D'})
1569							or (exists $set{'\\s'} and exists $set{'\\S'})
1570	650	100	66			5172	or (exists $set{'\\w'} and exists $set{'\\W'})
			66
			66
1571							))
1572							;
1573	632					1001	for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574	3792	100				6044	if( exists $set{$meta} ) {
1575	28					180	my $re = qr/$meta/;
1576	28					39	my @delete;
1577	28		66			379	$_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578	28	100				90	delete @set{@delete} if @delete;
1579							}
1580							}
1581	632	100				1352	return (keys %set)[0] if keys %set == 1;
1582	625					880	for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583	6875	100				11239	exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584							}
1585	625	100				1211	my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
	20					47
1586	625	100				975	my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
	7					16
1587	625					2184	my $class = join( '' => sort keys %set );
1588	625	100	100			1938	$class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589	622					4706	return "[$dash$class$caret]";
1590							}
1591
1592							sub _re_sort {
1593	999		100	999		5178	return length $b <=> length $a \|\| $a cmp $b
1594							}
1595
1596							sub _combine {
1597	140			140		155	my $self = shift;
1598	140					134	my $type = shift;
1599							# print "c in = @{[_dump(\@_)]}\n";
1600							# my $combine =
1601							return '('
1602							. $type
1603	140					177	. do {
1604	140					113	my( @short, @long );
1605	140	100				249	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	377					2240
1606	140	100				325	if( @short == 1 ) {
		100
1607	31					126	@long = sort _re_sort @long, @short;
1608							}
1609							elsif( @short > 1 ) {
1610							# yucky but true
1611	77					151	my @combine = (_make_class($self, @short), sort _re_sort @long);
1612	77					211	@long = @combine;
1613							}
1614							else {
1615	32					103	@long = sort _re_sort @long;
1616							}
1617	140					473	join( '\|', @long );
1618							}
1619							. ')';
1620							# print "combine <$combine>\n";
1621							# $combine;
1622							}
1623
1624							sub _combine_new {
1625	1738			1738		1746	my $self = shift;
1626	1738					1560	my( @short, @long );
1627	1738	100				2705	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	3244					20283
1628	1738	100	100			8016	if( @short == 1 and @long == 0 ) {
		100	100
1629	365					2017	return $short[0];
1630							}
1631							elsif( @short > 1 and @short == @_ ) {
1632	494					904	return _make_class($self, @short);
1633							}
1634							else {
1635	879	100				13737	return '(?:'
1636							. join( '\|' =>
1637							@short > 1
1638							? ( _make_class($self, @short), sort _re_sort @long)
1639							: ( (sort _re_sort( @long )), @short )
1640							)
1641							. ')';
1642							}
1643							}
1644
1645							sub _re_path {
1646	4772			4772		4334	my $self = shift;
1647							# in shorter assemblies, _re_path() is the second hottest
1648							# routine. after insert(), so make it fast.
1649
1650	4772	100				7927	if ($self->{unroll_plus}) {
1651							# but we can't easily make this blockless
1652	72					81	my @arr = @{$_[0]};
	72					126
1653	72					67	my $str = '';
1654	72					58	my $skip = 0;
1655	72					125	for my $i (0..$#arr) {
1656	127	100	100			1188	if (ref($arr[$i]) eq 'ARRAY') {
		100
		100
		100
1657	1					3	$str .= _re_path($self, $arr[$i]);
1658							}
1659							elsif (ref($arr[$i]) eq 'HASH') {
1660							$str .= exists $arr[$i]->{''}
1661							? _combine_new( $self,
1662	7					19	map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
	14					23
	7					21
1663							) . '?'
1664	28	100				48	: _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
	42					73
	21					50
1665							;
1666							}
1667							elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668	7	50				29	$str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669	7					13	++$skip;
1670							}
1671							elsif ($skip) {
1672	7					13	$skip = 0;
1673							}
1674							else {
1675	84					137	$str .= $arr[$i];
1676							}
1677							}
1678	72					200	return $str;
1679							}
1680
1681	4700	50				5156	return join( '', @_ ) unless grep { length ref $_ } @_;
	4700					10016
1682	4700					3412	my $p;
1683							return join '', map {
1684							ref($_) eq '' ? $_
1685	9506	100				24009	: ref($_) eq 'HASH' ? do {
		100
1686							# In the case of a node, see whether there's a '' which
1687							# indicates that the whole thing is optional and thus
1688							# requires a trailing ?
1689							# Unroll the two different paths to avoid the needless
1690							# grep when it isn't necessary.
1691	1710					1479	$p = $_;
1692							exists $_->{''}
1693							? _combine_new( $self,
1694	801					1613	map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
	1508					2527
1695							) . '?'
1696	1710	100				5146	: _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
	2394					3791
1697							}
1698							: _re_path($self, $_) # ref($_) eq 'ARRAY'
1699	4700					3629	} @{$_[0]}
	4700					6443
1700							}
1701
1702							sub _lookahead {
1703	132			132		141	my $in = shift;
1704	132					129	my %head;
1705							my $path;
1706	132					244	for $path( keys %$in ) {
1707	328	100				577	next unless defined $in->{$path};
1708							# print "look $path: ", ref($in->{$path}[0]), ".\n";
1709	267	100				595	if( ref($in->{$path}[0]) eq 'HASH' ) {
		100
1710	15					20	my $next = 0;
1711	15		100			54	while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
	16					70
1712	11	100				30	if( exists $in->{$path}[$next]{''} ) {
1713	5					11	++$head{$in->{$path}[$next+1]};
1714							}
1715	11					31	++$next;
1716							}
1717	15					48	my $inner = _lookahead( $in->{$path}[0] );
1718	15					70	@head{ keys %$inner } = (values %$inner);
1719							}
1720							elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721	2					5	my $subpath = $in->{$path}[0];
1722	2					10	for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723	3	100				11	if( ref($subpath->[$sp]) eq 'HASH' ) {
1724	2					8	my $follow = _lookahead( $subpath->[$sp] );
1725	2					9	@head{ keys %$follow } = (values %$follow);
1726	2	100				13	last unless exists $subpath->[$sp]{''};
1727							}
1728							else {
1729	1					3	++$head{$subpath->[$sp]};
1730	1					2	last;
1731							}
1732							}
1733							}
1734							else {
1735	250					488	++$head{ $in->{$path}[0] };
1736							}
1737							}
1738							# print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739	132					277	return \%head;
1740							}
1741
1742							sub _re_path_lookahead {
1743	265			265		250	my $self = shift;
1744	265					226	my $in = shift;
1745							# print "_re_path_la in ", _dump($in), "\n";
1746	265					241	my $out = '';
1747	265					526	for( my $p = 0; $p < @$in; ++$p ) {
1748	573	100				979	if( ref($in->[$p]) eq '' ) {
		100
1749	462					460	$out .= $in->[$p];
1750	462					794	next;
1751							}
1752							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753	2					9	$out .= _re_path_lookahead($self, $in->[$p]);
1754	2					8	next;
1755							}
1756							# print "$p ", _dump($in->[$p]), "\n";
1757							my $path = [
1758	228					455	map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759	275					445	grep { $_ ne '' }
1760	109					102	keys %{$in->[$p]}
	109					294
1761							];
1762	109					287	my $ahead = _lookahead($in->[$p]);
1763	109					115	my $more = 0;
1764	109	100	100			393	if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765	12					14	my $next = 1;
1766	12					38	while( $p + $next < @$in ) {
1767	14	100				36	if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768	2					5	my $follow = _lookahead( $in->[$p+$next] );
1769	2					8	@{$ahead}{ keys %$follow } = (values %$follow);
	2					7
1770							}
1771							else {
1772	12					28	++$ahead->{$in->[$p+$next]};
1773	12					15	last;
1774							}
1775	2					6	++$next;
1776							}
1777	12					17	$more = 1;
1778							}
1779	109					146	my $nr_one = grep { /^$Single_Char$/ } @$path;
	228					1500
1780	109					129	my $nr = @$path;
1781	109	100	100			340	if( $nr_one > 1 and $nr_one == $nr ) {
1782	18					53	$out .= _make_class($self, @$path);
1783	18	100				110	$out .= '?' if exists $in->[$p]{''};
1784							}
1785							else {
1786							my $zwla = keys(%$ahead) > 1
1787	91	100				281	? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
	191					190
	191					296
1788							: '';
1789	91	100				255	my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790							# print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791	91	100				216	if( exists $in->[$p]{''} ) {
1792	44	100				282	$out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793							}
1794							else {
1795	47					228	$out .= "$zwla$patt";
1796							}
1797							}
1798							}
1799	265					714	return $out;
1800							}
1801
1802							sub _re_path_track {
1803	33			33		48	my $self = shift;
1804	33					35	my $in = shift;
1805	33					36	my $normal = shift;
1806	33					49	my $augmented = shift;
1807	33					36	my $o;
1808	33					41	my $simple = '';
1809	33					41	my $augment = '';
1810	33					90	for( my $n = 0; $n < @$in; ++$n ) {
1811	114	100				188	if( ref($in->[$n]) eq '' ) {
1812	104					118	$o = $in->[$n];
1813	104					99	$simple .= $o;
1814	104					108	$augment .= $o;
1815	104	100	100			896	if( (
			66
			100
1816							$n < @$in - 1
1817							and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818							)
1819							or $n == @$in - 1
1820							) {
1821	24					24	push @{$self->{mlist}}, $normal . $simple ;
	24					97
1822	24	50				93	$augment .= $] < 5.009005
1823							? "(?{\$self->{m}=$self->{mcount}})"
1824							: "(?{$self->{mcount}})"
1825							;
1826	24					83	++$self->{mcount};
1827							}
1828							}
1829							else {
1830							my $path = [
1831	25					111	map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832	26					58	grep { $_ ne '' }
1833	10					14	keys %{$in->[$n]}
	10					43
1834							];
1835	10					80	$o = '(?:' . join( '\|' => sort _re_sort @$path ) . ')';
1836	10	100				31	$o .= '?' if exists $in->[$n]{''};
1837	10					20	$simple .= $o;
1838	10					42	$augment .= $o;
1839							}
1840							}
1841	33					116	return $augment;
1842							}
1843
1844							sub _re_path_pretty {
1845	411			411		377	my $self = shift;
1846	411					362	my $in = shift;
1847	411					331	my $arg = shift;
1848	411					759	my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849	411					536	my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850	411					389	my $out = '';
1851	411					355	$arg->{depth}++;
1852	411					342	my $prev_was_paren = 0;
1853	411					774	for( my $p = 0; $p < @$in; ++$p ) {
1854	1084	100				1705	if( ref($in->[$p]) eq '' ) {
		100
1855	910	100				1303	$out .= "\n$pre" if $prev_was_paren;
1856	910					824	$out .= $in->[$p];
1857	910					1554	$prev_was_paren = 0;
1858							}
1859							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860	3					13	$out .= _re_path($self, $in->[$p]);
1861							}
1862							else {
1863							my $path = [
1864	369					683	map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865	419					629	grep { $_ ne '' }
1866	171					145	keys %{$in->[$p]}
	171					451
1867							];
1868	171					334	my $nr = @$path;
1869	171					148	my( @short, @long );
1870	171	100				271	push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
	369					2421
1871	171	100				278	if( @short == $nr ) {
1872	37	100				123	$out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873	37	100				205	$out .= '?' if exists $in->[$p]{''};
1874							}
1875							else {
1876	134	100				291	$out .= "\n" if length $out;
1877	134	100				221	$out .= $pre if $p;
1878	134					206	$out .= "(?:\n$indent";
1879	134	100				209	if( @short < 2 ) {
1880	133					138	my $r = 0;
1881							$out .= join( "\n$indent\|" => map {
1882	133	100				546	$r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
	298					547
1883	298					574	$_
1884							}
1885							sort _re_sort @$path
1886							);
1887							}
1888							else {
1889	1					7	$out .= join( "\n$indent\|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890							}
1891	134					231	$out .= "\n$pre)";
1892	134	100				241	if( exists $in->[$p]{''} ) {
1893	37					68	$out .= "\n$pre?";
1894	37					133	$prev_was_paren = 0;
1895							}
1896							else {
1897	97					314	$prev_was_paren = 1;
1898							}
1899							}
1900							}
1901							}
1902	411					371	$arg->{depth}--;
1903	411					1009	return $out;
1904							}
1905
1906							sub _node_eq {
1907	425	100	66	425		1534	return 0 if not defined $_[0] or not defined $_[1];
1908	422	100				974	return 0 if ref $_[0] ne ref $_[1];
1909							# Now that we have determined that the reference of each
1910							# argument are the same, we only have to test the first
1911							# one, which gives us a nice micro-optimisation.
1912	381	100				741	if( ref($_[0]) eq 'HASH' ) {
		100
1913	305					634	keys %{$_[0]} == keys %{$_[1]}
	305					878
1914							and
1915							# does this short-circuit to avoid _re_path() cost more than it saves?
1916	305	100	100			271	join( '\|' => sort keys %{$_[0]}) eq join( '\|' => sort keys %{$_[1]})
	272					957
	272					1669
1917							and
1918							_re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919							}
1920							elsif( ref($_[0]) eq 'ARRAY' ) {
1921	9	100				8	scalar @{$_[0]} == scalar @{$_[1]}
	9					15
	9					31
1922							and
1923							_re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924							}
1925							else {
1926	67					232	$_[0] eq $_[1];
1927							}
1928							}
1929
1930							sub _pretty_dump {
1931	7			7		32	return sprintf "\\x%02x", ord(shift);
1932							}
1933
1934							sub _dump {
1935	5579			5579		5210	my $path = shift;
1936	5579	100				9288	return _dump_node($path) if ref($path) eq 'HASH';
1937	5057					3824	my $dump = '[';
1938	5057					3311	my $d;
1939	5057					3539	my $nr = 0;
1940	5057					5040	for $d( @$path ) {
1941	11036	100				15461	$dump .= ' ' if $nr++;
1942	11036	100				18870	if( ref($d) eq 'HASH' ) {
		100
		100
1943	1340					1584	$dump .= _dump_node($d);
1944							}
1945							elsif( ref($d) eq 'ARRAY' ) {
1946	242					297	$dump .= _dump($d);
1947							}
1948							elsif( defined $d ) {
1949							# D::C indicates the second test is redundant
1950							# $dump .= ( $d =~ /\s/ or not length $d )
1951	9453	100				22412	$dump .= (
		100
1952							$d =~ /\s/ ? qq{'$d'} :
1953							$d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954							$d
1955							);
1956							}
1957							else {
1958	1					3	$dump .= '*';
1959							}
1960							}
1961	5057					55197	return $dump . ']';
1962							}
1963
1964							sub _dump_node {
1965	1862			1862		1470	my $node = shift;
1966	1862					1417	my $dump = '{';
1967	1862					1316	my $nr = 0;
1968	1862					1349	my $n;
1969	1862					4209	for $n (sort keys %$node) {
1970	3899	100				5573	$dump .= ' ' if $nr++;
1971							# Devel::Cover shows this to test to be redundant
1972							# $dump .= ( $n eq '' and not defined $node->{$n} )
1973							$dump .= $n eq ''
1974							? '*'
1975							: ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976	3899	100				9647	. "=>" . _dump($node->{$n})
		100
1977							;
1978							}
1979	1862					19697	return $dump . '}';
1980							}
1981
1982							=pod
1983
1984							=head1 NAME
1985
1986							Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987
1988							=head1 SYNOPSIS
1989
1990							use Regexp::Assemble;
1991
1992							my $ra = Regexp::Assemble->new;
1993							$ra->add( 'ab+c' );
1994							$ra->add( 'ab+-' );
1995							$ra->add( 'a\w\d+' );
1996							$ra->add( 'a\d+' );
1997							print $ra->re; # prints a(?:\w?\d+\|b+[-c])
1998
1999							=head1 DESCRIPTION
2000
2001							Regexp::Assemble takes an arbitrary number of regular expressions
2002							and assembles them into a single regular expression (or RE) that
2003							matches all that the individual REs match.
2004
2005							As a result, instead of having a large list of expressions to loop
2006							over, a target string only needs to be tested against one expression.
2007							This is interesting when you have several thousand patterns to deal
2008							with. Serious effort is made to produce the smallest pattern possible.
2009
2010							It is also possible to track the original patterns, so that you can
2011							determine which, among the source patterns that form the assembled
2012							pattern, was the one that caused the match to occur.
2013
2014							You should realise that large numbers of alternations are processed
2015							in perl's regular expression engine in O(n) time, not O(1). If you
2016							are still having performance problems, you should look at using a
2017							trie. Note that Perl's own regular expression engine will implement
2018							trie optimisations in perl 5.10 (they are already available in
2019							perl 5.9.3 if you want to try them out). C will
2020							do the right thing when it knows it's running on a trie'd perl.
2021							(At least in some version after this one).
2022
2023							Some more examples of usage appear in the accompanying README. If
2024							that file is not easy to access locally, you can find it on a web
2025							repository such as
2026							L or
2027							L.
2028
2029							See also L.
2030
2031							=head1 Methods
2032
2033							=head2 add(LIST)
2034
2035							Takes a string, breaks it apart into a set of tokens (respecting
2036							meta characters) and inserts the resulting list into the C
2037							object. It uses a naive regular expression to lex the string
2038							that may be fooled complex expressions (specifically, it will
2039							fail to lex nested parenthetical expressions such as
2040							C correctly). If this is the case, the end of
2041							the string will not be tokenised correctly and returned as one
2042							long string.
2043
2044							On the one hand, this may indicate that the patterns you are
2045							trying to feed the C object are too complex. Simpler
2046							patterns might allow the algorithm to work more effectively and
2047							perform more reductions in the resulting pattern.
2048
2049							On the other hand, you can supply your own pattern to perform the
2050							lexing if you need. The test suite contains an example of a lexer
2051							pattern that will match one level of nested parentheses.
2052
2053							Note that there is an internal optimisation that will bypass a
2054							much of the lexing process. If a string contains no C<\>
2055							(backslash), C<[> (open square bracket), C<(> (open paren),
2056							C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057							curly), a character split will be performed directly.
2058
2059							A list of strings may be supplied, thus you can pass it a file
2060							handle of a file opened for reading:
2061
2062							$re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063							$re->add( );
2064
2065							If the file is very large, it may be more efficient to use a
2066							C loop, to read the file line-by-line:
2067
2068							$re->add($_) while ;
2069
2070							The C method will chomp the lines automatically. If you
2071							do not want this to occur (you want to keep the record
2072							separator), then disable Cing.
2073
2074							$re->chomp(0);
2075							$re->add($_) while ;
2076
2077							This method is chainable.
2078
2079							=head2 add_file(FILENAME [...])
2080
2081							Takes a list of file names. Each file is opened and read
2082							line by line. Each line is added to the assembly.
2083
2084							$r->add_file( 'file.1', 'file.2' );
2085
2086							If a file cannot be opened, the method will croak. If you cannot
2087							afford to let this happen then you should wrap the call in a C
2088							block.
2089
2090							Chomping happens automatically unless you the C method
2091							to disable it. By default, input lines are read according to the
2092							value of the C attribute (if defined), and
2093							will otherwise fall back to the current setting of the system C<$/>
2094							variable. The record separator may also be specified on each
2095							call to C. Internally, the routine Cises the
2096							value of C<$/> to whatever is required, for the duration of the
2097							call.
2098
2099							An alternate calling mechanism using a hash reference is
2100							available. The recognised keys are:
2101
2102							=over 4
2103
2104							=item file
2105
2106							Reference to a list of file names, or the name of a single
2107							file.
2108
2109							$r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110							$r->add_file({file => 'file.n'});
2111
2112							=item input_record_separator
2113
2114							If present, indicates what constitutes a line
2115
2116							$r->add_file({file => 'data.txt', input_record_separator => ':' });
2117
2118							=item rs
2119
2120							An alias for input_record_separator (mnemonic: same as the
2121							English variable names).
2122
2123							=back
2124
2125							$r->add_file( {
2126							file => [ 'pattern.txt', 'more.txt' ],
2127							input_record_separator => "\r\n",
2128							});
2129
2130							=head2 clone()
2131
2132							Clones the contents of a Regexp::Assemble object and creates a new
2133							object (in other words it performs a deep copy).
2134
2135							If the Storable module is installed, its dclone method will be used,
2136							otherwise the cloning will be performed using a pure perl approach.
2137
2138							You can use this method to take a snapshot of the patterns that have
2139							been added so far to an object, and generate an assembly from the
2140							clone. Additional patterns may to be added to the original object
2141							afterwards.
2142
2143							my $re = $main->clone->re();
2144							$main->add( 'another-pattern-\\d+' );
2145
2146							=head2 insert(LIST)
2147
2148							Takes a list of tokens representing a regular expression and
2149							stores them in the object. Note: you should not pass it a bare
2150							regular expression, such as C. You must pass it as
2151							a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152
2153							This method is chainable, I:
2154
2155							my $ra = Regexp::Assemble->new
2156							->insert( qw[ a b+ c? d* e ] )
2157							->insert( qw[ a c+ d+ e* f ] );
2158
2159							Lexing complex patterns with metacharacters and so on can consume
2160							a significant proportion of the overall time to build an assembly.
2161							If you have the information available in a tokenised form, calling
2162							C directly can be a big win.
2163
2164							=head2 lexstr
2165
2166							Use the C method if you are curious to see how a pattern
2167							gets tokenised. It takes a scalar on input, representing a pattern,
2168							and returns a reference to an array, containing the tokenised
2169							pattern. You can recover the original pattern by performing a
2170							C:
2171
2172							my @token = $re->lexstr($pattern);
2173							my $new_pattern = join( '', @token );
2174
2175							If the original pattern contains unnecessary backslashes, or C<\x4b>
2176							escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177							may not be identical.
2178
2179							Call C does not add the pattern to the object, it is merely
2180							for exploratory purposes. It will, however, update various statistical
2181							counters.
2182
2183							=head2 pre_filter(CODE)
2184
2185							Allows you to install a callback to check that the pattern being
2186							loaded contains valid input. It receives the pattern as a whole to
2187							be added, before it been tokenised by the lexer. It may to return
2188							0 or C to indicate that the pattern should not be added, any
2189							true value indicates that the contents are fine.
2190
2191							A filter to strip out trailing comments (marked by #):
2192
2193							$re->pre_filter( sub { $_[0] =~ s/\s#.$//; 1 } );
2194
2195							A filter to ignore blank lines:
2196
2197							$re->pre_filter( sub { length(shift) } );
2198
2199							If you want to remove the filter, pass C as a parameter.
2200
2201							$ra->pre_filter(undef);
2202
2203							This method is chainable.
2204
2205							=head2 filter(CODE)
2206
2207							Allows you to install a callback to check that the pattern being
2208							loaded contains valid input. It receives a list on input, after it
2209							has been tokenised by the lexer. It may to return 0 or undef to
2210							indicate that the pattern should not be added, any true value
2211							indicates that the contents are fine.
2212
2213							If you know that all patterns you expect to assemble contain
2214							a restricted set of of tokens (e.g. no spaces), you could do
2215							the following:
2216
2217							$ra->filter(sub { not grep { / / } @_ });
2218
2219							or
2220
2221							sub only_spaces_and_digits {
2222							not grep { ![\d ] } @_
2223							}
2224							$ra->filter( \&only_spaces_and_digits );
2225
2226							These two examples will silently ignore faulty patterns, If you
2227							want the user to be made aware of the problem you should raise an
2228							error (via C or C), log an error message, whatever is
2229							best. If you want to remove a filter, pass C as a parameter.
2230
2231							$ra->filter(undef);
2232
2233							This method is chainable.
2234
2235							=head2 as_string
2236
2237							Assemble the expression and return it as a string. You may want to do
2238							this if you are writing the pattern to a file. The following arguments
2239							can be passed to control the aspect of the resulting pattern:
2240
2241							B, the number of spaces used to indent nested grouping of
2242							a pattern. Use this to produce a pretty-printed pattern (for some
2243							definition of "pretty"). The resulting output is rather verbose. The
2244							reason is to ensure that the metacharacters C<(?:> and C<)> always
2245							occur on otherwise empty lines. This allows you grep the result for an
2246							even more synthetic view of the pattern:
2247
2248							egrep -v '^ *[()]'
2249
2250							The result of the above is quite readable. Remember to backslash the
2251							spaces appearing in your own patterns if you wish to use an indented
2252							pattern in an C construct. Indenting is ignored if tracking
2253							is enabled.
2254
2255							The B argument takes precedence over the C
2256							method/attribute of the object.
2257
2258							Calling this
2259							method will drain the internal data structure. Large numbers of patterns
2260							can eat a significant amount of memory, and this lets perl recover the
2261							memory used for other purposes.
2262
2263							If you want to reduce the pattern I continue to add new patterns,
2264							clone the object and reduce the clone, leaving the original object intact.
2265
2266							=head2 re
2267
2268							Assembles the pattern and return it as a compiled RE, using the
2269							C operator.
2270
2271							As with C, calling this method will reset the internal data
2272							structures to free the memory used in assembling the RE.
2273
2274							The B attribute, documented in the C method, can be
2275							used here (it will be ignored if tracking is enabled).
2276
2277							With method chaining, it is possible to produce a RE without having
2278							a temporary C object lying around, I:
2279
2280							my $re = Regexp::Assemble->new
2281							->add( q[ab+cd+e] )
2282							->add( q[ac\\d+e] )
2283							->add( q[c\\d+e] )
2284							->re;
2285
2286							The C<$re> variable now contains a Regexp object that can be used
2287							directly:
2288
2289							while( <> ) {
2290							/$re/ and print "Something in [$_] matched\n";
2291							)
2292
2293							The C method is called when the object is used in string context
2294							(hence, within an C operator), so by and large you do not even
2295							need to save the RE in a separate variable. The following will work
2296							as expected:
2297
2298							my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299							while( ) {
2300							if( /($re)/ ) {
2301							print "Here be giants: $1\n";
2302							}
2303							}
2304
2305							This approach does not work with tracked patterns. The
2306							C and C methods must be used instead, see below.
2307
2308							=head2 match(SCALAR)
2309
2310							The following information applies to Perl 5.8 and below. See
2311							the section that follows for information on Perl 5.10.
2312
2313							If pattern tracking is in use, you must C in order
2314							to make things work correctly. At a minimum, this will make your
2315							code look like this:
2316
2317							my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318							if( $did_match ) {
2319							print "matched ", $ra->matched, "\n";
2320							}
2321
2322							(The main reason is that the C<$^R> variable is currently broken
2323							and an ugly workaround that runs some Perl code during the match
2324							is required, in order to simulate what C<$^R> should be doing. See
2325							Perl bug #32840 for more information if you are curious. The README
2326							also contains more information). This bug has been fixed in 5.10.
2327
2328							The important thing to note is that with C, THERE
2329							ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330							is this: if you do not have strict control over the patterns being
2331							fed to C when tracking is enabled, and someone
2332							slips you a pattern such as C and you
2333							attempt to match a string against the resulting pattern, you will
2334							know Fear and Loathing.
2335
2336							What is more, the C<$^R> workaround means that that tracking does
2337							not work if you perform a bare C pattern match as shown
2338							above. You have to instead call the C method, in order to
2339							supply the necessary context to take care of the tracking housekeeping
2340							details.
2341
2342							if( defined( my $match = $ra->match($_)) ) {
2343							print " $_ matched by $match\n";
2344							}
2345
2346							In the case of a successful match, the original matched pattern
2347							is returned directly. The matched pattern will also be available
2348							through the C method.
2349
2350							(Except that the above is not true for 5.6.0: the C method
2351							returns true or undef, and the C method always returns
2352							undef).
2353
2354							If you are capturing parts of the pattern I C
2355							you will want to get at the captures. See the C, C,
2356							C and C methods. If you are not using captures
2357							then you may safely ignore this section.
2358
2359							In 5.10, since the bug concerning C<$^R> has been resolved, there
2360							is no need to use C and the assembled pattern does
2361							not require any Perl code to be executed during the match.
2362
2363							=head2 new()
2364
2365							Creates a new C object. The following optional
2366							key/value parameters may be employed. All keys have a corresponding
2367							method that can be used to change the behaviour later on. As a
2368							general rule, especially if you're just starting out, you don't
2369							have to bother with any of these.
2370
2371							B, a family of optional attributes that allow anchors
2372							(C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373
2374							B, sets the C flags to add to the assembled regular
2375							expression. Warning: no error checking is done, you should ensure
2376							that the flags you pass are understood by the version of Perl you
2377							are using. B exists as an alias, for users familiar
2378							with L.
2379
2380							B, controls whether the pattern should be chomped before being
2381							lexed. Handy if you are reading patterns from a file. By default,
2382							Cing is performed (this behaviour changed as of version 0.24,
2383							prior versions did not chomp automatically).
2384							See also the C attribute and the C method.
2385
2386							B, slurp the contents of the specified file and add them
2387							to the assembly. Multiple files may be processed by using a list.
2388
2389							my $r = Regexp::Assemble->new(file => 're.list');
2390
2391							my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392
2393							If you really don't want chomping to occur, you will have to set
2394							the C attribute to 0 (zero). You may also want to look at
2395							the C attribute, as well.
2396
2397							B, controls what constitutes a record
2398							separator when using the C attribute or the C
2399							method. May be abbreviated to B. See the C<$/> variable in
2400							L.
2401
2402							B, controls whether the pattern should contain zero-width
2403							lookahead assertions (For instance: (?=[abc])(?:bob\|alice\|charles).
2404							This is not activated by default, because in many circumstances the
2405							cost of processing the assertion itself outweighs the benefit of
2406							its faculty for short-circuiting a match that will fail. This is
2407							sensitive to the probability of a match succeeding, so if you're
2408							worried about performance you'll have to benchmark a sample population
2409							of targets to see which way the benefits lie.
2410
2411							B, controls whether you want know which of the initial
2412							patterns was the one that matched. See the C method for
2413							more details. Note for version 5.8 of Perl and below, in this mode
2414							of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415							this entails. Perl 5.10 does not suffer from any such restriction.
2416
2417							B, the number of spaces used to indent nested grouping of
2418							a pattern. Use this to produce a pretty-printed pattern. See the
2419							C method for a more detailed explanation.
2420
2421							B, allows you to add a callback to enable sanity checks
2422							on the pattern being loaded. This callback is triggered before the
2423							pattern is split apart by the lexer. In other words, it operates
2424							on the entire pattern. If you are loading patterns from a file,
2425							this would be an appropriate place to remove comments.
2426
2427							B, allows you to add a callback to enable sanity checks on
2428							the pattern being loaded. This callback is triggered after the
2429							pattern has been split apart by the lexer.
2430
2431							B, controls whether to unroll, for example, C into
2432							C, C, which may allow additional reductions in the
2433							resulting assembled pattern.
2434
2435							B, controls whether tail reduction occurs or not. If set,
2436							patterns like C will be reduced to C.
2437							That is, the end of the pattern in each part of the b... and d...
2438							alternations is identical, and hence is hoisted out of the alternation
2439							and placed after it. On by default. Turn it off if you're really
2440							pressed for short assembly times.
2441
2442							B, specifies the pattern used to lex the input lines into
2443							tokens. You could replace the default pattern by a more sophisticated
2444							version that matches arbitrarily nested parentheses, for example.
2445
2446							B, controls whether copious amounts of output is produced
2447							during the loading stage or the reducing stage of assembly.
2448
2449							my $ra = Regexp::Assemble->new;
2450							my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451
2452							B, controls whether new patterns can be added to the object
2453							after the assembled pattern is generated. DEPRECATED.
2454
2455							This method/attribute will be removed in a future release. It doesn't
2456							really serve any purpose, and may be more effectively replaced by
2457							cloning an existing C object and spinning out a
2458							pattern from that instead.
2459
2460							=head2 source()
2461
2462							When using tracked mode, after a successful match is made, returns
2463							the original source pattern that caused the match. In Perl 5.10,
2464							the C<$^R> variable can be used to as an index to fetch the correct
2465							pattern from the object.
2466
2467							If no successful match has been performed, or the object is not in
2468							tracked mode, this method returns C.
2469
2470							my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471
2472							for my $w (qw(this food is rather barren)) {
2473							if ($w =~ /$r/) {
2474							print "$w matched by ", $r->source($^R), $/;
2475							}
2476							else {
2477							print "$w no match\n";
2478							}
2479							}
2480
2481							=head2 mbegin()
2482
2483							This method returns a copy of C<@-> at the moment of the
2484							last match. You should ordinarily not need to bother with
2485							this, C should be able to supply all your needs.
2486
2487							=head2 mend()
2488
2489							This method returns a copy of C<@+> at the moment of the
2490							last match.
2491
2492							=head2 mvar(NUMBER)
2493
2494							The C method returns the captures of the last match.
2495							C corresponds to $1, C to $2, and so on.
2496							C happens to return the target string matched,
2497							as a byproduct of walking down the C<@-> and C<@+> arrays
2498							after the match.
2499
2500							If called without a parameter, C will return a
2501							reference to an array containing all captures.
2502
2503							=head2 capture
2504
2505							The C method returns the the captures of the last
2506							match as an array. Unlink C, this method does not
2507							include the matched string. It is equivalent to getting an
2508							array back that contains C<$1, $2, $3, ...>.
2509
2510							If no captures were found in the match, an empty array is
2511							returned, rather than C. You are therefore guaranteed
2512							to be able to use C<< for my $c ($re->capture) { ... >>
2513							without have to check whether anything was captured.
2514
2515							=head2 matched()
2516
2517							If pattern tracking has been set, via the C attribute,
2518							or through the C method, this method will return the
2519							original pattern of the last successful match. Returns undef
2520							match has yet been performed, or tracking has not been enabled.
2521
2522							See below in the NOTES section for additional subtleties of
2523							which you should be aware of when tracking patterns.
2524
2525							Note that this method is not available in 5.6.0, due to
2526							limitations in the implementation of C<(?{...})> at the time.
2527
2528							=head2 Statistics/Reporting routines
2529
2530							=head2 stats_add
2531
2532							Returns the number of patterns added to the assembly (whether
2533							by C or C). Duplicate patterns are not included
2534							in this total.
2535
2536							=head2 stats_dup
2537
2538							Returns the number of duplicate patterns added to the assembly.
2539							If non-zero, this may be a sign that something is wrong with
2540							your data (or at the least, some needless redundancy). This may
2541							occur when you have two patterns (for instance, C and
2542							C) which map to the same result.
2543
2544							=head2 stats_raw()
2545
2546							Returns the raw number of bytes in the patterns added to the
2547							assembly. This includes both original and duplicate patterns.
2548							For instance, adding the two patterns C and C will
2549							count as 4 bytes.
2550
2551							=head2 stats_cooked()
2552
2553							Return the true number of bytes added to the assembly. This
2554							will not include duplicate patterns. Furthermore, it may differ
2555							from the raw bytes due to quotemeta treatment. For instance,
2556							C will count as 7 (not 8) bytes, because C<\,> will
2557							be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558							after the quotemeta directives are processed, C will be
2559							stored, for a total of 4 bytes.
2560
2561							=head2 stats_length()
2562
2563							Returns the length of the resulting assembled expression.
2564							Until C or C have been called, the length
2565							will be 0 (since the assembly will have not yet been
2566							performed). The length includes only the pattern, not the
2567							additional (C<(?-xism...>) fluff added by the compilation.
2568
2569							=head2 dup_warn(NUMBER\|CODEREF)
2570
2571							Turns warnings about duplicate patterns on or off. By
2572							default, no warnings are emitted. If the method is
2573							called with no parameters, or a true parameter,
2574							the object will carp about patterns it has
2575							already seen. To turn off the warnings, use 0 as a
2576							parameter.
2577
2578							$r->dup_warn();
2579
2580							The method may also be passed a code block. In this case
2581							the code will be executed and it will receive a reference
2582							to the object in question, and the lexed pattern.
2583
2584							$r->dup_warn(
2585							sub {
2586							my $self = shift;
2587							print $self->stats_add, " patterns added at line $.\n",
2588							join( '', @_ ), " added previously\n";
2589							}
2590							)
2591
2592							=head2 Anchor routines
2593
2594							Suppose you wish to assemble a series of patterns that all begin
2595							with C<^> and end with C<$> (anchor pattern to the beginning and
2596							end of line). Rather than add the anchors to each and every pattern
2597							(and possibly forget to do so when a new entry is added), you may
2598							specify the anchors in the object, and they will appear in the
2599							resulting pattern, and you no longer need to (or should) put them
2600							in your source patterns. For example, the two following snippets
2601							will produce identical patterns:
2602
2603							$r->add(qw(^this ^that ^them))->as_string;
2604
2605							$r->add(qw(this that them))->anchor_line_begin->as_string;
2606
2607							# both techniques will produce ^th(?:at\|em\|is)
2608
2609							All anchors are possible word (C<\b>) boundaries, line
2610							boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611							and C<\Z> (or C<\z> if you absolutely need it)).
2612
2613							The shortcut C> implies both
2614							C_begin> C_end>
2615							is also available. If different anchors are specified
2616							the most specific anchor wins. For instance, if both
2617							C and C are
2618							specified, C takes precedence.
2619
2620							All the anchor methods are chainable.
2621
2622							=head2 anchor_word_begin
2623
2624							The resulting pattern will be prefixed with a C<\b>
2625							word boundary assertion when the value is true. Set
2626							to 0 to disable.
2627
2628							$r->add('pre')->anchor_word_begin->as_string;
2629							# produces '\bpre'
2630
2631							=head2 anchor_word_end
2632
2633							The resulting pattern will be suffixed with a C<\b>
2634							word boundary assertion when the value is true. Set
2635							to 0 to disable.
2636
2637							$r->add(qw(ing tion))
2638							->anchor_word_end
2639							->as_string; # produces '(?:tion\|ing)\b'
2640
2641							=head2 anchor_word
2642
2643							The resulting pattern will be have C<\b>
2644							word boundary assertions at the beginning and end
2645							of the pattern when the value is true. Set
2646							to 0 to disable.
2647
2648							$r->add(qw(cat carrot)
2649							->anchor_word(1)
2650							->as_string; # produces '\bca(?:rro)t\b'
2651
2652							=head2 anchor_line_begin
2653
2654							The resulting pattern will be prefixed with a C<^>
2655							line boundary assertion when the value is true. Set
2656							to 0 to disable.
2657
2658							$r->anchor_line_begin;
2659							# or
2660							$r->anchor_line_begin(1);
2661
2662							=head2 anchor_line_end
2663
2664							The resulting pattern will be suffixed with a C<$>
2665							line boundary assertion when the value is true. Set
2666							to 0 to disable.
2667
2668							# turn it off
2669							$r->anchor_line_end(0);
2670
2671							=head2 anchor_line
2672
2673							The resulting pattern will be have the C<^> and C<$>
2674							line boundary assertions at the beginning and end
2675							of the pattern, respectively, when the value is true. Set
2676							to 0 to disable.
2677
2678							$r->add(qw(cat carrot)
2679							->anchor_line
2680							->as_string; # produces '^ca(?:rro)t$'
2681
2682							=head2 anchor_string_begin
2683
2684							The resulting pattern will be prefixed with a C<\A>
2685							string boundary assertion when the value is true. Set
2686							to 0 to disable.
2687
2688							$r->anchor_string_begin(1);
2689
2690							=head2 anchor_string_end
2691
2692							The resulting pattern will be suffixed with a C<\Z>
2693							string boundary assertion when the value is true. Set
2694							to 0 to disable.
2695
2696							# disable the string boundary end anchor
2697							$r->anchor_string_end(0);
2698
2699							=head2 anchor_string_end_absolute
2700
2701							The resulting pattern will be suffixed with a C<\z>
2702							string boundary assertion when the value is true. Set
2703							to 0 to disable.
2704
2705							# disable the string boundary absolute end anchor
2706							$r->anchor_string_end_absolute(0);
2707
2708							If you don't understand the difference between
2709							C<\Z> and C<\z>, the former will probably do what
2710							you want.
2711
2712							=head2 anchor_string
2713
2714							The resulting pattern will be have the C<\A> and C<\Z>
2715							string boundary assertions at the beginning and end
2716							of the pattern, respectively, when the value is true. Set
2717							to 0 to disable.
2718
2719							$r->add(qw(cat carrot)
2720							->anchor_string
2721							->as_string; # produces '\Aca(?:rro)t\Z'
2722
2723							=head2 anchor_string_absolute
2724
2725							The resulting pattern will be have the C<\A> and C<\z>
2726							string boundary assertions at the beginning and end
2727							of the pattern, respectively, when the value is true. Set
2728							to 0 to disable.
2729
2730							$r->add(qw(cat carrot)
2731							->anchor_string_absolute
2732							->as_string; # produces '\Aca(?:rro)t\z'
2733
2734							=head2 debug(NUMBER)
2735
2736							Turns debugging on or off. Statements are printed
2737							to the currently selected file handle (STDOUT by default).
2738							If you are already using this handle, you will have to
2739							arrange to select an output handle to a file of your own
2740							choosing, before call the C, C or C)
2741							functions, otherwise it will scribble all over your
2742							carefully formatted output.
2743
2744							=over 4
2745
2746							=item * 0
2747
2748							Off. Turns off all debugging output.
2749
2750							=item * 1
2751
2752							Add. Trace the addition of patterns.
2753
2754							=item * 2
2755
2756							Reduce. Trace the process of reduction and assembly.
2757
2758							=item * 4
2759
2760							Lex. Trace the lexing of the input patterns into its constituent
2761							tokens.
2762
2763							=item * 8
2764
2765							Time. Print to STDOUT the time taken to load all the patterns. This is
2766							nothing more than the difference between the time the object was
2767							instantiated and the time reduction was initiated.
2768
2769							# load=
2770
2771							Any lengthy computation performed in the client code will be reflected
2772							in this value. Another line will be printed after reduction is
2773							complete.
2774
2775							# reduce=
2776
2777							The above output lines will be changed to C and
2778							C if the internal state of the object is corrupted
2779							and the initial timestamp is lost.
2780
2781							The code attempts to load L in order to report fractional
2782							seconds. If this is not successful, the elapsed time is displayed
2783							in whole seconds.
2784
2785							=back
2786
2787							Values can be added (or or'ed together) to trace everything
2788
2789							$r->debug(7)->add( '\\d+abc' );
2790
2791							Calling C with no arguments turns debugging off.
2792
2793							=head2 dump()
2794
2795							Produces a synthetic view of the internal data structure. How
2796							to interpret the results is left as an exercise to the reader.
2797
2798							print $r->dump;
2799
2800							=head2 chomp(0\|1)
2801
2802							Turns chomping on or off.
2803
2804							IMPORTANT: As of version 0.24, chomping is now on by default as it
2805							makes C Just Work. The only time you may run into trouble
2806							is with C. So don't do that, or else explicitly turn
2807							off chomping.
2808
2809							To avoid incorporating (spurious)
2810							record separators (such as "\n" on Unix) when reading from a file,
2811							C Cs its input. If you don't want this to happen,
2812							call C with a false value.
2813
2814							$re->chomp(0); # really want the record separators
2815							$re->add();
2816
2817							=head2 fold_meta_pairs(NUMBER)
2818
2819							Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820							are folded into a C<.> (dot). Folding happens by default (for
2821							reasons of backwards compatibility, even though it is wrong when
2822							the C expression modifier is active).
2823
2824							Call this method with a false value to prevent this behaviour (which
2825							is only a problem when dealing with C<\n> if the C expression
2826							modifier is also set).
2827
2828							$re->add( '\\w', '\\W' );
2829							my $clone = $re->clone;
2830
2831							$clone->fold_meta_pairs(0);
2832							print $clone->as_string; # prints '.'
2833							print $re->as_string; # print '[\W\w]'
2834
2835							=head2 indent(NUMBER)
2836
2837							Sets the level of indent for pretty-printing nested groups
2838							within a pattern. See the C method for more details.
2839							When called without a parameter, no indenting is performed.
2840
2841							$re->indent( 4 );
2842							print $re->as_string;
2843
2844							=head2 lookahead(0\|1)
2845
2846							Turns on zero-width lookahead assertions. This is usually
2847							beneficial when you expect that the pattern will usually fail.
2848							If you expect that the pattern will usually match you will
2849							probably be worse off.
2850
2851							=head2 flags(STRING)
2852
2853							Sets the flags that govern how the pattern behaves (for
2854							versions of Perl up to 5.9 or so, these are C). By
2855							default no flags are enabled.
2856
2857							=head2 modifiers(STRING)
2858
2859							An alias of the C method, for users familiar with
2860							C.
2861
2862							=head2 track(0\|1)
2863
2864							Turns tracking on or off. When this attribute is enabled,
2865							additional housekeeping information is inserted into the
2866							assembled expression using C<({...}> embedded code
2867							constructs. This provides the necessary information to
2868							determine which, of the original patterns added, was the
2869							one that caused the match.
2870
2871							$re->track( 1 );
2872							if( $target =~ /$re/ ) {
2873							print "$target matched by ", $re->matched, "\n";
2874							}
2875
2876							Note that when this functionality is enabled, no
2877							reduction is performed and no character classes are
2878							generated. In other words, C is not
2879							reduced down to C<(?:br\|t)ag> and C is not
2880							reduced to C.
2881
2882							=head2 unroll_plus(0\|1)
2883
2884							Turns the unrolling of plus metacharacters on or off. When
2885							a pattern is broken up, C becomes C, C (and
2886							C becomes C, C. This may allow the freed C
2887							to assemble with other patterns. Not enabled by default.
2888
2889							=head2 lex(SCALAR)
2890
2891							Change the pattern used to break a string apart into tokens.
2892							You can examine the C script as a starting point.
2893
2894							=head2 reduce(0\|1)
2895
2896							Turns pattern reduction on or off. A reduced pattern may
2897							be considerably shorter than an unreduced pattern. Consider
2898							C I C. An unreduced
2899							pattern will be very similar to those produced by
2900							C. Reduction is on by default. Turning
2901							it off speeds assembly (but assembly is pretty fast -- it's
2902							the breaking up of the initial patterns in the lexing stage
2903							that can consume a non-negligible amount of time).
2904
2905							=head2 mutable(0\|1)
2906
2907							This method has been marked as DEPRECATED. It will be removed
2908							in a future release. See the C method for a technique
2909							to replace its functionality.
2910
2911							=head2 reset()
2912
2913							Empties out the patterns that have been Ced or C-ed
2914							into the object. Does not modify the state of controller attributes
2915							such as C, C, C and the like.
2916
2917							=head2 Default_Lexer
2918
2919							B the C function is a class method, not
2920							an object method. It is a fatal error to call it as an object
2921							method.
2922
2923							The C method lets you replace the default pattern
2924							used for all subsequently created C objects. It
2925							will not have any effect on existing objects. (It is also possible
2926							to override the lexer pattern used on a per-object basis).
2927
2928							The parameter should be an ordinary scalar, not a compiled
2929							pattern. If the pattern fails to match all parts of the string,
2930							the missing parts will be returned as single chunks. Therefore
2931							the following pattern is legal (albeit rather cork-brained):
2932
2933							Regexp::Assemble::Default_Lexer( '\\d' );
2934
2935							The above pattern will split up input strings digit by digit, and
2936							all non-digit characters as single chunks.
2937
2938							=head1 DIAGNOSTICS
2939
2940							"Cannot pass a C to Default_Lexer"
2941
2942							You tried to replace the default lexer pattern with an object
2943							instead of a scalar. Solution: You probably tried to call
2944							C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945							C.
2946
2947							"filter method not passed a coderef"
2948
2949							"pre_filter method not passed a coderef"
2950
2951							A reference to a subroutine (anonymous or otherwise) was expected.
2952							Solution: read the documentation for the C method.
2953
2954							"duplicate pattern added: /.../"
2955
2956							The C attribute is active, and a duplicate pattern was
2957							added (well duh!). Solution: clean your data.
2958
2959							"cannot open [file] for input: [reason]"
2960
2961							The C method was unable to open the specified file for
2962							whatever reason. Solution: make sure the file exists and the script
2963							has the required privileges to read it.
2964
2965							=head1 NOTES
2966
2967							This module has been tested successfully with a range of versions
2968							of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969
2970							The expressions produced by this module can be used with the PCRE
2971							library.
2972
2973							Remember to "double up" your backslashes if the patterns are
2974							hard-coded as constants in your program. That is, you should
2975							literally C rather than C. It
2976							usually will work either way, but it's good practice to do so.
2977
2978							Where possible, supply the simplest tokens possible. Don't add
2979							C when C will do. The reason is that
2980							if you also add C the resulting assembly changes
2981							dramatically: C I
2982							C. Since R::A doesn't perform enough analysis,
2983							it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984							the divergence after the first C<-d\d+>.
2985
2986							Furthermore, when the string 'X-123000P' is matched against the
2987							first assembly, the regexp engine will have to backtrack over each
2988							alternation (the one that ends in Y B the one that ends in Z)
2989							before determining that there is no match. No such backtracking
2990							occurs in the second pattern: as soon as the engine encounters the
2991							'P' in the target string, neither of the alternations at that point
2992							(C<-\d+Y> or C) could succeed and so the match fails.
2993
2994							C does, however, know how to build character
2995							classes. Given C, C and C, it will assemble these
2996							into C. When C<-> (dash) appears as a candidate for a
2997							character class it will be the first character in the class. When
2998							C<^> (circumflex) appears as a candidate for a character class it
2999							will be the last character in the class.
3000
3001							It also knows about meta-characters than can "absorb" regular
3002							characters. For instance, given C and C, it knows that
3003							C<5> can be represented by C<\d> and so the assembly is just C.
3004							The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005							and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006							C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007							if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008
3009							C deals correctly with C's propensity
3010							to backslash many characters that have no need to be. Backslashes on
3011							non-metacharacters will be removed. Similarly, in character classes,
3012							a number of characters lose their magic and so no longer need to be
3013							backslashed within a character class. Two common examples are C<.>
3014							(dot) and C<$>. Such characters will lose their backslash.
3015
3016							At the same time, it will also process C<\Q...\E> sequences. When
3017							such a sequence is encountered, the inner section is extracted and
3018							C is applied to the section. The resulting quoted text
3019							is then used in place of the original unquoted text, and the C<\Q>
3020							and C<\E> metacharacters are thrown away. Similar processing occurs
3021							with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022							effects when using a dispatch table. In this case, you will need
3023							to know exactly what the module makes of your input. Use the C
3024							method to find out what's going on:
3025
3026							$pattern = join( '', @{$re->lexstr($pattern)} );
3027
3028							If all the digits 0..9 appear in a character class, C
3029							will replace them by C<\d>. I'd do it for letters as well, but
3030							thinking about accented characters and other glyphs hurts my head.
3031
3032							In an alternation, the longest paths are chosen first (for example,
3033							C). When two paths have the same length, the path
3034							with the most subpaths will appear first. This aims to put the
3035							"busiest" paths to the front of the alternation. For example, the
3036							list C, C, C, C and C will produce the
3037							pattern C<(?:f(?:ew\|ig\|un)\|b(?:ad\|it))>. See F for a
3038							real-world example of how alternations are sorted. Once you have
3039							looked at that, everything should be crystal clear.
3040
3041							When tracking is in use, no reduction is performed. nor are
3042							character classes formed. The reason is that it is
3043							too difficult to determine the original pattern afterwards. Consider the
3044							two patterns C and C. These should be reduced to
3045							C. The final character matches one of two possibilities.
3046							To resolve whether it matched an C<'e'> or C<'m'> would require
3047							keeping track of the fact that the pattern finished up in a character
3048							class, which would the require a whole lot more work to figure out
3049							which character of the class matched. Without character classes
3050							it becomes much easier. Instead, C is produced, which
3051							lets us find out more simply where we ended up.
3052
3053							Similarly, C and C should form C<(?:dog\|sea)food>.
3054							When the pattern is being assembled, the tracking decision needs
3055							to be made at the end of the grouping, but the tail of the pattern
3056							has not yet been visited. Deferring things to make this work correctly
3057							is a vast hassle. In this case, the pattern becomes merely
3058							C<(?:dogfood\|seafood>. Tracked patterns will therefore be bulkier than
3059							simple patterns.
3060
3061							There is an open bug on this issue:
3062
3063							L
3064
3065							If this bug is ever resolved, tracking would become much easier to
3066							deal with (none of the C hassle would be required - you could
3067							just match like a regular RE and it would Just Work).
3068
3069							=head1 SEE ALSO
3070
3071							=over 4
3072
3073							=item L
3074
3075							General information about Perl's regular expressions.
3076
3077							=item L
3078
3079							Specific information about C.
3080
3081							=item Regex::PreSuf
3082
3083							C takes a string and chops it itself into tokens of
3084							length 1. Since it can't deal with tokens of more than one character,
3085							it can't deal with meta-characters and thus no regular expressions.
3086							Which is the main reason why I wrote this module.
3087
3088							=item Regexp::Optimizer
3089
3090							C produces regular expressions that are similar to
3091							those produced by R::A with reductions switched off. It's biggest
3092							drawback is that it is exponentially slower than Regexp::Assemble on
3093							very large sets of patterns.
3094
3095							=item Regexp::Parser
3096
3097							Fine grained analysis of regular expressions.
3098
3099							=item Regexp::Trie
3100
3101							Funnily enough, this was my working name for C
3102							during its development. I changed the name because I thought it
3103							was too obscure. Anyway, C does much the same as
3104							C and C except that it runs
3105							much faster (according to the author). It does not recognise
3106							meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107
3108							=item Text::Trie
3109
3110							C is well worth investigating. Tries can outperform very
3111							bushy (read: many alternations) patterns.
3112
3113							=item Tree::Trie
3114
3115							C is another module that builds tries. The algorithm that
3116							C uses appears to be quite similar to the
3117							algorithm described therein, except that C solves its
3118							end-marker problem without having to rewrite the leaves.
3119
3120							=back
3121
3122							=head1 LIMITATIONS
3123
3124							Some mildly complex cases are not handled well. See examples/failure.01.pl
3125							and L.
3126
3127							does not attempt to find common substrings. For
3128							instance, it will not collapse C down to C.
3129							If there's a module out there that performs this sort of string
3130							analysis I'd like to know about it. But keep in mind that the
3131							algorithms that do this are very expensive: quadratic or worse.
3132
3133							C does not interpret meta-character modifiers.
3134							For instance, if the following two patterns are
3135							given: C and C, it will not determine that C<\d> can be
3136							matched by C<\d+>. Instead, it will produce C. Along
3137							a similar line of reasoning, it will not determine that C and
3138							C is equivalent to C (It will produce C
3139							instead).
3140
3141							You cannot remove a pattern that has been added to an object. You'll
3142							just have to start over again. Adding a pattern is difficult enough,
3143							I'd need a solid argument to convince me to add a C method.
3144							If you need to do this you should read the documentation for the
3145							C method.
3146
3147							C does not (yet)? employ the C<(?E...)>
3148							construct.
3149
3150							The module does not produce POSIX-style regular expressions. This
3151							would be quite easy to add, if there was a demand for it.
3152
3153							=head1 BUGS
3154
3155							Patterns that generate look-ahead assertions sometimes produce
3156							incorrect patterns in certain obscure corner cases. If you
3157							suspect that this is occurring in your pattern, disable
3158							lookaheads.
3159
3160							Tracking doesn't really work at all with 5.6.0. It works better
3161							in subsequent 5.6 releases. For maximum reliability, the use of
3162							a 5.8 release is strongly recommended. Tracking barely works with
3163							5.005_04. Of note, using C<\d>-style meta-characters invariably
3164							causes panics. Tracking really comes into its own in Perl 5.10.
3165
3166							If you feed C patterns with nested parentheses,
3167							there is a chance that the resulting pattern will be uncompilable
3168							due to mismatched parentheses (not enough closing parentheses). This
3169							is normal, so long as the default lexer pattern is used. If you want
3170							to find out which pattern among a list of 3000 patterns are to blame
3171							(speaking from experience here), the F script offers
3172							a strategy for pinpointing the pattern at fault. While you may not
3173							be able to use the script directly, the general approach is easy to
3174							implement.
3175
3176							The algorithm used to assemble the regular expressions makes extensive
3177							use of mutually-recursive functions (that is, A calls B, B calls
3178							A, ...) For deeply similar expressions, it may be possible to provoke
3179							"Deep recursion" warnings.
3180
3181							The module has been tested extensively, and has an extensive test
3182							suite (that achieves close to 100% statement coverage), but you
3183							never know... A bug may manifest itself in two ways: creating a
3184							pattern that cannot be compiled, such as C, or a pattern
3185							that compiles correctly but that either matches things it shouldn't,
3186							or doesn't match things it should. It is assumed that Such problems
3187							will occur when the reduction algorithm encounters some sort of
3188							edge case. A temporary work-around is to disable reductions:
3189
3190							my $pattern = $assembler->reduce(0)->re;
3191
3192							A discussion about implementation details and where bugs might lurk
3193							appears in the README file. If this file is not available locally,
3194							you should be able to find a copy on the Web at your nearest CPAN
3195							mirror.
3196
3197							Seriously, though, a number of people have been using this module to
3198							create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3199							be working according to spec. Thus, I don't think there are any serious
3200							bugs remaining.
3201
3202							If you are feeling brave, extensive debugging traces are available to
3203							figure out where assembly goes wrong.
3204
3205							Please report all bugs at
3206							L
3207
3208							Make sure you include the output from the following two commands:
3209
3210							perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3211							perl -V
3212
3213							There is a mailing list for the discussion of C.
3214							Subscription details are available at
3215							L.
3216
3217							=head1 ACKNOWLEDGEMENTS
3218
3219							This module grew out of work I did building access maps for Postfix,
3220							a modern SMTP mail transfer agent. See L
3221							for more information. I used Perl to build large regular expressions
3222							for blocking dynamic/residential IP addresses to cut down on spam
3223							and viruses. Once I had the code running for this, it was easy to
3224							start adding stuff to block really blatant spam subject lines, bogus
3225							HELO strings, spammer mailer-ids and more...
3226
3227							I presented the work at the French Perl Workshop in 2004, and the
3228							thing most people asked was whether the underlying mechanism for
3229							assembling the REs was available as a module. At that time it was
3230							nothing more that a twisty maze of scripts, all different. The
3231							interest shown indicated that a module was called for. I'd like to
3232							thank the people who showed interest. Hey, it's going to make I
3233							messy scripts smaller, in any case.
3234
3235							Thomas Drugeon was a valuable sounding board for trying out
3236							early ideas. Jean Forget and Philippe Blayo looked over an early
3237							version. H.Merijn Brandt stopped over in Paris one evening, and
3238							discussed things over a few beers.
3239
3240							Nicholas Clark pointed out that while what this module does
3241							(?:c\|sh)ould be done in perl's core, as per the 2004 TODO, he
3242							encouraged me to continue with the development of this module. In
3243							any event, this module allows one to gauge the difficulty of
3244							undertaking the endeavour in C. I'd rather gouge my eyes out with
3245							a blunt pencil.
3246
3247							Paul Johnson settled the question as to whether this module should
3248							live in the Regex:: namespace, or Regexp:: namespace. If you're
3249							not convinced, try running the following one-liner:
3250
3251							perl -le 'print ref qr//'
3252
3253							Philippe Bruhat found a couple of corner cases where this module
3254							could produce incorrect results. Such feedback is invaluable,
3255							and only improves the module's quality.
3256
3257							=head1 Machine-Readable Change Log
3258
3259							The file Changes was converted into Changelog.ini by L.
3260
3261							=head1 AUTHOR
3262
3263							David Landgren
3264
3265							Copyright (C) 2004-2011. All rights reserved.
3266
3267							http://www.landgren.net/perl/
3268
3269							If you use this module, I'd love to hear about what you're using
3270							it for. If you want to be informed of updates, send me a note.
3271
3272							Ron Savage is co-maint of the module, starting with V 0.36.
3273
3274							=head1 Repository
3275
3276							L
3277
3278							=head1 TODO
3279
3280							1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3281							produces (?:conten[dt]\|resend[dt]) but it is possible to produce
3282							(?:cont\|res)en[dt] if one can spot the common tail nodes (and walk back
3283							the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3284
3285							To do this requires a certain amount of restructuring of the code.
3286							Currently, the algorithm uses a two-phase approach. In the first
3287							phase, the trie is traversed and reductions are performed. In the
3288							second phase, the reduced trie is traversed and the pattern is
3289							emitted.
3290
3291							What has to occur is that the reduction and emission have to occur
3292							together. As a node is completed, it is replaced by its string
3293							representation. This then allows child nodes to be compared for
3294							equality with a simple 'eq'. Since there is only a single traversal,
3295							the overall generation time might drop, even though the context
3296							baggage required to delve through the tree will be more expensive
3297							to carry along (a hash rather than a couple of scalars).
3298
3299							Actually, a simpler approach is to take on a secret sentinel
3300							atom at the end of every pattern, which gives the reduction
3301							algorithm sufficient traction to create a perfect trie.
3302
3303							I'm rewriting the reduction code using this technique.
3304
3305							2. Investigate how (?>foo) works. Can it be applied?
3306
3307							5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3308
3309							6. Store callbacks per tracked pattern.
3310
3311							12. utf-8... hmmmm...
3312
3313							14. Adding qr//'ed patterns. For example, consider
3314							$r->add ( qr/^abc/i )
3315							->add( qr/^abd/ )
3316							->add( qr/^ab e/x );
3317							this should admit abc abC aBc aBC abd abe as matches
3318
3319							16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3320							can't happen. (Possibly carp if it does appear during traversal)?
3321
3322							17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3323							along the lines of (.{4))(\1+) would let the regexp engine
3324							itself be brought to bear on the matter, which is a rather
3325							appealing idea. Consider
3326
3327							while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3328
3329							as a starting point.
3330
3331							19. The reduction code has become unbelievably baroque. Adding code
3332							to handle (sting,singing,sing) => s(?:(?:ing)?\|t)ing was far
3333							too difficult. Adding more stuff just breaks existing behaviour.
3334							And fixing the ^abcd$ ... bug broke stuff all over again.
3335							Now that the corner cases are more clearly identified, a full
3336							rewrite of the reduction code is needed. And would admit the
3337							possibility of implementing items 1 and 17.
3338
3339							20. Handle debug unrev with a separate bit
3340
3341							23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3342							regexp
3343
3344							24. Lookahead assertions contain serious bugs (as shown by
3345							assembling powersets. Need to save more context during reduction,
3346							which in turn will simplify the preparation of the lookahead
3347							classes. See also 19.
3348
3349							26. _lex() swamps the overall run-time. It stems from the decision
3350							to use a single regexp to pull apart any pattern. A suite of
3351							simpler regexp to pick of parens, char classes, quantifiers
3352							and bare tokens may be faster. (This has been implemented as
3353							_fastlex(), but it's only marginally faster. Perhaps split-by-
3354							char and lex a la C?
3355
3356							27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3357
3358							28. We don't reroll unrolled a a* to a+ in indented or tracked
3359							output
3360
3361							29. Use (MARK n) in blead for tracked patterns, and use (FAIL) for
3362							the unmatchable pattern.
3363
3364							=head1 LICENSE
3365
3366							This library is free software; you can redistribute it and/or modify
3367							it under the same terms as Perl itself.
3368
3369							=cut
3370
3371							# Return a +ve value to tell Perl the module is ready to go.
3372
3373							'The Lusty Decadent Delights of Imperial Pompeii';