File Coverage

blib/lib/Regexp/Assemble.pm
Criterion Covered Total %
statement 1149 1155 99.4
branch 782 796 98.2
condition 164 177 92.6
subroutine 98 98 100.0
pod 49 49 100.0
total 2242 2275 98.5


line stmt bran cond sub pod time code
1             # Regexp::Assemple.pm
2             #
3             # Copyright (c) 2004-2011 David Landgren
4             # All rights reserved
5              
6             package Regexp::Assemble;
7              
8 11     11   338167 use strict;
  11         19  
  11         359  
9 11     11   50 use warnings;
  11         16  
  11         386  
10              
11 11     11   48 use constant DEBUG_ADD => 1;
  11         17  
  11         905  
12 11     11   53 use constant DEBUG_TAIL => 2;
  11         14  
  11         580  
13 11     11   54 use constant DEBUG_LEX => 4;
  11         15  
  11         600  
14 11     11   62 use constant DEBUG_TIME => 8;
  11         39  
  11         663  
15              
16 11     11   57 use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
  11         15  
  11         52151  
17              
18             # The following patterns were generated with examples/naive.
19              
20             $Default_Lexer = qr/(?![[(\\]).(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?|\\(?:[bABCEGLQUXZ]|[lu].|(?:[^\w]|[aefnrtdDwWsS]|c.|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}})|N\{\w+\}|[Pp](?:\{\w+\}|.))(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?)|\[.*?(?
21              
22             $Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]|c.|[^\w\/{|}-]|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}}))|[^\$^])$/;
23              
24             # The pattern to return when nothing has been added (and thus not match anything)
25              
26             $Always_Fail = "^\\b\0";
27              
28             our $VERSION = '0.36';
29              
30             # ------------------------------------------------
31              
32             sub new {
33 2197     2197 1 1878970 my $class = shift;
34 2197         5102 my %args = @_;
35              
36 2197         2196 my $anc;
37 2197         3799 for $anc (qw(word line string)) {
38 6591 100       15952 if (exists $args{"anchor_$anc"}) {
39 135         267 my $val = delete $args{"anchor_$anc"};
40 135         397 for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41 270 100       928 $args{$anchor} = $val unless exists $args{$anchor};
42             }
43             }
44             }
45              
46             # anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47 2197 100       5147 if (exists $args{anchor_string_absolute}) {
48 3         5 my $val = delete $args{anchor_string_absolute};
49 3         4 for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50 6 100       16 $args{$anchor} = $val unless exists $args{$anchor};
51             }
52             }
53              
54 2197   100     49118 exists $args{$_} or $args{$_} = 0 for qw(
55             anchor_word_begin
56             anchor_word_end
57             anchor_line_begin
58             anchor_line_end
59             anchor_string_begin
60             anchor_string_end
61             anchor_string_end_absolute
62             debug
63             dup_warn
64             indent
65             lookahead
66             mutable
67             track
68             unroll_plus
69             );
70              
71 2197   100     14315 exists $args{$_} or $args{$_} = 1 for qw(
72             fold_meta_pairs
73             reduce
74             chomp
75             );
76              
77 2197         6180 @args{qw(re str path)} = (undef, undef, []);
78              
79 2197   100     13933 $args{flags} ||= delete $args{modifiers} || '';
      100        
80 2197 100       4428 $args{lex} = $Current_Lexer if defined $Current_Lexer;
81              
82 2197         3852 my $self = bless \%args, $class;
83              
84 2197 100       5412 if ($self->_debug(DEBUG_TIME)) {
85 1         5 $self->_init_time_func();
86 1         5 $self->{_begin_time} = $self->{_time_func}->();
87             }
88             $self->{input_record_separator} = delete $self->{rs}
89 2197 100       4659 if exists $self->{rs};
90 2197 100       3947 exists $self->{file} and $self->add_file($self->{file});
91              
92 2196         5466 return $self;
93             }
94              
95             sub _init_time_func {
96 9     9   20 my $self = shift;
97 9 100       22 return if exists $self->{_time_func};
98              
99             # attempt to improve accuracy
100 6 100       16 if (!defined($self->{_use_time_hires})) {
101 5         14 eval {require Time::HiRes};
  5         1223  
102 5         2751 $self->{_use_time_hires} = $@;
103             }
104             $self->{_time_func} = length($self->{_use_time_hires}) > 0
105 4     4   10 ? sub { time }
106 6 100       41 : \&Time::HiRes::time
107             ;
108             }
109              
110             sub clone {
111 55     55 1 518 my $self = shift;
112 55         70 my $clone;
113 55         354 my @attr = grep {$_ ne 'path'} keys %$self;
  1314         1927  
114 55         145 @{$clone}{@attr} = @{$self}{@attr};
  55         640  
  55         159  
115 55         177 $clone->{path} = _path_clone($self->_path);
116 55         600 bless $clone, ref($self);
117             }
118              
119             sub _fastlex {
120 884     884   821 my $self = shift;
121 884         776 my $record = shift;
122 884         700 my $len = 0;
123 884         1022 my @path = ();
124 884         806 my $case = '';
125 884         671 my $qm = '';
126              
127 884         941 my $debug = $self->{debug} & DEBUG_LEX;
128 884         788 my $unroll_plus = $self->{unroll_plus};
129              
130 884         737 my $token;
131             my $qualifier;
132 884 100       3118 $debug and print "# _lex <$record>\n";
133 884         880 my $modifier = q{(?:[*+?]\\??|\\{(?:\\d+(?:,\d*)?|,\d+)\\}\\??)?};
134 884         2410 my $class_matcher = qr/\[(?:\[:[a-z]+:\]|\\?.)*?\]/;
135 884         3362 my $paren_matcher = qr/\(.*?(?
136 884         2553 my $misc_matcher = qr/(?:(c)(.)|(0)(\d{2}))($modifier)/;
137 884         2373 my $regular_matcher = qr/([^\\[(])($modifier)/;
138 884         1432 my $qm_matcher = qr/(\\?.)/;
139              
140 884         805 my $matcher = $regular_matcher;
141             {
142 884 100       675 if ($record =~ /\G$matcher/gc) {
  5960 100       24296  
    100          
    100          
143             # neither a \\ nor [ nor ( followed by a modifer
144 3344 100 100     9624 if ($1 eq '\\E') {
    100 66        
145 12 100       118 $debug and print "# E\n";
146 12         18 $case = $qm = '';
147 12         13 $matcher = $regular_matcher;
148 12         14 redo;
149             }
150             elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151 5 100       13 $debug and print "# ignore \\L, \\U\n";
152 5         14 redo;
153             }
154 3327         3049 $token = $1;
155 3327 100       4401 $qualifier = defined $2 ? $2 : '';
156 3327 100       7522 $debug and print "# token <$token> <$qualifier>\n";
157 3327 100       3555 if ($qm) {
158 90         95 $token = quotemeta($token);
159 90         134 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
160             }
161             else {
162 3237         3770 $token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163             }
164 3327 100 100     5559 if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165 22 100       56 $1 and $qualifier .= $1;
166 22 100       885 $debug and print " unroll <$token><$token><$qualifier>\n";
167 22 100       53 $case and $token = $case eq 'L' ? lc($token) : uc($token);
    100          
168 22         86 push @path, $token, "$token$qualifier";
169             }
170             else {
171 3305 100       5888 $debug and print " clean <$token>\n";
172 3305 100       6703 push @path,
    100          
173             $case eq 'L' ? lc($token).$qualifier
174             : $case eq 'U' ? uc($token).$qualifier
175             : $token.$qualifier
176             ;
177             }
178 3327         2892 redo;
179             }
180              
181             elsif ($record =~ /\G\\/gc) {
182 1680 100       3447 $debug and print "# backslash\n";
183             # backslash
184 1680 100       13206 if ($record =~ /\G([sdwSDW])($modifier)/gc) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
185 443         616 ($token, $qualifier) = ($1, $2);
186 443 100       871 $debug and print "# meta <$token> <$qualifier>\n";
187 443 100 100     1094 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
188             ? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189             : "\\$token$qualifier";
190             }
191             elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192 9 100       294 $debug and print "# x $1\n";
193 9         37 $token = quotemeta(chr(hex($1)));
194 9         13 $qualifier = $2;
195 9 100       284 $debug and print "# cooked <$token>\n";
196 9         31 $token =~ s/^\\([^\w$()*+.?\[\\\]^|{\/])$/$1/; # } balance
197 9 100       307 $debug and print "# giving <$token>\n";
198 9 100 100     67 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
199             ? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200             : "$token$qualifier";
201             }
202             elsif ($record =~ /\GQ/gc) {
203 26 100       232 $debug and print "# Q\n";
204 26         30 $qm = 1;
205 26         31 $matcher = $qm_matcher;
206             }
207             elsif ($record =~ /\G([LU])/gc) {
208 15 100       413 $debug and print "# case $1\n";
209 15         33 $case = $1;
210             }
211             elsif ($record =~ /\GE/gc) {
212 6 100       17 $debug and print "# E\n";
213 6         9 $case = $qm = '';
214 6         10 $matcher = $regular_matcher;
215             }
216             elsif ($record =~ /\G([lu])(.)/gc) {
217 6 100       18 $debug and print "# case $1 to <$2>\n";
218 6 100       35 push @path, $1 eq 'l' ? lc($2) : uc($2);
219             }
220 30         40 elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221 6 50       12 if ($] < 5.007) {
222 0         0 my $len = 0;
223 0         0 $len += length($_) for @arg;
224 0 0       0 $debug and print "# pos ", pos($record), " fixup add $len\n";
225 0         0 pos($record) = pos($record) + $len;
226             }
227 6         8 my $directive = shift @arg;
228 6 100       30 if ($directive eq 'c') {
229 3 100       10 $debug and print "# ctrl <@arg>\n";
230 3         8 push @path, "\\c" . uc(shift @arg);
231             }
232             else { # elsif ($directive eq '0') {
233 3 100       10 $debug and print "# octal <@arg>\n";
234 3         10 my $ascii = oct(shift @arg);
235 3 100       11 push @path, ($ascii < 32)
236             ? "\\c" . chr($ascii+64)
237             : chr($ascii)
238             ;
239             }
240 6         9 $path[-1] .= join( '', @arg ); # if @arg;
241 6         10 redo;
242             }
243             elsif ($record =~ /\G(.)/gc) {
244 1165         1712 $token = $1;
245 1165         2273 $token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@|/^]}{\\$token};
246 1165 100       1698 $debug and print "# meta <$token>\n";
247 1165         1741 push @path, $token;
248             }
249             else {
250 4 100       19 $debug and print "# ignore char at ", pos($record), " of <$record>\n";
251             }
252 1674         2203 redo;
253             }
254              
255             elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256             # [class] followed by a modifer
257 39         65 my $class = $1;
258 39 50       82 my $qualifier = defined $2 ? $2 : '';
259 39 100       375 $debug and print "# class begin <$class> <$qualifier>\n";
260 39 100       115 if ($class =~ /\A\[\\?(.)]\Z/) {
261 12         25 $class = quotemeta $1;
262 12         23 $class =~ s{\A\\([!@%])\Z}{$1};
263 12 100       143 $debug and print "# class unwrap $class\n";
264             }
265 39 100       353 $debug and print "# class end <$class> <$qualifier>\n";
266 39 100 100     178 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
267             ? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268             : "$class$qualifier";
269 39         58 redo;
270             }
271              
272             elsif ($record =~ /\G($paren_matcher)/gc) {
273 13 100       35 $debug and print "# paren <$1>\n";
274             # (paren) followed by a modifer
275 13         34 push @path, $1;
276 13         22 redo;
277             }
278              
279             }
280 884         3622 return \@path;
281             }
282              
283             sub _lex {
284 211     211   359 my $self = shift;
285 211         220 my $record = shift;
286 211         205 my $len = 0;
287 211         282 my @path = ();
288 211         214 my $case = '';
289 211         201 my $qm = '';
290             my $re = defined $self->{lex} ? $self->{lex}
291 211 50       556 : defined $Current_Lexer ? $Current_Lexer
    100          
292             : $Default_Lexer;
293 211         258 my $debug = $self->{debug} & DEBUG_LEX;
294 211 100       2018 $debug and print "# _lex <$record>\n";
295 211         196 my ($token, $next_token, $diff, $token_len);
296 211         3693 while( $record =~ /($re)/g ) {
297 480         939 $token = $1;
298 480         646 $token_len = length($token);
299 480 100       8723 $debug and print "# lexed <$token> len=$token_len\n";
300 480 100       1066 if( pos($record) - $len > $token_len ) {
301 15         19 $next_token = $token;
302 15         32 $token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303 15 100       814 $debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304 15         23 $len += $diff;
305             }
306 480         450 $len += $token_len;
307             TOKEN: {
308 480 100       406 if( substr( $token, 0, 1 ) eq '\\' ) {
  495         980  
309 226 100       1132 if( $token =~ /^\\([ELQU])$/ ) {
    100          
    100          
310 51 100       195 if( $1 eq 'E' ) {
    100          
311             $qm and $re = defined $self->{lex} ? $self->{lex}
312 12 50       47 : defined $Current_Lexer ? $Current_Lexer
    100          
    100          
313             : $Default_Lexer;
314 12         27 $case = $qm = '';
315             }
316             elsif( $1 eq 'Q' ) {
317 19         28 $qm = $1;
318             # switch to a more precise lexer to quotemeta individual characters
319 19         71 $re = qr/\\?./;
320             }
321             else {
322 20         101 $case = $1;
323             }
324 51 100       1687 $debug and print "# state change qm=<$qm> case=<$case>\n";
325 51         716 goto NEXT_TOKEN;
326             }
327             elsif( $token =~ /^\\([lu])(.)$/ ) {
328 3 100       100 $debug and print "# apply case=<$1> to <$2>\n";
329 3 100       19 push @path, $1 eq 'l' ? lc($2) : uc($2);
330 3         44 goto NEXT_TOKEN;
331             }
332             elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333 41         113 $token = quotemeta(chr(hex($1)));
334 41 100       229 $debug and print "# cooked <$token>\n";
335 41         84 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
336 41 100       165 $debug and print "# giving <$token>\n";
337             }
338             else {
339 131         229 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
340 131 100       662 $debug and print "# backslashed <$token>\n";
341             }
342             }
343             else {
344 269 100       480 $case and $token = $case eq 'U' ? uc($token) : lc($token);
    100          
345 269 100       434 $qm and $token = quotemeta($token);
346 269 100       466 $token = '\\/' if $token eq '/';
347             }
348             # undo quotemeta's brute-force escapades
349 441 100       736 $qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
350 441 100       7576 $debug and print "# <$token> case=<$case> qm=<$qm>\n";
351 441         801 push @path, $token;
352              
353             NEXT_TOKEN:
354 495 100       4271 if( defined $next_token ) {
355 15 100       937 $debug and print "# redo <$next_token>\n";
356 15         27 $token = $next_token;
357 15         16 $next_token = undef;
358 15         20 redo TOKEN;
359             }
360             }
361             }
362 211 100       378 if( $len < length($record) ) {
363             # NB: the remainder only arises in the case of degenerate lexer,
364             # and if \Q is operative, the lexer will have been switched to
365             # /\\?./, which means there can never be a remainder, so we
366             # don't have to bother about quotemeta. In other words:
367             # $qm will never be true in this block.
368 7         16 my $remain = substr($record,$len);
369 7 100       22 $case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
    100          
370 7 100       630 $debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371 7         26 push @path, $remain;
372             }
373 211 100       2056 $debug and print "# _lex out <@path>\n";
374 211         1540 return \@path;
375             }
376              
377             sub add {
378 1069     1069 1 13649 my $self = shift;
379 1069         928 my $record;
380 1069         1391 my $debug = $self->{debug} & DEBUG_LEX;
381 1069         2449 while( defined( $record = shift @_ )) {
382 2279 100       4411 CORE::chomp($record) if $self->{chomp};
383 2279 100 100     4812 next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384 2278 100       6925 $debug and print "# add <$record>\n";
385 2278         2692 $self->{stats_raw} += length $record;
386             my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387 2278 100       14642 ? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
    100          
388             : [split //, $record]
389             ;
390 2278 100 100     5794 next if $self->{filter} and not $self->{filter}->(@$list);
391 2277         4091 $self->_insertr( $list );
392             }
393 1069         2305 return $self;
394             }
395              
396             sub add_file {
397 13     13 1 19 my $self = shift;
398 13         14 my $rs;
399             my @file;
400 13 100       37 if (ref($_[0]) eq 'HASH') {
401 6         10 my $arg = shift;
402             $rs = $arg->{rs}
403             || $arg->{input_record_separator}
404             || $self->{input_record_separator}
405 6   66     40 || $/;
406             @file = ref($arg->{file}) eq 'ARRAY'
407 4         13 ? @{$arg->{file}}
408 6 100       21 : $arg->{file};
409             }
410             else {
411 7   66     29 $rs = $self->{input_record_separator} || $/;
412 7         18 @file = @_;
413             }
414 13         52 local $/ = $rs;
415 13         14 my $file;
416 13         24 for $file (@file) {
417 15 100       1531 open my $fh, '<', $file or do {
418 1         13 require Carp;
419 1         247 Carp::croak("cannot open $file for input: $!");
420             };
421 14         23739 while (defined (my $rec = <$fh>)) {
422 55         132 $self->add($rec);
423             }
424 14         171 close $fh;
425             }
426 12         74 return $self;
427             }
428              
429             sub insert {
430 3398     3398 1 12413 my $self = shift;
431 3398 100 100     7647 return if $self->{filter} and not $self->{filter}->(@_);
432 3397         21858 $self->_insertr( [@_] );
433 3397         7273 return $self;
434             }
435              
436             sub _insertr {
437 7084     7084   6984 my $self = shift;
438 7084   100     20848 my $dup = $self->{stats_dup} || 0;
439 7084         11252 $self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440 7084 100 100     18935 if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
    100          
441 7073         8313 ++$self->{stats_add};
442 7073 100       5888 $self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
  7073         37982  
443             }
444             elsif( $self->{dup_warn} ) {
445 4 100       10 if( ref $self->{dup_warn} eq 'CODE' ) {
446 2         7 $self->{dup_warn}->($self, $_[0]);
447             }
448             else {
449 2         2 my $pattern = join( '', @{$_[0]} );
  2         5  
450 2         11 require Carp;
451 2         36 Carp::carp("duplicate pattern added: /$pattern/");
452             }
453             }
454 7084         18152 $self->{str} = $self->{re} = undef;
455             }
456              
457             sub lexstr {
458 2     2 1 13 return shift->_lex(shift);
459             }
460              
461             sub pre_filter {
462 3     3 1 623 my $self = shift;
463 3         7 my $pre_filter = shift;
464 3 100 100     19 if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465 1         5 require Carp;
466 1         69 Carp::croak("pre_filter method not passed a coderef");
467             }
468 2         2 $self->{pre_filter} = $pre_filter;
469 2         6 return $self;
470             }
471              
472              
473             sub filter {
474 4     4 1 307 my $self = shift;
475 4         7 my $filter = shift;
476 4 100 100     22 if( defined $filter and ref($filter) ne 'CODE' ) {
477 1         9 require Carp;
478 1         166 Carp::croak("filter method not passed a coderef");
479             }
480 3         7 $self->{filter} = $filter;
481 3         9 return $self;
482             }
483              
484             sub as_string {
485 800     800 1 1388 my $self = shift;
486 800 100       1513 if( not defined $self->{str} ) {
487 798 100       1356 if( $self->{track} ) {
488 8         22 $self->{m} = undef;
489 8         23 $self->{mcount} = 0;
490 8         23 $self->{mlist} = [];
491 8         27 $self->{str} = _re_path_track($self, $self->_path, '', '');
492             }
493             else {
494 790 100 100     4327 $self->_reduce unless ($self->{mutable} or not $self->{reduce});
495 790         1217 my $arg = {@_};
496             $arg->{indent} = $self->{indent}
497 790 100 100     3689 if not exists $arg->{indent} and $self->{indent} > 0;
498 790 100 100     2595 if( exists $arg->{indent} and $arg->{indent} > 0 ) {
    100          
499 42         81 $arg->{depth} = 0;
500 42         101 $self->{str} = _re_path_pretty($self, $self->_path, $arg);
501             }
502             elsif( $self->{lookahead} ) {
503 35         90 $self->{str} = _re_path_lookahead($self, $self->_path);
504             }
505             else {
506 713         1356 $self->{str} = _re_path($self, $self->_path);
507             }
508             }
509 798 100       1835 if (not length $self->{str}) {
510             # explicitly fail to match anything if no pattern was generated
511 9         19 $self->{str} = $Always_Fail;
512             }
513             else {
514             my $begin =
515             $self->{anchor_word_begin} ? '\\b'
516             : $self->{anchor_line_begin} ? '^'
517 789 100       2495 : $self->{anchor_string_begin} ? '\A'
    100          
    100          
518             : ''
519             ;
520             my $end =
521             $self->{anchor_word_end} ? '\\b'
522             : $self->{anchor_line_end} ? '$'
523             : $self->{anchor_string_end} ? '\Z'
524 789 100       2439 : $self->{anchor_string_end_absolute} ? '\z'
    100          
    100          
    100          
525             : ''
526             ;
527 789         1520 $self->{str} = "$begin$self->{str}$end";
528             }
529 798 100       2024 $self->{path} = [] unless $self->{mutable};
530             }
531 800         4493 return $self->{str};
532             }
533              
534             sub re {
535 122     122 1 1345 my $self = shift;
536 122 100       486 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
537 122         644 return $self->{re};
538             }
539              
540             use overload '""' => sub {
541 2131     2131   692647 my $self = shift;
542 2131 100       17310 return $self->{re} if $self->{re};
543 489         972 $self->_build_re($self->as_string());
544 489         6245 return $self->{re};
545 11     11   16230 };
  11         12073  
  11         128  
546              
547             sub _build_re {
548 618     618   658 my $self = shift;
549 618         610 my $str = shift;
550 618 100       1161 if( $self->{track} ) {
551 11     11   1059 use re 'eval';
  11         18  
  11         128533  
552             $self->{re} = length $self->{flags}
553 8 100       1571 ? qr/(?$self->{flags}:$str)/
554             : qr/$str/
555             ;
556             }
557             else {
558             # how could I not repeat myself?
559             $self->{re} = length $self->{flags}
560 610 100       13474 ? qr/(?$self->{flags}:$str)/
561             : qr/$str/
562             ;
563             }
564             }
565              
566             sub match {
567 29     29 1 6342 my $self = shift;
568 29         47 my $target = shift;
569 29 100       140 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
570 29         64 $self->{m} = undef;
571 29         61 $self->{mvar} = [];
572 29 100       1014 if( not $target =~ /$self->{re}/ ) {
573 8         23 $self->{mbegin} = [];
574 8         26 $self->{mend} = [];
575 8         57 return undef;
576             }
577 21 50       92 $self->{m} = $^R if $] >= 5.009005;
578 21         148 $self->{mbegin} = _path_copy([@-]);
579 21         132 $self->{mend} = _path_copy([@+]);
580 21         52 my $n = 0;
581 21         81 for( my $n = 0; $n < @-; ++$n ) {
582 43 100 66     257 push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
  33         281  
583             if defined $-[$n] and defined $+[$n];
584             }
585 21 100       63 if( $self->{track} ) {
586 20 50       190 return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587             }
588             else {
589 1         56 return 1;
590             }
591             }
592              
593             sub source {
594 4     4 1 387 my $self = shift;
595 4 100       20 return unless $self->{track};
596 3 100       17 defined($_[0]) and return $self->{mlist}[$_[0]];
597 2 100       15 return unless defined $self->{m};
598 1         6 return $self->{mlist}[$self->{m}];
599             }
600              
601             sub mbegin {
602 3     3 1 12 my $self = shift;
603 3 100       30 return exists $self->{mbegin} ? $self->{mbegin} : [];
604             }
605              
606             sub mend {
607 3     3 1 7 my $self = shift;
608 3 100       58 return exists $self->{mend} ? $self->{mend} : [];
609             }
610              
611             sub mvar {
612 19     19 1 35 my $self = shift;
613 19 100       68 return undef unless exists $self->{mvar};
614 18 100       134 return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615             }
616              
617             sub capture {
618 5     5 1 20 my $self = shift;
619 5 100       19 if( $self->{mvar} ) {
620 4         6 my @capture = @{$self->{mvar}};
  4         15  
621 4         8 shift @capture;
622 4         20 return @capture;
623             }
624 1         8 return ();
625             }
626              
627             sub matched {
628 9     9 1 476 my $self = shift;
629 9 100       82 return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630             }
631              
632             sub stats_add {
633 2     2 1 6 my $self = shift;
634 2   100     10 return $self->{stats_add} || 0;
635             }
636              
637             sub stats_dup {
638 2     2 1 3 my $self = shift;
639 2   100     11 return $self->{stats_dup} || 0;
640             }
641              
642             sub stats_raw {
643 2     2 1 4 my $self = shift;
644 2   100     12 return $self->{stats_raw} || 0;
645             }
646              
647             sub stats_cooked {
648 2     2 1 5 my $self = shift;
649 2   100     11 return $self->{stats_cooked} || 0;
650             }
651              
652             sub stats_length {
653 6     6 1 2618 my $self = shift;
654 6 100 100     41 return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655             }
656              
657             sub dup_warn {
658 5     5 1 1250 my $self = shift;
659 5 100       13 $self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660 5         11 return $self;
661             }
662              
663             sub anchor_word_begin {
664 5     5 1 6 my $self = shift;
665 5 100       16 $self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666 5         17 return $self;
667             }
668              
669             sub anchor_word_end {
670 4     4 1 3 my $self = shift;
671 4 100       6 $self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672 4         9 return $self;
673             }
674              
675             sub anchor_word {
676 2     2 1 3 my $self = shift;
677 2         2 my $state = shift;
678 2         6 $self->anchor_word_begin($state)->anchor_word_end($state);
679 2         6 return $self;
680             }
681              
682             sub anchor_line_begin {
683 4     4 1 4 my $self = shift;
684 4 100       12 $self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685 4         7 return $self;
686             }
687              
688             sub anchor_line_end {
689 2     2 1 2 my $self = shift;
690 2 100       6 $self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691 2         2 return $self;
692             }
693              
694             sub anchor_line {
695 2     2 1 3 my $self = shift;
696 2         2 my $state = shift;
697 2         5 $self->anchor_line_begin($state)->anchor_line_end($state);
698 2         5 return $self;
699             }
700              
701             sub anchor_string_begin {
702 277     277 1 199 my $self = shift;
703 277 100       453 $self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704 277         411 return $self;
705             }
706              
707             sub anchor_string_end {
708 276     276 1 203 my $self = shift;
709 276 100       421 $self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710 276         215 return $self;
711             }
712              
713             sub anchor_string_end_absolute {
714 3     3 1 4 my $self = shift;
715 3 100       8 $self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716 3         8 return $self;
717             }
718              
719             sub anchor_string {
720 274     274 1 248 my $self = shift;
721 274 100       406 my $state = defined($_[0]) ? $_[0] : 1;
722 274         387 $self->anchor_string_begin($state)->anchor_string_end($state);
723 274         685 return $self;
724             }
725              
726             sub anchor_string_absolute {
727 2     2 1 3 my $self = shift;
728 2 100       7 my $state = defined($_[0]) ? $_[0] : 1;
729 2         6 $self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730 2         6 return $self;
731             }
732              
733             sub debug {
734 605     605 1 2217 my $self = shift;
735 605 100       1241 $self->{debug} = defined($_[0]) ? $_[0] : 0;
736 605 100       911 if ($self->_debug(DEBUG_TIME)) {
737             # hmm, debugging time was switched on after instantiation
738 4         12 $self->_init_time_func;
739 4         14 $self->{_begin_time} = $self->{_time_func}->();
740             }
741 605         774 return $self;
742             }
743              
744             sub dump {
745 9     9 1 1429 return _dump($_[0]->_path);
746             }
747              
748             sub chomp {
749 22     22 1 584 my $self = shift;
750 22 100       56 $self->{chomp} = defined($_[0]) ? $_[0] : 1;
751 22         48 return $self;
752             }
753              
754             sub fold_meta_pairs {
755 5     5 1 7 my $self = shift;
756 5 100       14 $self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757 5         10 return $self;
758             }
759              
760             sub indent {
761 4     4 1 593 my $self = shift;
762 4 100       11 $self->{indent} = defined($_[0]) ? $_[0] : 0;
763 4         8 return $self;
764             }
765              
766             sub lookahead {
767 22     22 1 32 my $self = shift;
768 22 100       82 $self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769 22         65 return $self;
770             }
771              
772             sub flags {
773 24     24 1 1190 my $self = shift;
774 24 100       85 $self->{flags} = defined($_[0]) ? $_[0] : '';
775 24         84 return $self;
776             }
777              
778             sub modifiers {
779 4     4 1 1130 my $self = shift;
780 4         9 return $self->flags(@_);
781             }
782              
783             sub track {
784 5     5 1 1181 my $self = shift;
785 5 100       20 $self->{track} = defined($_[0]) ? $_[0] : 1;
786 5         12 return $self;
787             }
788              
789             sub unroll_plus {
790 2     2 1 840 my $self = shift;
791 2 100       8 $self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792 2         8 return $self;
793             }
794              
795             sub lex {
796 1     1 1 3 my $self = shift;
797 1         52 $self->{lex} = qr($_[0]);
798 1         6 return $self;
799             }
800              
801             sub reduce {
802 19     19 1 975 my $self = shift;
803 19 100       74 $self->{reduce} = defined($_[0]) ? $_[0] : 1;
804 19         70 return $self;
805             }
806              
807             sub mutable {
808 5     5 1 905 my $self = shift;
809 5 100       21 $self->{mutable} = defined($_[0]) ? $_[0] : 1;
810 5         9 return $self;
811             }
812              
813             sub reset {
814             # reinitialise the internal state of the object
815 19     19 1 2451 my $self = shift;
816 19         45 $self->{path} = [];
817 19         48 $self->{re} = undef;
818 19         30 $self->{str} = undef;
819 19         82 return $self;
820             }
821              
822             sub Default_Lexer {
823 4 100   4 1 3135 if( $_[0] ) {
824 3 100       13 if( my $refname = ref($_[0]) ) {
825 1         11 require Carp;
826 1         162 Carp::croak("Cannot pass a $refname to Default_Lexer");
827             }
828 2         5 $Current_Lexer = $_[0];
829             }
830 3 100       15 return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831             }
832              
833             # --- no user serviceable parts below ---
834              
835             # -- debug helpers
836              
837             sub _debug {
838 12316     12316   10213 my $self = shift;
839 12316 100       37393 return $self->{debug} & shift() ? 1 : 0;
840             }
841              
842             # -- helpers
843              
844             sub _path {
845             # access the path
846 10027     10027   23461 return $_[0]->{path};
847             }
848              
849             # -- the heart of the matter
850              
851             $have_Storable = do {
852             eval {
853             require Storable;
854             import Storable 'dclone';
855             };
856             $@ ? 0 : 1;
857             };
858              
859             sub _path_clone {
860 55 100   55   3533 $have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861             }
862              
863             sub _path_copy {
864 80     80   661 my $path = shift;
865 80         88 my $new = [];
866 80         197 for( my $p = 0; $p < @$path; ++$p ) {
867 201 100       400 if( ref($path->[$p]) eq 'HASH' ) {
    100          
868 9         16 push @$new, _node_copy($path->[$p]);
869             }
870             elsif( ref($path->[$p]) eq 'ARRAY' ) {
871 3         6 push @$new, _path_copy($path->[$p]);
872             }
873             else {
874 189         438 push @$new, $path->[$p];
875             }
876             }
877 80         189 return $new;
878             }
879              
880             sub _node_copy {
881 11     11   11 my $node = shift;
882 11         12 my $new = {};
883 11         33 while( my( $k, $v ) = each %$node ) {
884 27 100       49 $new->{$k} = defined($v)
885             ? _path_copy($v)
886             : undef
887             ;
888             }
889 11         26 return $new;
890             }
891              
892             sub _insert_path {
893 7158     7158   6708 my $self = shift;
894 7158         5836 my $list = shift;
895 7158         6522 my $debug = shift;
896 7158         5917 my @in = @{shift()}; # create a new copy
  7158         17541  
897 7158 100       13612 if( @$list == 0 ) { # special case the first time
898 2033 100 100     9831 if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
      66        
      66        
899 33         221 return [{'' => undef}];
900             }
901             else {
902 2000         4686 return \@in;
903             }
904             }
905 5125 100       9268 $debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
  151         235  
  151         176  
906 5125         4905 my $path = $list;
907 5125         4172 my $offset = 0;
908 5125         4019 my $token;
909 5125 100       8533 if( not @in ) {
910 2 100       6 if( ref($list->[0]) ne 'HASH' ) {
911 1         4 return [ { '' => undef, $list->[0] => $list } ];
912             }
913             else {
914 1         2 $list->[0]{''} = undef;
915 1         2 return $list;
916             }
917             }
918 5123         10827 while( defined( $token = shift @in )) {
919 17747 100       34171 if( ref($token) eq 'HASH' ) {
920 282 100       551 $debug and print "# p0=", _dump($path), "\n";
921 282         777 $path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922 282 100       540 $debug and print "# p1=", _dump($path), "\n";
923 282         354 last;
924             }
925 17465 100       27747 if( ref($path->[$offset]) eq 'HASH' ) {
926 3714 100       5657 $debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
  54         220  
927 3714         3558 my $node = $path->[$offset];
928 3714 100       5625 if( exists( $node->{$token} )) {
929 2632 100       4425 if ($offset < $#$path) {
930             my $new = {
931             $token => [$token, @in],
932 1         5 _re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
  1         3  
933             };
934 1         4 splice @$path, $offset, @$path-$offset, $new;
935 1         2 last;
936             }
937             else {
938 2631 100       4083 $debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
  31         64  
939 2631         2892 $path = $node->{$token};
940 2631         2190 $offset = 0;
941 2631         2992 redo;
942             }
943             }
944             else {
945 1082 100       1797 $debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
  23         52  
  23         45  
  23         1101  
946 1082 100       1793 if( $offset == $#$path ) {
947 1075         3063 $node->{$token} = [ $token, @in ];
948             }
949             else {
950             my $new = {
951             _node_key($token) => [ $token, @in ],
952 7         20 _node_key($node) => [@{$path}[$offset..$#{$path}]],
  7         26  
  7         14  
953             };
954 7         27 splice( @$path, $offset, @$path - $offset, $new );
955 7 100       21 $debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
  1         2  
  1         2  
956             }
957 1082         1410 last;
958             }
959             }
960              
961 13751 100       18804 if( $debug ) {
962 306         289 my $msg = '';
963 306         223 my $n;
964 306         533 for( $n = 0; $n < @$path; ++$n ) {
965 1093 100       1385 $msg .= ' ' if $n;
966             my $atom = ref($path->[$n]) eq 'HASH'
967 1093 100       1396 ? '{'.join( ' ', keys(%{$path->[$n]})).'}'
  81         209  
968             : $path->[$n]
969             ;
970 1093 100       2432 $msg .= $n == $offset ? "<$atom>" : $atom;
971             }
972 306         12480 print "# at path ($msg)\n";
973             }
974              
975 13751 100       33357 if( $offset >= @$path ) {
    100          
    100          
976 732         3047 push @$path, { $token => [ $token, @in ], '' => undef };
977 732 100       1680 $debug and print "# added remaining @{[_dump($path)]}\n";
  21         48  
978 732         953 last;
979             }
980             elsif( $token ne $path->[$offset] ) {
981 2103 100       6803 $debug and print "# token $token not present\n";
982             splice @$path, $offset, @$path-$offset, {
983             length $token
984             ? ( _node_key($token) => [$token, @in])
985             : ( '' => undef )
986             ,
987 2103 100       5961 $path->[$offset] => [@{$path}[$offset..$#{$path}]],
  2103         8690  
  2103         3045  
988             };
989 2103 100       4418 $debug and print "# path=@{[_dump($path)]}\n";
  79         119  
990 2103         2404 last;
991             }
992             elsif( not @in ) {
993 923 100       1952 $debug and print "# last token to add\n";
994 923 100       2171 if( defined( $path->[$offset+1] )) {
995 912         971 ++$offset;
996 912 100       1655 if( ref($path->[$offset]) eq 'HASH' ) {
997 118 100       322 $debug and print "# add sentinel to node\n";
998 118         216 $path->[$offset]{''} = undef;
999             }
1000             else {
1001 794 100       1397 $debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002             splice @$path, $offset, @$path-$offset, {
1003             '' => undef,
1004 794         2166 $path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
  794         3758  
  794         1262  
1005             };
1006             }
1007             }
1008             else {
1009             # already seen this pattern
1010 11         22 ++$self->{stats_dup};
1011             }
1012 923         1370 last;
1013             }
1014             # if we get here then @_ still contains a token
1015 9993         16401 ++$offset;
1016             }
1017 5123         11451 $list;
1018             }
1019              
1020             sub _insert_node {
1021 282     282   310 my $self = shift;
1022 282         283 my $path = shift;
1023 282         263 my $offset = shift;
1024 282         306 my $token = shift;
1025 282         270 my $debug = shift;
1026 282         379 my $path_end = [@{$path}[$offset..$#{$path}]];
  282         609  
  282         420  
1027             # NB: $path->[$offset] and $[path_end->[0] are equivalent
1028 282         751 my $token_key = _re_path($self, [$token]);
1029 282 100       726 $debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
  26         47  
  26         52  
1030 26         48 " at path=@{[_dump($path_end)]}\n";
1031 282 100       720 if( ref($path_end->[0]) eq 'HASH' ) {
1032 195 100       684 if( exists($path_end->[0]{$token_key}) ) {
    100          
1033 25 100       70 if( @$path_end > 1 ) {
1034 2         9 my $path_key = _re_path($self, [$path_end->[0]]);
1035 2         12 my $new = {
1036             $path_key => [ @$path_end ],
1037             $token_key => [ $token, @_ ],
1038             };
1039 2 100       8 $debug and print "# +bifurcate new=@{[_dump($new)]}\n";
  1         3  
1040 2         39 splice( @$path, $offset, @$path_end, $new );
1041             }
1042             else {
1043 23         41 my $old_path = $path_end->[0]{$token_key};
1044 23         36 my $new_path = [];
1045 23   100     115 while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046 30 100       111 $debug and print "# identical nodes in sub_path ",
    100          
1047             ref($token) ? _dump($token) : $token, "\n";
1048 30         58 push @$new_path, shift(@$old_path);
1049 30         97 $token = shift @_;
1050             }
1051 23 50       72 if( @$new_path ) {
1052 23         74 my $new;
1053 23         29 my $token_key = $token;
1054 23 100       46 if( @_ ) {
1055 6         13 $new = {
1056             _re_path($self, $old_path) => $old_path,
1057             $token_key => [$token, @_],
1058             };
1059 6 100       16 $debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
  1         2  
1060             }
1061             else {
1062 17 100       43 $debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
  5         9  
1063 17 100       46 if( @$old_path ) {
1064 11         67 $new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065             }
1066             else {
1067 6         19 $new = { '' => undef, $token => [$token] };
1068             }
1069             }
1070 23         47 push @$new_path, $new;
1071             }
1072 23         36 $path_end->[0]{$token_key} = $new_path;
1073 23 100       59 $debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
  6         9  
1074 23         68 splice( @$path, $offset, @$path_end, @$path_end );
1075             }
1076             }
1077             elsif( not _node_eq( $path_end->[0], $token )) {
1078 70 100       149 if( @$path_end > 1 ) {
1079 11         36 my $path_key = _re_path($self, [$path_end->[0]]);
1080 11         74 my $new = {
1081             $path_key => [ @$path_end ],
1082             $token_key => [ $token, @_ ],
1083             };
1084 11 100       69 $debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
  1         3  
1085 11         44 splice( @$path, $offset, @$path_end, $new );
1086             }
1087             else {
1088 59 100       121 $debug and print "# next in path is node, trivial insert at $token_key\n";
1089 59         212 $path_end->[0]{$token_key} = [$token, @_];
1090 59         157 splice( @$path, $offset, @$path_end, @$path_end );
1091             }
1092             }
1093             else {
1094 100   100     392 while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095 131 100       419 $debug and print "# identical nodes @{[_dump([$token])]}\n";
  9         21  
1096 131         187 shift @$path_end;
1097 131         254 $token = shift @_;
1098 131         350 ++$offset;
1099             }
1100 100 100       202 if( @$path_end ) {
1101 57 100       130 $debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
  4         9  
  4         8  
1102 57         318 $path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103 57 100       149 $debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
  4         15  
  4         8  
1104 57         175 splice( @$path, $offset, @$path - $offset, @$path_end );
1105 57 100       150 $debug and print "# got final=@{[_dump($path)]}\n";
  4         9  
1106             }
1107             else {
1108 43         113 $token_key = _node_key($token);
1109 43         212 my $new = {
1110             '' => undef,
1111             $token_key => [ $token, @_ ],
1112             };
1113 43 100       129 $debug and print "# convert opt @{[_dump($new)]}\n";
  3         5  
1114 43         101 push @$path, $new;
1115             }
1116             }
1117             }
1118             else {
1119 87 100       172 if( @$path_end ) {
1120 74         356 my $new = {
1121             $path_end->[0] => [ @$path_end ],
1122             $token_key => [ $token, @_ ],
1123             };
1124 74 100       165 $debug and print "# atom->node @{[_dump($new)]}\n";
  5         8  
1125 74         205 splice( @$path, $offset, @$path_end, $new );
1126 74 100       180 $debug and print "# out=@{[_dump($path)]}\n";
  5         9  
1127             }
1128             else {
1129 13 100       34 $debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
  4         10  
1130 13         55 push @$path, {
1131             '' => undef,
1132             $token_key => [ $token, @_ ],
1133             };
1134             }
1135             }
1136 282         952 $path;
1137             }
1138              
1139             sub _reduce {
1140 810     810   994 my $self = shift;
1141 810         1384 my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142              
1143 810 100       1563 if ($self->_debug(DEBUG_TIME)) {
1144 4         6 $self->_init_time_func;
1145 4         10 my $now = $self->{_time_func}->();
1146 4 100       14 if (exists $self->{_begin_time}) {
1147 3         302 printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148             }
1149             else {
1150 1         44 printf "# load-epoch=%0.6f\n", $now;
1151             }
1152 4         17 $self->{_begin_time} = $self->{_time_func}->();
1153             }
1154              
1155 810         1316 my ($head, $tail) = _reduce_path( $self->_path, $context );
1156 810 100       1632 $context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157 810 100       1356 if( !@$head ) {
1158 651         991 $self->{path} = $tail;
1159             }
1160             else {
1161             $self->{path} = [
1162 159         345 @{_unrev_path( $tail, $context )},
1163 159         174 @{_unrev_path( $head, $context )},
  159         249  
1164             ];
1165             }
1166              
1167 810 100       1961 if ($self->_debug(DEBUG_TIME)) {
1168 4         10 my $now = $self->{_time_func}->();
1169 4 50       9 if (exists $self->{_begin_time}) {
1170 4         329 printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171             }
1172             else {
1173 0         0 printf "# reduce-epoch=%0.6f\n", $now;
1174             }
1175 4         16 $self->{_begin_time} = $self->{_time_func}->();
1176             }
1177              
1178 810 100       1695 $context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179 810         1566 return $self;
1180             }
1181              
1182             sub _remove_optional {
1183 1710 100   1710   3395 if( exists $_[0]->{''} ) {
1184 350         633 delete $_[0]->{''};
1185 350         545 return 1;
1186             }
1187 1360         1598 return 0;
1188             }
1189              
1190             sub _reduce_path {
1191 810     810   925 my ($path, $ctx) = @_;
1192 810         1504 my $indent = ' ' x $ctx->{depth};
1193 810         823 my $debug = $ctx->{debug};
1194 810 100       1420 $debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195 810         700 my $new;
1196 810         960 my $head = [];
1197 810         1010 my $tail = [];
1198 810         1897 while( defined( my $p = pop @$path )) {
1199 1159 100       1946 if( ref($p) eq 'HASH' ) {
1200 514         937 my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201 514 100       1496 $debug and print "#$indent| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202 514 100       1233 push @$head, @$node_head if scalar @$node_head;
1203 514 100       4140 push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204             }
1205             else {
1206 645 100       943 if( @$head ) {
1207 125 100       257 $debug and print "#$indent| push $p leaves @{[_dump($path)]}\n";
  7         38  
1208 125         383 push @$tail, $p;
1209             }
1210             else {
1211 520 100       1135 $debug and print "#$indent| unshift $p\n";
1212 520         1573 unshift @$tail, $p;
1213             }
1214             }
1215             }
1216 20         278 $debug and print "#$indent| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217 810 100       1511 (ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
  18 100       894  
1218             "\n";
1219 810 100 100     2566 if( @$tail > 1
      100        
1220             and ref($tail->[0]) eq 'HASH'
1221 97         437 and keys %{$tail->[0]} == 2
1222             ) {
1223 72         81 my $opt;
1224             my $fixed;
1225 72         107 while( my ($key, $path) = each %{$tail->[0]} ) {
  216         586  
1226 144 100       262 $debug and print "#$indent| scan k=$key p=@{[_dump($path)]}\n";
  14         29  
1227 144 100       287 next unless $path;
1228 123 100 100     415 if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229 8         21 $opt = $path->[0];
1230             }
1231             else {
1232 115         175 $fixed = $path;
1233             }
1234             }
1235 72 100       197 if( exists $tail->[0]{''} ) {
1236 21         45 my $path = [@{$tail}[1..$#{$tail}]];
  21         67  
  21         39  
1237 21         41 $tail = $tail->[0];
1238 21         63 ($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239 21         83 $tail = [$tail, @$path];
1240             }
1241             }
1242 810 100       1396 $debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243 810         1473 return ($head, $tail);
1244             }
1245              
1246             sub _reduce_node {
1247 1080     1080   1338 my ($node, $ctx) = @_;
1248 1080         2204 my $indent = ' ' x $ctx->{depth};
1249 1080         1100 my $debug = $ctx->{debug};
1250 1080         1513 my $optional = _remove_optional($node);
1251 1080 100       1935 $debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
  57         101  
1252 1080 100 100     2500 if( $optional and scalar keys %$node == 1 ) {
1253 70         128 my $path = (values %$node)[0];
1254 70 100       105 if( not grep { ref($_) eq 'HASH' } @$path ) {
  117         311  
1255             # if we have removed an optional, and there is only one path
1256             # left then there is nothing left to compare. Because of the
1257             # optional it cannot participate in any further reductions.
1258             # (unless we test for equality among sub-trees).
1259 61         197 my $result = {
1260             '' => undef,
1261             $path->[0] => $path
1262             };
1263 61 100       121 $debug and print "#$indent| fast fail @{[_dump($result)]}\n";
  2         5  
1264 61         155 return [], $result;
1265             }
1266             }
1267              
1268 1019         1416 my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269              
1270 1019 100       2711 $debug and print "#$indent|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
  55         100  
  55         91  
1271              
1272             # We now perform tail reduction on each of the nodes in the reduce
1273             # hash. If we have only one key, we know we will have a successful
1274             # reduction (since everything that was inserted into the node based
1275             # on the value of the last token of each path all mapped to the same
1276             # value).
1277              
1278 1019 100 100     5796 if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
      100        
1279             # every path shares a common path
1280 506         825 my $path = (values %$reduce)[0];
1281 506         906 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282 506 100       1373 $debug and print "#$indent|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
  50         105  
1283 506         2350 return( $common, $tail );
1284             }
1285              
1286             # this node resulted in a list of paths, game over
1287 513         1027 $ctx->{indent} = $indent;
1288 513         970 return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289             }
1290              
1291             sub _reduce_fail {
1292 513     513   628 my( $reduce, $fail, $optional, $ctx ) = @_;
1293 513         555 my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
  513         935  
1294 513         533 my %result;
1295 513 100       892 $result{''} = undef if $optional;
1296 513         472 my $p;
1297 513         1024 for $p (keys %$reduce) {
1298 1181         1157 my $path = $reduce->{$p};
1299 1181 100       1649 if( scalar @$path == 1 ) {
1300 1114         1137 $path = $path->[0];
1301 1114 100       1633 $debug and print "#$indent| -simple opt=$optional unrev @{[_dump($path)]}\n";
  7         12  
1302 1114         1383 $path = _unrev_path($path, _descend($ctx) );
1303 1114         2347 $result{_node_key($path->[0])} = $path;
1304             }
1305             else {
1306 67 100       283 $debug and print "#$indent| _do_reduce(@{[_dump($path)]})\n";
  1         3  
1307 67         143 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308             $path = [
1309             (
1310             ref($tail) eq 'HASH'
1311             ? _unrev_node($tail, _descend($ctx) )
1312             : _unrev_path($tail, _descend($ctx) )
1313             ),
1314 67 100       330 @{_unrev_path($common, _descend($ctx) )}
  67         131  
1315             ];
1316 67 100       266 $debug and print "#$indent| +reduced @{[_dump($path)]}\n";
  1         3  
1317 67         286 $result{_node_key($path->[0])} = $path;
1318             }
1319             }
1320 513         542 my $f;
1321 513         786 for $f( @$fail ) {
1322 219 100       369 $debug and print "#$indent| +fail @{[_dump($f)]}\n";
  3         6  
1323 219         383 $result{$f->[0]} = $f;
1324             }
1325 513 100       846 $debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
  5         11  
1326 513         2252 return ( [], \%result );
1327             }
1328              
1329             sub _scan_node {
1330 1019     1019   1144 my( $node, $ctx ) = @_;
1331 1019         1590 my $indent = ' ' x $ctx->{depth};
1332 1019         1142 my $debug = $ctx->{debug};
1333              
1334             # For all the paths in the node, reverse them. If the first token
1335             # of the path is a scalar, push it onto an array in a hash keyed by
1336             # the value of the scalar.
1337             #
1338             # If it is a node, call _reduce_node on this node beforehand. If we
1339             # get back a common head, all of the paths in the subnode shared a
1340             # common tail. We then store the common part and the remaining node
1341             # of paths (which is where the paths diverged from the end and install
1342             # this into the same hash. At this point both the common and the tail
1343             # are in reverse order, just as simple scalar paths are.
1344             #
1345             # On the other hand, if there were no common path returned then all
1346             # the paths of the sub-node diverge at the end character. In this
1347             # case the tail cannot participate in any further reductions and will
1348             # appear in forward order.
1349             #
1350             # certainly the hurgliest function in the whole file :(
1351              
1352             # $debug = 1 if $depth >= 8;
1353 1019         908 my @fail;
1354             my %reduce;
1355              
1356 0         0 my $n;
1357 1019         2186 for $n(
1358 2743         5801 map { substr($_, index($_, '#')+1) }
1359             sort
1360             map {
1361             join( '|' =>
1362 9029         13768 scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
  2743         4056  
1363             _node_offset($node->{$_}),
1364 2743         2315 scalar @{$node->{$_}},
  2743         10173  
1365             )
1366             . "#$_"
1367             }
1368             keys %$node ) {
1369 2743         2164 my( $end, @path ) = reverse @{$node->{$n}};
  2743         6082  
1370 2743 100       4069 if( ref($end) ne 'HASH' ) {
1371 2137 100       3262 $debug and print "# $indent|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
  87         177  
1372 2137         1658 push @{$reduce{$end}}, [ $end, @path ];
  2137         7966  
1373             }
1374             else {
1375 606 100       1208 $debug and print "# $indent|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376 606         627 my $new_path;
1377             # deal with sing, singing => s(?:ing)?ing
1378 606 100 66     2278 if( keys %$end == 2 and exists $end->{''} ) {
1379 94         180 my ($key, $opt_path) = each %$end;
1380 94 100       248 ($key, $opt_path) = each %$end if $key eq '';
1381 94         94 $opt_path = [reverse @{$opt_path}];
  94         229  
1382 94 100       196 $debug and print "# $indent| check=", _dump($opt_path), "\n";
1383 94         328 my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384 94         169 my $head = [];
1385 94         187 my $path = [@path];
1386 94         207 ($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387 94 100       331 if( @$head ) {
1388 40         184 $new_path = [ @$head, $slide, @$path ];
1389             }
1390             }
1391 606 100       874 if( $new_path ) {
1392 40 100       84 $debug and print "# $indent|_scan_node slid=", _dump($new_path), "\n";
1393 40         42 push @{$reduce{$new_path->[0]}}, $new_path;
  40         173  
1394             }
1395             else {
1396 566         1003 my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397 566 100       1634 if( not @$common ) {
1398 219 100       586 $debug and print "# $indent| +failed $n\n";
1399 219         839 push @fail, [reverse(@path), $tail];
1400             }
1401             else {
1402 347         782 my $path = [@path];
1403 347 100       705 $debug and print "# $indent|_scan_node ++recovered common=@{[_dump($common)]} tail=",
  34         67  
1404 34         46 _dump($tail), " path=@{[_dump($path)]}\n";
1405 347 100 100     1864 if( ref($tail) eq 'HASH'
1406             and keys %$tail == 2
1407             ) {
1408 287 100       632 if( exists $tail->{''} ) {
1409 121         280 ($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410             }
1411             }
1412 347 100       432 push @{$reduce{$common->[0]}}, [
  347         2496  
1413             @$common,
1414             (ref($tail) eq 'HASH' ? $tail : @$tail ),
1415             @$path
1416             ];
1417             }
1418             }
1419             }
1420             }
1421 1019 100       2588 $debug and print
1422 55         179 "# $indent|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
  55         2347  
1423 1019         2340 return( \@fail, \%reduce );
1424             }
1425              
1426             sub _do_reduce {
1427 573     573   679 my ($path, $ctx) = @_;
1428 573         954 my $indent = ' ' x $ctx->{depth};
1429 573         652 my $debug = $ctx->{debug};
1430 573         1495 my $ra = Regexp::Assemble->new(chomp=>0);
1431 573         1279 $ra->debug($debug);
1432 573 100       1026 $debug and print "# $indent| do @{[_dump($path)]}\n";
  51         80  
1433 573         1778 $ra->_insertr( $_ ) for
1434             # When nodes come into the picture, we have to be careful
1435             # about how we insert the paths into the assembly.
1436             # Paths with nodes first, then closest node to front
1437             # then shortest path. Merely because if we can control
1438             # order in which paths containing nodes get inserted,
1439             # then we can make a couple of assumptions that simplify
1440             # the code in _insert_node.
1441             sort {
1442 6000         7831 scalar(grep {ref($_) eq 'HASH'} @$a)
1443 1113 50 100     1441 <=> scalar(grep {ref($_) eq 'HASH'} @$b)
  6578         9782  
1444             ||
1445             _node_offset($b) <=> _node_offset($a)
1446             ||
1447             scalar @$a <=> scalar @$b
1448             }
1449             @$path
1450             ;
1451 573         1188 $path = $ra->_path;
1452 573         712 my $common = [];
1453 573         3553 push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454 573 100       1173 my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455 573 100       980 $debug and print "# $indent| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
  51         89  
  51         68  
1456 573         3129 return ($common, $tail);
1457             }
1458              
1459             sub _node_offset {
1460             # return the offset that the first node is found, or -ve
1461             # optimised for speed
1462 4422     4422   3172 my $nr = @{$_[0]};
  4422         5067  
1463 4422         3803 my $atom = -1;
1464 4422   100     33844 ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465 3458         6136 return -1;
1466             }
1467              
1468             sub _slide_tail {
1469 240     240   17552 my $head = shift;
1470 240         242 my $tail = shift;
1471 240         224 my $path = shift;
1472 240         245 my $ctx = shift;
1473 240         434 my $indent = ' ' x $ctx->{depth};
1474 240         279 my $debug = $ctx->{debug};
1475 240 100       444 $debug and print "# $indent| slide in h=", _dump($head),
1476             ' t=', _dump($tail), ' p=', _dump($path), "\n";
1477 240         486 my $slide_path = (each %$tail)[-1];
1478 240 100       585 $slide_path = (each %$tail)[-1] unless defined $slide_path;
1479 240 100       461 $debug and print "# $indent| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480 240   100     1147 while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481 154 100       478 $debug and print "# $indent| slide=tail=$slide_path->[0]\n";
1482 154         195 my $slide = shift @$path;
1483 154         165 shift @$slide_path;
1484 154         241 push @$slide_path, $slide;
1485 154         570 push @$head, $slide;
1486             }
1487 240 100       520 $debug and print "# $indent| slide path ", _dump($slide_path), "\n";
1488 240         501 my $slide_node = {
1489             '' => undef,
1490             _node_key($slide_path->[0]) => $slide_path,
1491             };
1492 240 100       571 $debug and print "# $indent| slide out h=", _dump($head),
1493             ' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494 240         702 return ($head, $slide_node, $path);
1495             }
1496              
1497             sub _unrev_path {
1498 2690     2690   2747 my ($path, $ctx) = @_;
1499 2690         3576 my $indent = ' ' x $ctx->{depth};
1500 2690         2415 my $debug = $ctx->{debug};
1501 2690         2003 my $new;
1502 2690 100       2869 if( not grep { ref($_) } @$path ) {
  6383         10476  
1503 2196 100       3404 $debug and print "# ${indent}_unrev path fast ", _dump($path);
1504 2196         3934 $new = [reverse @$path];
1505 2196 100       3520 $debug and print "# -> ", _dump($new), "\n";
1506 2196         3500 return $new;
1507             }
1508 494 100       860 $debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509 494         1147 while( defined( my $p = pop @$path )) {
1510 1483 100       4520 push @$new,
    100          
1511             ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512             : ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513             : $p
1514             ;
1515             }
1516 494 100       907 $debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517 494         755 return $new;
1518             }
1519              
1520             sub _unrev_node {
1521 630     630   727 my ($node, $ctx ) = @_;
1522 630         1011 my $indent = ' ' x $ctx->{depth};
1523 630         591 my $debug = $ctx->{debug};
1524 630         866 my $optional = _remove_optional($node);
1525 630 100       1147 $debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526 630         643 my $new;
1527 630 100       1103 $new->{''} = undef if $optional;
1528 630         464 my $n;
1529 630         1455 for $n( keys %$node ) {
1530 1167         1780 my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531 1167         2645 $new->{_node_key($path->[0])} = $path;
1532             }
1533 630 100       1142 $debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534 630         2628 return $new;
1535             }
1536              
1537             sub _node_key {
1538 4657     4657   5056 my $node = shift;
1539 4657 100       7581 return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540 4646 100       15824 return $node unless ref($node) eq 'HASH';
1541 205         243 my $key = '';
1542 205         213 my $k;
1543 205         490 for $k( keys %$node ) {
1544 452 100       749 next if $k eq '';
1545 374 100 100     1311 $key = $k if $key eq '' or $key gt $k;
1546             }
1547 205         633 return $key;
1548             }
1549              
1550             sub _descend {
1551             # Take a context object, and increase the depth by one.
1552             # By creating a fresh hash each time, we don't have to
1553             # bother adding make-work code to decrease the depth
1554             # when we return from what we called.
1555 6190     6190   5261 my $ctx = shift;
1556 6190         24211 return {%$ctx, depth => $ctx->{depth}+1};
1557             }
1558              
1559             #####################################################################
1560              
1561             sub _make_class {
1562 650     650   708 my $self = shift;
1563 650         1053 my %set = map { ($_,1) } @_;
  1675         3230  
1564 650 100       1481 delete $set{'\\d'} if exists $set{'\\w'};
1565 650 100       1185 delete $set{'\\D'} if exists $set{'\\W'};
1566             return '.' if exists $set{'.'}
1567             or ($self->{fold_meta_pairs} and (
1568             (exists $set{'\\d'} and exists $set{'\\D'})
1569             or (exists $set{'\\s'} and exists $set{'\\S'})
1570 650 100 66     5172 or (exists $set{'\\w'} and exists $set{'\\W'})
      66        
      66        
1571             ))
1572             ;
1573 632         1001 for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574 3792 100       6044 if( exists $set{$meta} ) {
1575 28         180 my $re = qr/$meta/;
1576 28         39 my @delete;
1577 28   66     379 $_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578 28 100       90 delete @set{@delete} if @delete;
1579             }
1580             }
1581 632 100       1352 return (keys %set)[0] if keys %set == 1;
1582 625         880 for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583 6875 100       11239 exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584             }
1585 625 100       1211 my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
  20         47  
1586 625 100       975 my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
  7         16  
1587 625         2184 my $class = join( '' => sort keys %set );
1588 625 100 100     1938 $class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589 622         4706 return "[$dash$class$caret]";
1590             }
1591              
1592             sub _re_sort {
1593 999   100 999   5178 return length $b <=> length $a || $a cmp $b
1594             }
1595              
1596             sub _combine {
1597 140     140   155 my $self = shift;
1598 140         134 my $type = shift;
1599             # print "c in = @{[_dump(\@_)]}\n";
1600             # my $combine =
1601             return '('
1602             . $type
1603 140         177 . do {
1604 140         113 my( @short, @long );
1605 140 100       249 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  377         2240  
1606 140 100       325 if( @short == 1 ) {
    100          
1607 31         126 @long = sort _re_sort @long, @short;
1608             }
1609             elsif( @short > 1 ) {
1610             # yucky but true
1611 77         151 my @combine = (_make_class($self, @short), sort _re_sort @long);
1612 77         211 @long = @combine;
1613             }
1614             else {
1615 32         103 @long = sort _re_sort @long;
1616             }
1617 140         473 join( '|', @long );
1618             }
1619             . ')';
1620             # print "combine <$combine>\n";
1621             # $combine;
1622             }
1623              
1624             sub _combine_new {
1625 1738     1738   1746 my $self = shift;
1626 1738         1560 my( @short, @long );
1627 1738 100       2705 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  3244         20283  
1628 1738 100 100     8016 if( @short == 1 and @long == 0 ) {
    100 100        
1629 365         2017 return $short[0];
1630             }
1631             elsif( @short > 1 and @short == @_ ) {
1632 494         904 return _make_class($self, @short);
1633             }
1634             else {
1635 879 100       13737 return '(?:'
1636             . join( '|' =>
1637             @short > 1
1638             ? ( _make_class($self, @short), sort _re_sort @long)
1639             : ( (sort _re_sort( @long )), @short )
1640             )
1641             . ')';
1642             }
1643             }
1644              
1645             sub _re_path {
1646 4772     4772   4334 my $self = shift;
1647             # in shorter assemblies, _re_path() is the second hottest
1648             # routine. after insert(), so make it fast.
1649              
1650 4772 100       7927 if ($self->{unroll_plus}) {
1651             # but we can't easily make this blockless
1652 72         81 my @arr = @{$_[0]};
  72         126  
1653 72         67 my $str = '';
1654 72         58 my $skip = 0;
1655 72         125 for my $i (0..$#arr) {
1656 127 100 100     1188 if (ref($arr[$i]) eq 'ARRAY') {
    100          
    100          
    100          
1657 1         3 $str .= _re_path($self, $arr[$i]);
1658             }
1659             elsif (ref($arr[$i]) eq 'HASH') {
1660             $str .= exists $arr[$i]->{''}
1661             ? _combine_new( $self,
1662 7         19 map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
  14         23  
  7         21  
1663             ) . '?'
1664 28 100       48 : _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
  42         73  
  21         50  
1665             ;
1666             }
1667             elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668 7 50       29 $str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669 7         13 ++$skip;
1670             }
1671             elsif ($skip) {
1672 7         13 $skip = 0;
1673             }
1674             else {
1675 84         137 $str .= $arr[$i];
1676             }
1677             }
1678 72         200 return $str;
1679             }
1680              
1681 4700 50       5156 return join( '', @_ ) unless grep { length ref $_ } @_;
  4700         10016  
1682 4700         3412 my $p;
1683             return join '', map {
1684             ref($_) eq '' ? $_
1685 9506 100       24009 : ref($_) eq 'HASH' ? do {
    100          
1686             # In the case of a node, see whether there's a '' which
1687             # indicates that the whole thing is optional and thus
1688             # requires a trailing ?
1689             # Unroll the two different paths to avoid the needless
1690             # grep when it isn't necessary.
1691 1710         1479 $p = $_;
1692             exists $_->{''}
1693             ? _combine_new( $self,
1694 801         1613 map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
  1508         2527  
1695             ) . '?'
1696 1710 100       5146 : _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
  2394         3791  
1697             }
1698             : _re_path($self, $_) # ref($_) eq 'ARRAY'
1699 4700         3629 } @{$_[0]}
  4700         6443  
1700             }
1701              
1702             sub _lookahead {
1703 132     132   141 my $in = shift;
1704 132         129 my %head;
1705             my $path;
1706 132         244 for $path( keys %$in ) {
1707 328 100       577 next unless defined $in->{$path};
1708             # print "look $path: ", ref($in->{$path}[0]), ".\n";
1709 267 100       595 if( ref($in->{$path}[0]) eq 'HASH' ) {
    100          
1710 15         20 my $next = 0;
1711 15   100     54 while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
  16         70  
1712 11 100       30 if( exists $in->{$path}[$next]{''} ) {
1713 5         11 ++$head{$in->{$path}[$next+1]};
1714             }
1715 11         31 ++$next;
1716             }
1717 15         48 my $inner = _lookahead( $in->{$path}[0] );
1718 15         70 @head{ keys %$inner } = (values %$inner);
1719             }
1720             elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721 2         5 my $subpath = $in->{$path}[0];
1722 2         10 for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723 3 100       11 if( ref($subpath->[$sp]) eq 'HASH' ) {
1724 2         8 my $follow = _lookahead( $subpath->[$sp] );
1725 2         9 @head{ keys %$follow } = (values %$follow);
1726 2 100       13 last unless exists $subpath->[$sp]{''};
1727             }
1728             else {
1729 1         3 ++$head{$subpath->[$sp]};
1730 1         2 last;
1731             }
1732             }
1733             }
1734             else {
1735 250         488 ++$head{ $in->{$path}[0] };
1736             }
1737             }
1738             # print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739 132         277 return \%head;
1740             }
1741              
1742             sub _re_path_lookahead {
1743 265     265   250 my $self = shift;
1744 265         226 my $in = shift;
1745             # print "_re_path_la in ", _dump($in), "\n";
1746 265         241 my $out = '';
1747 265         526 for( my $p = 0; $p < @$in; ++$p ) {
1748 573 100       979 if( ref($in->[$p]) eq '' ) {
    100          
1749 462         460 $out .= $in->[$p];
1750 462         794 next;
1751             }
1752             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753 2         9 $out .= _re_path_lookahead($self, $in->[$p]);
1754 2         8 next;
1755             }
1756             # print "$p ", _dump($in->[$p]), "\n";
1757             my $path = [
1758 228         455 map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759 275         445 grep { $_ ne '' }
1760 109         102 keys %{$in->[$p]}
  109         294  
1761             ];
1762 109         287 my $ahead = _lookahead($in->[$p]);
1763 109         115 my $more = 0;
1764 109 100 100     393 if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765 12         14 my $next = 1;
1766 12         38 while( $p + $next < @$in ) {
1767 14 100       36 if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768 2         5 my $follow = _lookahead( $in->[$p+$next] );
1769 2         8 @{$ahead}{ keys %$follow } = (values %$follow);
  2         7  
1770             }
1771             else {
1772 12         28 ++$ahead->{$in->[$p+$next]};
1773 12         15 last;
1774             }
1775 2         6 ++$next;
1776             }
1777 12         17 $more = 1;
1778             }
1779 109         146 my $nr_one = grep { /^$Single_Char$/ } @$path;
  228         1500  
1780 109         129 my $nr = @$path;
1781 109 100 100     340 if( $nr_one > 1 and $nr_one == $nr ) {
1782 18         53 $out .= _make_class($self, @$path);
1783 18 100       110 $out .= '?' if exists $in->[$p]{''};
1784             }
1785             else {
1786             my $zwla = keys(%$ahead) > 1
1787 91 100       281 ? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
  191         190  
  191         296  
1788             : '';
1789 91 100       255 my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790             # print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791 91 100       216 if( exists $in->[$p]{''} ) {
1792 44 100       282 $out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793             }
1794             else {
1795 47         228 $out .= "$zwla$patt";
1796             }
1797             }
1798             }
1799 265         714 return $out;
1800             }
1801              
1802             sub _re_path_track {
1803 33     33   48 my $self = shift;
1804 33         35 my $in = shift;
1805 33         36 my $normal = shift;
1806 33         49 my $augmented = shift;
1807 33         36 my $o;
1808 33         41 my $simple = '';
1809 33         41 my $augment = '';
1810 33         90 for( my $n = 0; $n < @$in; ++$n ) {
1811 114 100       188 if( ref($in->[$n]) eq '' ) {
1812 104         118 $o = $in->[$n];
1813 104         99 $simple .= $o;
1814 104         108 $augment .= $o;
1815 104 100 100     896 if( (
      66        
      100        
1816             $n < @$in - 1
1817             and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818             )
1819             or $n == @$in - 1
1820             ) {
1821 24         24 push @{$self->{mlist}}, $normal . $simple ;
  24         97  
1822 24 50       93 $augment .= $] < 5.009005
1823             ? "(?{\$self->{m}=$self->{mcount}})"
1824             : "(?{$self->{mcount}})"
1825             ;
1826 24         83 ++$self->{mcount};
1827             }
1828             }
1829             else {
1830             my $path = [
1831 25         111 map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832 26         58 grep { $_ ne '' }
1833 10         14 keys %{$in->[$n]}
  10         43  
1834             ];
1835 10         80 $o = '(?:' . join( '|' => sort _re_sort @$path ) . ')';
1836 10 100       31 $o .= '?' if exists $in->[$n]{''};
1837 10         20 $simple .= $o;
1838 10         42 $augment .= $o;
1839             }
1840             }
1841 33         116 return $augment;
1842             }
1843              
1844             sub _re_path_pretty {
1845 411     411   377 my $self = shift;
1846 411         362 my $in = shift;
1847 411         331 my $arg = shift;
1848 411         759 my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849 411         536 my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850 411         389 my $out = '';
1851 411         355 $arg->{depth}++;
1852 411         342 my $prev_was_paren = 0;
1853 411         774 for( my $p = 0; $p < @$in; ++$p ) {
1854 1084 100       1705 if( ref($in->[$p]) eq '' ) {
    100          
1855 910 100       1303 $out .= "\n$pre" if $prev_was_paren;
1856 910         824 $out .= $in->[$p];
1857 910         1554 $prev_was_paren = 0;
1858             }
1859             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860 3         13 $out .= _re_path($self, $in->[$p]);
1861             }
1862             else {
1863             my $path = [
1864 369         683 map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865 419         629 grep { $_ ne '' }
1866 171         145 keys %{$in->[$p]}
  171         451  
1867             ];
1868 171         334 my $nr = @$path;
1869 171         148 my( @short, @long );
1870 171 100       271 push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
  369         2421  
1871 171 100       278 if( @short == $nr ) {
1872 37 100       123 $out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873 37 100       205 $out .= '?' if exists $in->[$p]{''};
1874             }
1875             else {
1876 134 100       291 $out .= "\n" if length $out;
1877 134 100       221 $out .= $pre if $p;
1878 134         206 $out .= "(?:\n$indent";
1879 134 100       209 if( @short < 2 ) {
1880 133         138 my $r = 0;
1881             $out .= join( "\n$indent|" => map {
1882 133 100       546 $r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
  298         547  
1883 298         574 $_
1884             }
1885             sort _re_sort @$path
1886             );
1887             }
1888             else {
1889 1         7 $out .= join( "\n$indent|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890             }
1891 134         231 $out .= "\n$pre)";
1892 134 100       241 if( exists $in->[$p]{''} ) {
1893 37         68 $out .= "\n$pre?";
1894 37         133 $prev_was_paren = 0;
1895             }
1896             else {
1897 97         314 $prev_was_paren = 1;
1898             }
1899             }
1900             }
1901             }
1902 411         371 $arg->{depth}--;
1903 411         1009 return $out;
1904             }
1905              
1906             sub _node_eq {
1907 425 100 66 425   1534 return 0 if not defined $_[0] or not defined $_[1];
1908 422 100       974 return 0 if ref $_[0] ne ref $_[1];
1909             # Now that we have determined that the reference of each
1910             # argument are the same, we only have to test the first
1911             # one, which gives us a nice micro-optimisation.
1912 381 100       741 if( ref($_[0]) eq 'HASH' ) {
    100          
1913 305         634 keys %{$_[0]} == keys %{$_[1]}
  305         878  
1914             and
1915             # does this short-circuit to avoid _re_path() cost more than it saves?
1916 305 100 100     271 join( '|' => sort keys %{$_[0]}) eq join( '|' => sort keys %{$_[1]})
  272         957  
  272         1669  
1917             and
1918             _re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919             }
1920             elsif( ref($_[0]) eq 'ARRAY' ) {
1921 9 100       8 scalar @{$_[0]} == scalar @{$_[1]}
  9         15  
  9         31  
1922             and
1923             _re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924             }
1925             else {
1926 67         232 $_[0] eq $_[1];
1927             }
1928             }
1929              
1930             sub _pretty_dump {
1931 7     7   32 return sprintf "\\x%02x", ord(shift);
1932             }
1933              
1934             sub _dump {
1935 5579     5579   5210 my $path = shift;
1936 5579 100       9288 return _dump_node($path) if ref($path) eq 'HASH';
1937 5057         3824 my $dump = '[';
1938 5057         3311 my $d;
1939 5057         3539 my $nr = 0;
1940 5057         5040 for $d( @$path ) {
1941 11036 100       15461 $dump .= ' ' if $nr++;
1942 11036 100       18870 if( ref($d) eq 'HASH' ) {
    100          
    100          
1943 1340         1584 $dump .= _dump_node($d);
1944             }
1945             elsif( ref($d) eq 'ARRAY' ) {
1946 242         297 $dump .= _dump($d);
1947             }
1948             elsif( defined $d ) {
1949             # D::C indicates the second test is redundant
1950             # $dump .= ( $d =~ /\s/ or not length $d )
1951 9453 100       22412 $dump .= (
    100          
1952             $d =~ /\s/ ? qq{'$d'} :
1953             $d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954             $d
1955             );
1956             }
1957             else {
1958 1         3 $dump .= '*';
1959             }
1960             }
1961 5057         55197 return $dump . ']';
1962             }
1963              
1964             sub _dump_node {
1965 1862     1862   1470 my $node = shift;
1966 1862         1417 my $dump = '{';
1967 1862         1316 my $nr = 0;
1968 1862         1349 my $n;
1969 1862         4209 for $n (sort keys %$node) {
1970 3899 100       5573 $dump .= ' ' if $nr++;
1971             # Devel::Cover shows this to test to be redundant
1972             # $dump .= ( $n eq '' and not defined $node->{$n} )
1973             $dump .= $n eq ''
1974             ? '*'
1975             : ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976 3899 100       9647 . "=>" . _dump($node->{$n})
    100          
1977             ;
1978             }
1979 1862         19697 return $dump . '}';
1980             }
1981              
1982             =pod
1983              
1984             =head1 NAME
1985              
1986             Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987              
1988             =head1 SYNOPSIS
1989              
1990             use Regexp::Assemble;
1991              
1992             my $ra = Regexp::Assemble->new;
1993             $ra->add( 'ab+c' );
1994             $ra->add( 'ab+-' );
1995             $ra->add( 'a\w\d+' );
1996             $ra->add( 'a\d+' );
1997             print $ra->re; # prints a(?:\w?\d+|b+[-c])
1998              
1999             =head1 DESCRIPTION
2000              
2001             Regexp::Assemble takes an arbitrary number of regular expressions
2002             and assembles them into a single regular expression (or RE) that
2003             matches all that the individual REs match.
2004              
2005             As a result, instead of having a large list of expressions to loop
2006             over, a target string only needs to be tested against one expression.
2007             This is interesting when you have several thousand patterns to deal
2008             with. Serious effort is made to produce the smallest pattern possible.
2009              
2010             It is also possible to track the original patterns, so that you can
2011             determine which, among the source patterns that form the assembled
2012             pattern, was the one that caused the match to occur.
2013              
2014             You should realise that large numbers of alternations are processed
2015             in perl's regular expression engine in O(n) time, not O(1). If you
2016             are still having performance problems, you should look at using a
2017             trie. Note that Perl's own regular expression engine will implement
2018             trie optimisations in perl 5.10 (they are already available in
2019             perl 5.9.3 if you want to try them out). C will
2020             do the right thing when it knows it's running on a trie'd perl.
2021             (At least in some version after this one).
2022              
2023             Some more examples of usage appear in the accompanying README. If
2024             that file is not easy to access locally, you can find it on a web
2025             repository such as
2026             L or
2027             L.
2028              
2029             See also L.
2030              
2031             =head1 Methods
2032              
2033             =head2 add(LIST)
2034              
2035             Takes a string, breaks it apart into a set of tokens (respecting
2036             meta characters) and inserts the resulting list into the C
2037             object. It uses a naive regular expression to lex the string
2038             that may be fooled complex expressions (specifically, it will
2039             fail to lex nested parenthetical expressions such as
2040             C correctly). If this is the case, the end of
2041             the string will not be tokenised correctly and returned as one
2042             long string.
2043              
2044             On the one hand, this may indicate that the patterns you are
2045             trying to feed the C object are too complex. Simpler
2046             patterns might allow the algorithm to work more effectively and
2047             perform more reductions in the resulting pattern.
2048              
2049             On the other hand, you can supply your own pattern to perform the
2050             lexing if you need. The test suite contains an example of a lexer
2051             pattern that will match one level of nested parentheses.
2052              
2053             Note that there is an internal optimisation that will bypass a
2054             much of the lexing process. If a string contains no C<\>
2055             (backslash), C<[> (open square bracket), C<(> (open paren),
2056             C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057             curly), a character split will be performed directly.
2058              
2059             A list of strings may be supplied, thus you can pass it a file
2060             handle of a file opened for reading:
2061              
2062             $re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063             $re->add( );
2064              
2065             If the file is very large, it may be more efficient to use a
2066             C loop, to read the file line-by-line:
2067              
2068             $re->add($_) while ;
2069              
2070             The C method will chomp the lines automatically. If you
2071             do not want this to occur (you want to keep the record
2072             separator), then disable Cing.
2073              
2074             $re->chomp(0);
2075             $re->add($_) while ;
2076              
2077             This method is chainable.
2078              
2079             =head2 add_file(FILENAME [...])
2080              
2081             Takes a list of file names. Each file is opened and read
2082             line by line. Each line is added to the assembly.
2083              
2084             $r->add_file( 'file.1', 'file.2' );
2085              
2086             If a file cannot be opened, the method will croak. If you cannot
2087             afford to let this happen then you should wrap the call in a C
2088             block.
2089              
2090             Chomping happens automatically unless you the C method
2091             to disable it. By default, input lines are read according to the
2092             value of the C attribute (if defined), and
2093             will otherwise fall back to the current setting of the system C<$/>
2094             variable. The record separator may also be specified on each
2095             call to C. Internally, the routine Cises the
2096             value of C<$/> to whatever is required, for the duration of the
2097             call.
2098              
2099             An alternate calling mechanism using a hash reference is
2100             available. The recognised keys are:
2101              
2102             =over 4
2103              
2104             =item file
2105              
2106             Reference to a list of file names, or the name of a single
2107             file.
2108              
2109             $r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110             $r->add_file({file => 'file.n'});
2111              
2112             =item input_record_separator
2113              
2114             If present, indicates what constitutes a line
2115              
2116             $r->add_file({file => 'data.txt', input_record_separator => ':' });
2117              
2118             =item rs
2119              
2120             An alias for input_record_separator (mnemonic: same as the
2121             English variable names).
2122              
2123             =back
2124              
2125             $r->add_file( {
2126             file => [ 'pattern.txt', 'more.txt' ],
2127             input_record_separator => "\r\n",
2128             });
2129              
2130             =head2 clone()
2131              
2132             Clones the contents of a Regexp::Assemble object and creates a new
2133             object (in other words it performs a deep copy).
2134              
2135             If the Storable module is installed, its dclone method will be used,
2136             otherwise the cloning will be performed using a pure perl approach.
2137              
2138             You can use this method to take a snapshot of the patterns that have
2139             been added so far to an object, and generate an assembly from the
2140             clone. Additional patterns may to be added to the original object
2141             afterwards.
2142              
2143             my $re = $main->clone->re();
2144             $main->add( 'another-pattern-\\d+' );
2145              
2146             =head2 insert(LIST)
2147              
2148             Takes a list of tokens representing a regular expression and
2149             stores them in the object. Note: you should not pass it a bare
2150             regular expression, such as C. You must pass it as
2151             a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152              
2153             This method is chainable, I:
2154              
2155             my $ra = Regexp::Assemble->new
2156             ->insert( qw[ a b+ c? d* e ] )
2157             ->insert( qw[ a c+ d+ e* f ] );
2158              
2159             Lexing complex patterns with metacharacters and so on can consume
2160             a significant proportion of the overall time to build an assembly.
2161             If you have the information available in a tokenised form, calling
2162             C directly can be a big win.
2163              
2164             =head2 lexstr
2165              
2166             Use the C method if you are curious to see how a pattern
2167             gets tokenised. It takes a scalar on input, representing a pattern,
2168             and returns a reference to an array, containing the tokenised
2169             pattern. You can recover the original pattern by performing a
2170             C:
2171              
2172             my @token = $re->lexstr($pattern);
2173             my $new_pattern = join( '', @token );
2174              
2175             If the original pattern contains unnecessary backslashes, or C<\x4b>
2176             escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177             may not be identical.
2178              
2179             Call C does not add the pattern to the object, it is merely
2180             for exploratory purposes. It will, however, update various statistical
2181             counters.
2182              
2183             =head2 pre_filter(CODE)
2184              
2185             Allows you to install a callback to check that the pattern being
2186             loaded contains valid input. It receives the pattern as a whole to
2187             be added, before it been tokenised by the lexer. It may to return
2188             0 or C to indicate that the pattern should not be added, any
2189             true value indicates that the contents are fine.
2190              
2191             A filter to strip out trailing comments (marked by #):
2192              
2193             $re->pre_filter( sub { $_[0] =~ s/\s*#.*$//; 1 } );
2194              
2195             A filter to ignore blank lines:
2196              
2197             $re->pre_filter( sub { length(shift) } );
2198              
2199             If you want to remove the filter, pass C as a parameter.
2200              
2201             $ra->pre_filter(undef);
2202              
2203             This method is chainable.
2204              
2205             =head2 filter(CODE)
2206              
2207             Allows you to install a callback to check that the pattern being
2208             loaded contains valid input. It receives a list on input, after it
2209             has been tokenised by the lexer. It may to return 0 or undef to
2210             indicate that the pattern should not be added, any true value
2211             indicates that the contents are fine.
2212              
2213             If you know that all patterns you expect to assemble contain
2214             a restricted set of of tokens (e.g. no spaces), you could do
2215             the following:
2216              
2217             $ra->filter(sub { not grep { / / } @_ });
2218              
2219             or
2220              
2221             sub only_spaces_and_digits {
2222             not grep { ![\d ] } @_
2223             }
2224             $ra->filter( \&only_spaces_and_digits );
2225              
2226             These two examples will silently ignore faulty patterns, If you
2227             want the user to be made aware of the problem you should raise an
2228             error (via C or C), log an error message, whatever is
2229             best. If you want to remove a filter, pass C as a parameter.
2230              
2231             $ra->filter(undef);
2232              
2233             This method is chainable.
2234              
2235             =head2 as_string
2236              
2237             Assemble the expression and return it as a string. You may want to do
2238             this if you are writing the pattern to a file. The following arguments
2239             can be passed to control the aspect of the resulting pattern:
2240              
2241             B, the number of spaces used to indent nested grouping of
2242             a pattern. Use this to produce a pretty-printed pattern (for some
2243             definition of "pretty"). The resulting output is rather verbose. The
2244             reason is to ensure that the metacharacters C<(?:> and C<)> always
2245             occur on otherwise empty lines. This allows you grep the result for an
2246             even more synthetic view of the pattern:
2247              
2248             egrep -v '^ *[()]'
2249              
2250             The result of the above is quite readable. Remember to backslash the
2251             spaces appearing in your own patterns if you wish to use an indented
2252             pattern in an C construct. Indenting is ignored if tracking
2253             is enabled.
2254              
2255             The B argument takes precedence over the C
2256             method/attribute of the object.
2257              
2258             Calling this
2259             method will drain the internal data structure. Large numbers of patterns
2260             can eat a significant amount of memory, and this lets perl recover the
2261             memory used for other purposes.
2262              
2263             If you want to reduce the pattern I continue to add new patterns,
2264             clone the object and reduce the clone, leaving the original object intact.
2265              
2266             =head2 re
2267              
2268             Assembles the pattern and return it as a compiled RE, using the
2269             C operator.
2270              
2271             As with C, calling this method will reset the internal data
2272             structures to free the memory used in assembling the RE.
2273              
2274             The B attribute, documented in the C method, can be
2275             used here (it will be ignored if tracking is enabled).
2276              
2277             With method chaining, it is possible to produce a RE without having
2278             a temporary C object lying around, I:
2279              
2280             my $re = Regexp::Assemble->new
2281             ->add( q[ab+cd+e] )
2282             ->add( q[ac\\d+e] )
2283             ->add( q[c\\d+e] )
2284             ->re;
2285              
2286             The C<$re> variable now contains a Regexp object that can be used
2287             directly:
2288              
2289             while( <> ) {
2290             /$re/ and print "Something in [$_] matched\n";
2291             )
2292              
2293             The C method is called when the object is used in string context
2294             (hence, within an C operator), so by and large you do not even
2295             need to save the RE in a separate variable. The following will work
2296             as expected:
2297              
2298             my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299             while( ) {
2300             if( /($re)/ ) {
2301             print "Here be giants: $1\n";
2302             }
2303             }
2304              
2305             This approach does not work with tracked patterns. The
2306             C and C methods must be used instead, see below.
2307              
2308             =head2 match(SCALAR)
2309              
2310             The following information applies to Perl 5.8 and below. See
2311             the section that follows for information on Perl 5.10.
2312              
2313             If pattern tracking is in use, you must C in order
2314             to make things work correctly. At a minimum, this will make your
2315             code look like this:
2316              
2317             my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318             if( $did_match ) {
2319             print "matched ", $ra->matched, "\n";
2320             }
2321              
2322             (The main reason is that the C<$^R> variable is currently broken
2323             and an ugly workaround that runs some Perl code during the match
2324             is required, in order to simulate what C<$^R> should be doing. See
2325             Perl bug #32840 for more information if you are curious. The README
2326             also contains more information). This bug has been fixed in 5.10.
2327              
2328             The important thing to note is that with C, THERE
2329             ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330             is this: if you do not have strict control over the patterns being
2331             fed to C when tracking is enabled, and someone
2332             slips you a pattern such as C and you
2333             attempt to match a string against the resulting pattern, you will
2334             know Fear and Loathing.
2335              
2336             What is more, the C<$^R> workaround means that that tracking does
2337             not work if you perform a bare C pattern match as shown
2338             above. You have to instead call the C method, in order to
2339             supply the necessary context to take care of the tracking housekeeping
2340             details.
2341              
2342             if( defined( my $match = $ra->match($_)) ) {
2343             print " $_ matched by $match\n";
2344             }
2345              
2346             In the case of a successful match, the original matched pattern
2347             is returned directly. The matched pattern will also be available
2348             through the C method.
2349              
2350             (Except that the above is not true for 5.6.0: the C method
2351             returns true or undef, and the C method always returns
2352             undef).
2353              
2354             If you are capturing parts of the pattern I C
2355             you will want to get at the captures. See the C, C,
2356             C and C methods. If you are not using captures
2357             then you may safely ignore this section.
2358              
2359             In 5.10, since the bug concerning C<$^R> has been resolved, there
2360             is no need to use C and the assembled pattern does
2361             not require any Perl code to be executed during the match.
2362              
2363             =head2 new()
2364              
2365             Creates a new C object. The following optional
2366             key/value parameters may be employed. All keys have a corresponding
2367             method that can be used to change the behaviour later on. As a
2368             general rule, especially if you're just starting out, you don't
2369             have to bother with any of these.
2370              
2371             B, a family of optional attributes that allow anchors
2372             (C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373              
2374             B, sets the C flags to add to the assembled regular
2375             expression. Warning: no error checking is done, you should ensure
2376             that the flags you pass are understood by the version of Perl you
2377             are using. B exists as an alias, for users familiar
2378             with L.
2379              
2380             B, controls whether the pattern should be chomped before being
2381             lexed. Handy if you are reading patterns from a file. By default,
2382             Cing is performed (this behaviour changed as of version 0.24,
2383             prior versions did not chomp automatically).
2384             See also the C attribute and the C method.
2385              
2386             B, slurp the contents of the specified file and add them
2387             to the assembly. Multiple files may be processed by using a list.
2388              
2389             my $r = Regexp::Assemble->new(file => 're.list');
2390              
2391             my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392              
2393             If you really don't want chomping to occur, you will have to set
2394             the C attribute to 0 (zero). You may also want to look at
2395             the C attribute, as well.
2396              
2397             B, controls what constitutes a record
2398             separator when using the C attribute or the C
2399             method. May be abbreviated to B. See the C<$/> variable in
2400             L.
2401              
2402             B, controls whether the pattern should contain zero-width
2403             lookahead assertions (For instance: (?=[abc])(?:bob|alice|charles).
2404             This is not activated by default, because in many circumstances the
2405             cost of processing the assertion itself outweighs the benefit of
2406             its faculty for short-circuiting a match that will fail. This is
2407             sensitive to the probability of a match succeeding, so if you're
2408             worried about performance you'll have to benchmark a sample population
2409             of targets to see which way the benefits lie.
2410              
2411             B, controls whether you want know which of the initial
2412             patterns was the one that matched. See the C method for
2413             more details. Note for version 5.8 of Perl and below, in this mode
2414             of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415             this entails. Perl 5.10 does not suffer from any such restriction.
2416              
2417             B, the number of spaces used to indent nested grouping of
2418             a pattern. Use this to produce a pretty-printed pattern. See the
2419             C method for a more detailed explanation.
2420              
2421             B, allows you to add a callback to enable sanity checks
2422             on the pattern being loaded. This callback is triggered before the
2423             pattern is split apart by the lexer. In other words, it operates
2424             on the entire pattern. If you are loading patterns from a file,
2425             this would be an appropriate place to remove comments.
2426              
2427             B, allows you to add a callback to enable sanity checks on
2428             the pattern being loaded. This callback is triggered after the
2429             pattern has been split apart by the lexer.
2430              
2431             B, controls whether to unroll, for example, C into
2432             C, C, which may allow additional reductions in the
2433             resulting assembled pattern.
2434              
2435             B, controls whether tail reduction occurs or not. If set,
2436             patterns like C will be reduced to C.
2437             That is, the end of the pattern in each part of the b... and d...
2438             alternations is identical, and hence is hoisted out of the alternation
2439             and placed after it. On by default. Turn it off if you're really
2440             pressed for short assembly times.
2441              
2442             B, specifies the pattern used to lex the input lines into
2443             tokens. You could replace the default pattern by a more sophisticated
2444             version that matches arbitrarily nested parentheses, for example.
2445              
2446             B, controls whether copious amounts of output is produced
2447             during the loading stage or the reducing stage of assembly.
2448              
2449             my $ra = Regexp::Assemble->new;
2450             my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451              
2452             B, controls whether new patterns can be added to the object
2453             after the assembled pattern is generated. DEPRECATED.
2454              
2455             This method/attribute will be removed in a future release. It doesn't
2456             really serve any purpose, and may be more effectively replaced by
2457             cloning an existing C object and spinning out a
2458             pattern from that instead.
2459              
2460             =head2 source()
2461              
2462             When using tracked mode, after a successful match is made, returns
2463             the original source pattern that caused the match. In Perl 5.10,
2464             the C<$^R> variable can be used to as an index to fetch the correct
2465             pattern from the object.
2466              
2467             If no successful match has been performed, or the object is not in
2468             tracked mode, this method returns C.
2469              
2470             my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471              
2472             for my $w (qw(this food is rather barren)) {
2473             if ($w =~ /$r/) {
2474             print "$w matched by ", $r->source($^R), $/;
2475             }
2476             else {
2477             print "$w no match\n";
2478             }
2479             }
2480              
2481             =head2 mbegin()
2482              
2483             This method returns a copy of C<@-> at the moment of the
2484             last match. You should ordinarily not need to bother with
2485             this, C should be able to supply all your needs.
2486              
2487             =head2 mend()
2488              
2489             This method returns a copy of C<@+> at the moment of the
2490             last match.
2491              
2492             =head2 mvar(NUMBER)
2493              
2494             The C method returns the captures of the last match.
2495             C corresponds to $1, C to $2, and so on.
2496             C happens to return the target string matched,
2497             as a byproduct of walking down the C<@-> and C<@+> arrays
2498             after the match.
2499              
2500             If called without a parameter, C will return a
2501             reference to an array containing all captures.
2502              
2503             =head2 capture
2504              
2505             The C method returns the the captures of the last
2506             match as an array. Unlink C, this method does not
2507             include the matched string. It is equivalent to getting an
2508             array back that contains C<$1, $2, $3, ...>.
2509              
2510             If no captures were found in the match, an empty array is
2511             returned, rather than C. You are therefore guaranteed
2512             to be able to use C<< for my $c ($re->capture) { ... >>
2513             without have to check whether anything was captured.
2514              
2515             =head2 matched()
2516              
2517             If pattern tracking has been set, via the C attribute,
2518             or through the C method, this method will return the
2519             original pattern of the last successful match. Returns undef
2520             match has yet been performed, or tracking has not been enabled.
2521              
2522             See below in the NOTES section for additional subtleties of
2523             which you should be aware of when tracking patterns.
2524              
2525             Note that this method is not available in 5.6.0, due to
2526             limitations in the implementation of C<(?{...})> at the time.
2527              
2528             =head2 Statistics/Reporting routines
2529              
2530             =head2 stats_add
2531              
2532             Returns the number of patterns added to the assembly (whether
2533             by C or C). Duplicate patterns are not included
2534             in this total.
2535              
2536             =head2 stats_dup
2537              
2538             Returns the number of duplicate patterns added to the assembly.
2539             If non-zero, this may be a sign that something is wrong with
2540             your data (or at the least, some needless redundancy). This may
2541             occur when you have two patterns (for instance, C and
2542             C) which map to the same result.
2543              
2544             =head2 stats_raw()
2545              
2546             Returns the raw number of bytes in the patterns added to the
2547             assembly. This includes both original and duplicate patterns.
2548             For instance, adding the two patterns C and C will
2549             count as 4 bytes.
2550              
2551             =head2 stats_cooked()
2552              
2553             Return the true number of bytes added to the assembly. This
2554             will not include duplicate patterns. Furthermore, it may differ
2555             from the raw bytes due to quotemeta treatment. For instance,
2556             C will count as 7 (not 8) bytes, because C<\,> will
2557             be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558             after the quotemeta directives are processed, C will be
2559             stored, for a total of 4 bytes.
2560              
2561             =head2 stats_length()
2562              
2563             Returns the length of the resulting assembled expression.
2564             Until C or C have been called, the length
2565             will be 0 (since the assembly will have not yet been
2566             performed). The length includes only the pattern, not the
2567             additional (C<(?-xism...>) fluff added by the compilation.
2568              
2569             =head2 dup_warn(NUMBER|CODEREF)
2570              
2571             Turns warnings about duplicate patterns on or off. By
2572             default, no warnings are emitted. If the method is
2573             called with no parameters, or a true parameter,
2574             the object will carp about patterns it has
2575             already seen. To turn off the warnings, use 0 as a
2576             parameter.
2577              
2578             $r->dup_warn();
2579              
2580             The method may also be passed a code block. In this case
2581             the code will be executed and it will receive a reference
2582             to the object in question, and the lexed pattern.
2583              
2584             $r->dup_warn(
2585             sub {
2586             my $self = shift;
2587             print $self->stats_add, " patterns added at line $.\n",
2588             join( '', @_ ), " added previously\n";
2589             }
2590             )
2591              
2592             =head2 Anchor routines
2593              
2594             Suppose you wish to assemble a series of patterns that all begin
2595             with C<^> and end with C<$> (anchor pattern to the beginning and
2596             end of line). Rather than add the anchors to each and every pattern
2597             (and possibly forget to do so when a new entry is added), you may
2598             specify the anchors in the object, and they will appear in the
2599             resulting pattern, and you no longer need to (or should) put them
2600             in your source patterns. For example, the two following snippets
2601             will produce identical patterns:
2602              
2603             $r->add(qw(^this ^that ^them))->as_string;
2604              
2605             $r->add(qw(this that them))->anchor_line_begin->as_string;
2606              
2607             # both techniques will produce ^th(?:at|em|is)
2608              
2609             All anchors are possible word (C<\b>) boundaries, line
2610             boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611             and C<\Z> (or C<\z> if you absolutely need it)).
2612              
2613             The shortcut C> implies both
2614             C_begin> C_end>
2615             is also available. If different anchors are specified
2616             the most specific anchor wins. For instance, if both
2617             C and C are
2618             specified, C takes precedence.
2619              
2620             All the anchor methods are chainable.
2621              
2622             =head2 anchor_word_begin
2623              
2624             The resulting pattern will be prefixed with a C<\b>
2625             word boundary assertion when the value is true. Set
2626             to 0 to disable.
2627              
2628             $r->add('pre')->anchor_word_begin->as_string;
2629             # produces '\bpre'
2630              
2631             =head2 anchor_word_end
2632              
2633             The resulting pattern will be suffixed with a C<\b>
2634             word boundary assertion when the value is true. Set
2635             to 0 to disable.
2636              
2637             $r->add(qw(ing tion))
2638             ->anchor_word_end
2639             ->as_string; # produces '(?:tion|ing)\b'
2640              
2641             =head2 anchor_word
2642              
2643             The resulting pattern will be have C<\b>
2644             word boundary assertions at the beginning and end
2645             of the pattern when the value is true. Set
2646             to 0 to disable.
2647              
2648             $r->add(qw(cat carrot)
2649             ->anchor_word(1)
2650             ->as_string; # produces '\bca(?:rro)t\b'
2651              
2652             =head2 anchor_line_begin
2653              
2654             The resulting pattern will be prefixed with a C<^>
2655             line boundary assertion when the value is true. Set
2656             to 0 to disable.
2657              
2658             $r->anchor_line_begin;
2659             # or
2660             $r->anchor_line_begin(1);
2661              
2662             =head2 anchor_line_end
2663              
2664             The resulting pattern will be suffixed with a C<$>
2665             line boundary assertion when the value is true. Set
2666             to 0 to disable.
2667              
2668             # turn it off
2669             $r->anchor_line_end(0);
2670              
2671             =head2 anchor_line
2672              
2673             The resulting pattern will be have the C<^> and C<$>
2674             line boundary assertions at the beginning and end
2675             of the pattern, respectively, when the value is true. Set
2676             to 0 to disable.
2677              
2678             $r->add(qw(cat carrot)
2679             ->anchor_line
2680             ->as_string; # produces '^ca(?:rro)t$'
2681              
2682             =head2 anchor_string_begin
2683              
2684             The resulting pattern will be prefixed with a C<\A>
2685             string boundary assertion when the value is true. Set
2686             to 0 to disable.
2687              
2688             $r->anchor_string_begin(1);
2689              
2690             =head2 anchor_string_end
2691              
2692             The resulting pattern will be suffixed with a C<\Z>
2693             string boundary assertion when the value is true. Set
2694             to 0 to disable.
2695              
2696             # disable the string boundary end anchor
2697             $r->anchor_string_end(0);
2698              
2699             =head2 anchor_string_end_absolute
2700              
2701             The resulting pattern will be suffixed with a C<\z>
2702             string boundary assertion when the value is true. Set
2703             to 0 to disable.
2704              
2705             # disable the string boundary absolute end anchor
2706             $r->anchor_string_end_absolute(0);
2707              
2708             If you don't understand the difference between
2709             C<\Z> and C<\z>, the former will probably do what
2710             you want.
2711              
2712             =head2 anchor_string
2713              
2714             The resulting pattern will be have the C<\A> and C<\Z>
2715             string boundary assertions at the beginning and end
2716             of the pattern, respectively, when the value is true. Set
2717             to 0 to disable.
2718              
2719             $r->add(qw(cat carrot)
2720             ->anchor_string
2721             ->as_string; # produces '\Aca(?:rro)t\Z'
2722              
2723             =head2 anchor_string_absolute
2724              
2725             The resulting pattern will be have the C<\A> and C<\z>
2726             string boundary assertions at the beginning and end
2727             of the pattern, respectively, when the value is true. Set
2728             to 0 to disable.
2729              
2730             $r->add(qw(cat carrot)
2731             ->anchor_string_absolute
2732             ->as_string; # produces '\Aca(?:rro)t\z'
2733              
2734             =head2 debug(NUMBER)
2735              
2736             Turns debugging on or off. Statements are printed
2737             to the currently selected file handle (STDOUT by default).
2738             If you are already using this handle, you will have to
2739             arrange to select an output handle to a file of your own
2740             choosing, before call the C, C or C)
2741             functions, otherwise it will scribble all over your
2742             carefully formatted output.
2743              
2744             =over 4
2745              
2746             =item * 0
2747              
2748             Off. Turns off all debugging output.
2749              
2750             =item * 1
2751              
2752             Add. Trace the addition of patterns.
2753              
2754             =item * 2
2755              
2756             Reduce. Trace the process of reduction and assembly.
2757              
2758             =item * 4
2759              
2760             Lex. Trace the lexing of the input patterns into its constituent
2761             tokens.
2762              
2763             =item * 8
2764              
2765             Time. Print to STDOUT the time taken to load all the patterns. This is
2766             nothing more than the difference between the time the object was
2767             instantiated and the time reduction was initiated.
2768              
2769             # load=
2770              
2771             Any lengthy computation performed in the client code will be reflected
2772             in this value. Another line will be printed after reduction is
2773             complete.
2774              
2775             # reduce=
2776              
2777             The above output lines will be changed to C and
2778             C if the internal state of the object is corrupted
2779             and the initial timestamp is lost.
2780              
2781             The code attempts to load L in order to report fractional
2782             seconds. If this is not successful, the elapsed time is displayed
2783             in whole seconds.
2784              
2785             =back
2786              
2787             Values can be added (or or'ed together) to trace everything
2788              
2789             $r->debug(7)->add( '\\d+abc' );
2790              
2791             Calling C with no arguments turns debugging off.
2792              
2793             =head2 dump()
2794              
2795             Produces a synthetic view of the internal data structure. How
2796             to interpret the results is left as an exercise to the reader.
2797              
2798             print $r->dump;
2799              
2800             =head2 chomp(0|1)
2801              
2802             Turns chomping on or off.
2803              
2804             IMPORTANT: As of version 0.24, chomping is now on by default as it
2805             makes C Just Work. The only time you may run into trouble
2806             is with C. So don't do that, or else explicitly turn
2807             off chomping.
2808              
2809             To avoid incorporating (spurious)
2810             record separators (such as "\n" on Unix) when reading from a file,
2811             C Cs its input. If you don't want this to happen,
2812             call C with a false value.
2813              
2814             $re->chomp(0); # really want the record separators
2815             $re->add();
2816              
2817             =head2 fold_meta_pairs(NUMBER)
2818              
2819             Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820             are folded into a C<.> (dot). Folding happens by default (for
2821             reasons of backwards compatibility, even though it is wrong when
2822             the C expression modifier is active).
2823              
2824             Call this method with a false value to prevent this behaviour (which
2825             is only a problem when dealing with C<\n> if the C expression
2826             modifier is also set).
2827              
2828             $re->add( '\\w', '\\W' );
2829             my $clone = $re->clone;
2830              
2831             $clone->fold_meta_pairs(0);
2832             print $clone->as_string; # prints '.'
2833             print $re->as_string; # print '[\W\w]'
2834              
2835             =head2 indent(NUMBER)
2836              
2837             Sets the level of indent for pretty-printing nested groups
2838             within a pattern. See the C method for more details.
2839             When called without a parameter, no indenting is performed.
2840              
2841             $re->indent( 4 );
2842             print $re->as_string;
2843              
2844             =head2 lookahead(0|1)
2845              
2846             Turns on zero-width lookahead assertions. This is usually
2847             beneficial when you expect that the pattern will usually fail.
2848             If you expect that the pattern will usually match you will
2849             probably be worse off.
2850              
2851             =head2 flags(STRING)
2852              
2853             Sets the flags that govern how the pattern behaves (for
2854             versions of Perl up to 5.9 or so, these are C). By
2855             default no flags are enabled.
2856              
2857             =head2 modifiers(STRING)
2858              
2859             An alias of the C method, for users familiar with
2860             C.
2861              
2862             =head2 track(0|1)
2863              
2864             Turns tracking on or off. When this attribute is enabled,
2865             additional housekeeping information is inserted into the
2866             assembled expression using C<({...}> embedded code
2867             constructs. This provides the necessary information to
2868             determine which, of the original patterns added, was the
2869             one that caused the match.
2870              
2871             $re->track( 1 );
2872             if( $target =~ /$re/ ) {
2873             print "$target matched by ", $re->matched, "\n";
2874             }
2875              
2876             Note that when this functionality is enabled, no
2877             reduction is performed and no character classes are
2878             generated. In other words, C is not
2879             reduced down to C<(?:br|t)ag> and C is not
2880             reduced to C.
2881              
2882             =head2 unroll_plus(0|1)
2883              
2884             Turns the unrolling of plus metacharacters on or off. When
2885             a pattern is broken up, C becomes C, C (and
2886             C becomes C, C. This may allow the freed C
2887             to assemble with other patterns. Not enabled by default.
2888              
2889             =head2 lex(SCALAR)
2890              
2891             Change the pattern used to break a string apart into tokens.
2892             You can examine the C script as a starting point.
2893              
2894             =head2 reduce(0|1)
2895              
2896             Turns pattern reduction on or off. A reduced pattern may
2897             be considerably shorter than an unreduced pattern. Consider
2898             C I C. An unreduced
2899             pattern will be very similar to those produced by
2900             C. Reduction is on by default. Turning
2901             it off speeds assembly (but assembly is pretty fast -- it's
2902             the breaking up of the initial patterns in the lexing stage
2903             that can consume a non-negligible amount of time).
2904              
2905             =head2 mutable(0|1)
2906              
2907             This method has been marked as DEPRECATED. It will be removed
2908             in a future release. See the C method for a technique
2909             to replace its functionality.
2910              
2911             =head2 reset()
2912              
2913             Empties out the patterns that have been Ced or C-ed
2914             into the object. Does not modify the state of controller attributes
2915             such as C, C, C and the like.
2916              
2917             =head2 Default_Lexer
2918              
2919             B the C function is a class method, not
2920             an object method. It is a fatal error to call it as an object
2921             method.
2922              
2923             The C method lets you replace the default pattern
2924             used for all subsequently created C objects. It
2925             will not have any effect on existing objects. (It is also possible
2926             to override the lexer pattern used on a per-object basis).
2927              
2928             The parameter should be an ordinary scalar, not a compiled
2929             pattern. If the pattern fails to match all parts of the string,
2930             the missing parts will be returned as single chunks. Therefore
2931             the following pattern is legal (albeit rather cork-brained):
2932              
2933             Regexp::Assemble::Default_Lexer( '\\d' );
2934              
2935             The above pattern will split up input strings digit by digit, and
2936             all non-digit characters as single chunks.
2937              
2938             =head1 DIAGNOSTICS
2939              
2940             "Cannot pass a C to Default_Lexer"
2941              
2942             You tried to replace the default lexer pattern with an object
2943             instead of a scalar. Solution: You probably tried to call
2944             C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945             C.
2946              
2947             "filter method not passed a coderef"
2948              
2949             "pre_filter method not passed a coderef"
2950              
2951             A reference to a subroutine (anonymous or otherwise) was expected.
2952             Solution: read the documentation for the C method.
2953              
2954             "duplicate pattern added: /.../"
2955              
2956             The C attribute is active, and a duplicate pattern was
2957             added (well duh!). Solution: clean your data.
2958              
2959             "cannot open [file] for input: [reason]"
2960              
2961             The C method was unable to open the specified file for
2962             whatever reason. Solution: make sure the file exists and the script
2963             has the required privileges to read it.
2964              
2965             =head1 NOTES
2966              
2967             This module has been tested successfully with a range of versions
2968             of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969              
2970             The expressions produced by this module can be used with the PCRE
2971             library.
2972              
2973             Remember to "double up" your backslashes if the patterns are
2974             hard-coded as constants in your program. That is, you should
2975             literally C rather than C. It
2976             usually will work either way, but it's good practice to do so.
2977              
2978             Where possible, supply the simplest tokens possible. Don't add
2979             C when C will do. The reason is that
2980             if you also add C the resulting assembly changes
2981             dramatically: C I
2982             C. Since R::A doesn't perform enough analysis,
2983             it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984             the divergence after the first C<-d\d+>.
2985              
2986             Furthermore, when the string 'X-123000P' is matched against the
2987             first assembly, the regexp engine will have to backtrack over each
2988             alternation (the one that ends in Y B the one that ends in Z)
2989             before determining that there is no match. No such backtracking
2990             occurs in the second pattern: as soon as the engine encounters the
2991             'P' in the target string, neither of the alternations at that point
2992             (C<-\d+Y> or C) could succeed and so the match fails.
2993              
2994             C does, however, know how to build character
2995             classes. Given C, C and C, it will assemble these
2996             into C. When C<-> (dash) appears as a candidate for a
2997             character class it will be the first character in the class. When
2998             C<^> (circumflex) appears as a candidate for a character class it
2999             will be the last character in the class.
3000              
3001             It also knows about meta-characters than can "absorb" regular
3002             characters. For instance, given C and C, it knows that
3003             C<5> can be represented by C<\d> and so the assembly is just C.
3004             The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005             and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006             C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007             if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008              
3009             C deals correctly with C's propensity
3010             to backslash many characters that have no need to be. Backslashes on
3011             non-metacharacters will be removed. Similarly, in character classes,
3012             a number of characters lose their magic and so no longer need to be
3013             backslashed within a character class. Two common examples are C<.>
3014             (dot) and C<$>. Such characters will lose their backslash.
3015              
3016             At the same time, it will also process C<\Q...\E> sequences. When
3017             such a sequence is encountered, the inner section is extracted and
3018             C is applied to the section. The resulting quoted text
3019             is then used in place of the original unquoted text, and the C<\Q>
3020             and C<\E> metacharacters are thrown away. Similar processing occurs
3021             with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022             effects when using a dispatch table. In this case, you will need
3023             to know exactly what the module makes of your input. Use the C
3024             method to find out what's going on:
3025              
3026             $pattern = join( '', @{$re->lexstr($pattern)} );
3027              
3028             If all the digits 0..9 appear in a character class, C
3029             will replace them by C<\d>. I'd do it for letters as well, but
3030             thinking about accented characters and other glyphs hurts my head.
3031              
3032             In an alternation, the longest paths are chosen first (for example,
3033             C). When two paths have the same length, the path
3034             with the most subpaths will appear first. This aims to put the
3035             "busiest" paths to the front of the alternation. For example, the
3036             list C, C, C, C and C will produce the
3037             pattern C<(?:f(?:ew|ig|un)|b(?:ad|it))>. See F for a
3038             real-world example of how alternations are sorted. Once you have
3039             looked at that, everything should be crystal clear.
3040              
3041             When tracking is in use, no reduction is performed. nor are
3042             character classes formed. The reason is that it is
3043             too difficult to determine the original pattern afterwards. Consider the
3044             two patterns C and C. These should be reduced to
3045             C. The final character matches one of two possibilities.
3046             To resolve whether it matched an C<'e'> or C<'m'> would require
3047             keeping track of the fact that the pattern finished up in a character
3048             class, which would the require a whole lot more work to figure out
3049             which character of the class matched. Without character classes
3050             it becomes much easier. Instead, C is produced, which
3051             lets us find out more simply where we ended up.
3052              
3053             Similarly, C and C should form C<(?:dog|sea)food>.
3054             When the pattern is being assembled, the tracking decision needs
3055             to be made at the end of the grouping, but the tail of the pattern
3056             has not yet been visited. Deferring things to make this work correctly
3057             is a vast hassle. In this case, the pattern becomes merely
3058             C<(?:dogfood|seafood>. Tracked patterns will therefore be bulkier than
3059             simple patterns.
3060              
3061             There is an open bug on this issue:
3062              
3063             L
3064              
3065             If this bug is ever resolved, tracking would become much easier to
3066             deal with (none of the C hassle would be required - you could
3067             just match like a regular RE and it would Just Work).
3068              
3069             =head1 SEE ALSO
3070              
3071             =over 4
3072              
3073             =item L
3074              
3075             General information about Perl's regular expressions.
3076              
3077             =item L
3078              
3079             Specific information about C.
3080              
3081             =item Regex::PreSuf
3082              
3083             C takes a string and chops it itself into tokens of
3084             length 1. Since it can't deal with tokens of more than one character,
3085             it can't deal with meta-characters and thus no regular expressions.
3086             Which is the main reason why I wrote this module.
3087              
3088             =item Regexp::Optimizer
3089              
3090             C produces regular expressions that are similar to
3091             those produced by R::A with reductions switched off. It's biggest
3092             drawback is that it is exponentially slower than Regexp::Assemble on
3093             very large sets of patterns.
3094              
3095             =item Regexp::Parser
3096              
3097             Fine grained analysis of regular expressions.
3098              
3099             =item Regexp::Trie
3100              
3101             Funnily enough, this was my working name for C
3102             during its development. I changed the name because I thought it
3103             was too obscure. Anyway, C does much the same as
3104             C and C except that it runs
3105             much faster (according to the author). It does not recognise
3106             meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107              
3108             =item Text::Trie
3109              
3110             C is well worth investigating. Tries can outperform very
3111             bushy (read: many alternations) patterns.
3112              
3113             =item Tree::Trie
3114              
3115             C is another module that builds tries. The algorithm that
3116             C uses appears to be quite similar to the
3117             algorithm described therein, except that C solves its
3118             end-marker problem without having to rewrite the leaves.
3119              
3120             =back
3121              
3122             =head1 LIMITATIONS
3123              
3124             Some mildly complex cases are not handled well. See examples/failure.01.pl
3125             and L.
3126              
3127             does not attempt to find common substrings. For
3128             instance, it will not collapse C down to C.
3129             If there's a module out there that performs this sort of string
3130             analysis I'd like to know about it. But keep in mind that the
3131             algorithms that do this are very expensive: quadratic or worse.
3132              
3133             C does not interpret meta-character modifiers.
3134             For instance, if the following two patterns are
3135             given: C and C, it will not determine that C<\d> can be
3136             matched by C<\d+>. Instead, it will produce C. Along
3137             a similar line of reasoning, it will not determine that C and
3138             C is equivalent to C (It will produce C
3139             instead).
3140              
3141             You cannot remove a pattern that has been added to an object. You'll
3142             just have to start over again. Adding a pattern is difficult enough,
3143             I'd need a solid argument to convince me to add a C method.
3144             If you need to do this you should read the documentation for the
3145             C method.
3146              
3147             C does not (yet)? employ the C<(?E...)>
3148             construct.
3149              
3150             The module does not produce POSIX-style regular expressions. This
3151             would be quite easy to add, if there was a demand for it.
3152              
3153             =head1 BUGS
3154              
3155             Patterns that generate look-ahead assertions sometimes produce
3156             incorrect patterns in certain obscure corner cases. If you
3157             suspect that this is occurring in your pattern, disable
3158             lookaheads.
3159              
3160             Tracking doesn't really work at all with 5.6.0. It works better
3161             in subsequent 5.6 releases. For maximum reliability, the use of
3162             a 5.8 release is strongly recommended. Tracking barely works with
3163             5.005_04. Of note, using C<\d>-style meta-characters invariably
3164             causes panics. Tracking really comes into its own in Perl 5.10.
3165              
3166             If you feed C patterns with nested parentheses,
3167             there is a chance that the resulting pattern will be uncompilable
3168             due to mismatched parentheses (not enough closing parentheses). This
3169             is normal, so long as the default lexer pattern is used. If you want
3170             to find out which pattern among a list of 3000 patterns are to blame
3171             (speaking from experience here), the F script offers
3172             a strategy for pinpointing the pattern at fault. While you may not
3173             be able to use the script directly, the general approach is easy to
3174             implement.
3175              
3176             The algorithm used to assemble the regular expressions makes extensive
3177             use of mutually-recursive functions (that is, A calls B, B calls
3178             A, ...) For deeply similar expressions, it may be possible to provoke
3179             "Deep recursion" warnings.
3180              
3181             The module has been tested extensively, and has an extensive test
3182             suite (that achieves close to 100% statement coverage), but you
3183             never know... A bug may manifest itself in two ways: creating a
3184             pattern that cannot be compiled, such as C, or a pattern
3185             that compiles correctly but that either matches things it shouldn't,
3186             or doesn't match things it should. It is assumed that Such problems
3187             will occur when the reduction algorithm encounters some sort of
3188             edge case. A temporary work-around is to disable reductions:
3189              
3190             my $pattern = $assembler->reduce(0)->re;
3191              
3192             A discussion about implementation details and where bugs might lurk
3193             appears in the README file. If this file is not available locally,
3194             you should be able to find a copy on the Web at your nearest CPAN
3195             mirror.
3196              
3197             Seriously, though, a number of people have been using this module to
3198             create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3199             be working according to spec. Thus, I don't think there are any serious
3200             bugs remaining.
3201              
3202             If you are feeling brave, extensive debugging traces are available to
3203             figure out where assembly goes wrong.
3204              
3205             Please report all bugs at
3206             L
3207              
3208             Make sure you include the output from the following two commands:
3209              
3210             perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3211             perl -V
3212              
3213             There is a mailing list for the discussion of C.
3214             Subscription details are available at
3215             L.
3216              
3217             =head1 ACKNOWLEDGEMENTS
3218              
3219             This module grew out of work I did building access maps for Postfix,
3220             a modern SMTP mail transfer agent. See L
3221             for more information. I used Perl to build large regular expressions
3222             for blocking dynamic/residential IP addresses to cut down on spam
3223             and viruses. Once I had the code running for this, it was easy to
3224             start adding stuff to block really blatant spam subject lines, bogus
3225             HELO strings, spammer mailer-ids and more...
3226              
3227             I presented the work at the French Perl Workshop in 2004, and the
3228             thing most people asked was whether the underlying mechanism for
3229             assembling the REs was available as a module. At that time it was
3230             nothing more that a twisty maze of scripts, all different. The
3231             interest shown indicated that a module was called for. I'd like to
3232             thank the people who showed interest. Hey, it's going to make I
3233             messy scripts smaller, in any case.
3234              
3235             Thomas Drugeon was a valuable sounding board for trying out
3236             early ideas. Jean Forget and Philippe Blayo looked over an early
3237             version. H.Merijn Brandt stopped over in Paris one evening, and
3238             discussed things over a few beers.
3239              
3240             Nicholas Clark pointed out that while what this module does
3241             (?:c|sh)ould be done in perl's core, as per the 2004 TODO, he
3242             encouraged me to continue with the development of this module. In
3243             any event, this module allows one to gauge the difficulty of
3244             undertaking the endeavour in C. I'd rather gouge my eyes out with
3245             a blunt pencil.
3246              
3247             Paul Johnson settled the question as to whether this module should
3248             live in the Regex:: namespace, or Regexp:: namespace. If you're
3249             not convinced, try running the following one-liner:
3250              
3251             perl -le 'print ref qr//'
3252              
3253             Philippe Bruhat found a couple of corner cases where this module
3254             could produce incorrect results. Such feedback is invaluable,
3255             and only improves the module's quality.
3256              
3257             =head1 Machine-Readable Change Log
3258              
3259             The file Changes was converted into Changelog.ini by L.
3260              
3261             =head1 AUTHOR
3262              
3263             David Landgren
3264              
3265             Copyright (C) 2004-2011. All rights reserved.
3266              
3267             http://www.landgren.net/perl/
3268              
3269             If you use this module, I'd love to hear about what you're using
3270             it for. If you want to be informed of updates, send me a note.
3271              
3272             Ron Savage is co-maint of the module, starting with V 0.36.
3273              
3274             =head1 Repository
3275              
3276             L
3277              
3278             =head1 TODO
3279              
3280             1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3281             produces (?:conten[dt]|resend[dt]) but it is possible to produce
3282             (?:cont|res)en[dt] if one can spot the common tail nodes (and walk back
3283             the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3284              
3285             To do this requires a certain amount of restructuring of the code.
3286             Currently, the algorithm uses a two-phase approach. In the first
3287             phase, the trie is traversed and reductions are performed. In the
3288             second phase, the reduced trie is traversed and the pattern is
3289             emitted.
3290              
3291             What has to occur is that the reduction and emission have to occur
3292             together. As a node is completed, it is replaced by its string
3293             representation. This then allows child nodes to be compared for
3294             equality with a simple 'eq'. Since there is only a single traversal,
3295             the overall generation time might drop, even though the context
3296             baggage required to delve through the tree will be more expensive
3297             to carry along (a hash rather than a couple of scalars).
3298              
3299             Actually, a simpler approach is to take on a secret sentinel
3300             atom at the end of every pattern, which gives the reduction
3301             algorithm sufficient traction to create a perfect trie.
3302              
3303             I'm rewriting the reduction code using this technique.
3304              
3305             2. Investigate how (?>foo) works. Can it be applied?
3306              
3307             5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3308              
3309             6. Store callbacks per tracked pattern.
3310              
3311             12. utf-8... hmmmm...
3312              
3313             14. Adding qr//'ed patterns. For example, consider
3314             $r->add ( qr/^abc/i )
3315             ->add( qr/^abd/ )
3316             ->add( qr/^ab e/x );
3317             this should admit abc abC aBc aBC abd abe as matches
3318              
3319             16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3320             can't happen. (Possibly carp if it does appear during traversal)?
3321              
3322             17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3323             along the lines of (.{4))(\1+) would let the regexp engine
3324             itself be brought to bear on the matter, which is a rather
3325             appealing idea. Consider
3326              
3327             while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3328              
3329             as a starting point.
3330              
3331             19. The reduction code has become unbelievably baroque. Adding code
3332             to handle (sting,singing,sing) => s(?:(?:ing)?|t)ing was far
3333             too difficult. Adding more stuff just breaks existing behaviour.
3334             And fixing the ^abcd$ ... bug broke stuff all over again.
3335             Now that the corner cases are more clearly identified, a full
3336             rewrite of the reduction code is needed. And would admit the
3337             possibility of implementing items 1 and 17.
3338              
3339             20. Handle debug unrev with a separate bit
3340              
3341             23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3342             regexp
3343              
3344             24. Lookahead assertions contain serious bugs (as shown by
3345             assembling powersets. Need to save more context during reduction,
3346             which in turn will simplify the preparation of the lookahead
3347             classes. See also 19.
3348              
3349             26. _lex() swamps the overall run-time. It stems from the decision
3350             to use a single regexp to pull apart any pattern. A suite of
3351             simpler regexp to pick of parens, char classes, quantifiers
3352             and bare tokens may be faster. (This has been implemented as
3353             _fastlex(), but it's only marginally faster. Perhaps split-by-
3354             char and lex a la C?
3355              
3356             27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3357              
3358             28. We don't reroll unrolled a a* to a+ in indented or tracked
3359             output
3360              
3361             29. Use (*MARK n) in blead for tracked patterns, and use (*FAIL) for
3362             the unmatchable pattern.
3363              
3364             =head1 LICENSE
3365              
3366             This library is free software; you can redistribute it and/or modify
3367             it under the same terms as Perl itself.
3368              
3369             =cut
3370              
3371             # Return a +ve value to tell Perl the module is ready to go.
3372              
3373             'The Lusty Decadent Delights of Imperial Pompeii';