File Coverage

blib/lib/Regexp/Assemble.pm
Criterion Covered Total %
statement 1140 1155 98.7
branch 777 796 97.6
condition 164 177 92.6
subroutine 97 98 98.9
pod 49 49 100.0
total 2227 2275 97.8


line stmt bran cond sub pod time code
1             # Regexp::Assemple.pm
2             #
3             # Copyright (c) 2004-2011 David Landgren
4             # All rights reserved
5              
6             package Regexp::Assemble;
7              
8 11     11   22265 use strict;
  11         12  
  11         274  
9 11     11   35 use warnings;
  11         15  
  11         297  
10              
11 11     11   39 use constant DEBUG_ADD => 1;
  11         14  
  11         869  
12 11     11   39 use constant DEBUG_TAIL => 2;
  11         8  
  11         409  
13 11     11   36 use constant DEBUG_LEX => 4;
  11         15  
  11         378  
14 11     11   33 use constant DEBUG_TIME => 8;
  11         12  
  11         441  
15              
16 11     11   36 use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
  11         14  
  11         37377  
17              
18             # The following patterns were generated with examples/naive.
19              
20             $Default_Lexer = qr/(?![[(\\]).(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?|\\(?:[bABCEGLQUXZ]|[lu].|(?:[^\w]|[aefnrtdDwWsS]|c.|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}})|N\{\w+\}|[Pp](?:\{\w+\}|.))(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?)|\[.*?(?
21              
22             $Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]|c.|[^\w\/{|}-]|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}}))|[^\$^])$/;
23              
24             # The pattern to return when nothing has been added (and thus not match anything)
25              
26             $Always_Fail = "^\\b\0";
27              
28             our $VERSION = '0.37';
29              
30             # ------------------------------------------------
31              
32             sub new {
33 2194     2194 1 1045533 my $class = shift;
34 2194         3305 my %args = @_;
35              
36 2194         1690 my $anc;
37 2194         2449 for $anc (qw(word line string)) {
38 6582 100       11015 if (exists $args{"anchor_$anc"}) {
39 135         183 my $val = delete $args{"anchor_$anc"};
40 135         260 for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41 270 100       604 $args{$anchor} = $val unless exists $args{$anchor};
42             }
43             }
44             }
45              
46             # anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47 2194 100       3213 if (exists $args{anchor_string_absolute}) {
48 3         4 my $val = delete $args{anchor_string_absolute};
49 3         3 for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50 6 100       14 $args{$anchor} = $val unless exists $args{$anchor};
51             }
52             }
53              
54 2194   100     32580 exists $args{$_} or $args{$_} = 0 for qw(
55             anchor_word_begin
56             anchor_word_end
57             anchor_line_begin
58             anchor_line_end
59             anchor_string_begin
60             anchor_string_end
61             anchor_string_end_absolute
62             debug
63             dup_warn
64             indent
65             lookahead
66             mutable
67             track
68             unroll_plus
69             );
70              
71 2194   100     10019 exists $args{$_} or $args{$_} = 1 for qw(
72             fold_meta_pairs
73             reduce
74             chomp
75             );
76              
77 2194         4183 @args{qw(re str path)} = (undef, undef, []);
78              
79 2194   100     10301 $args{flags} ||= delete $args{modifiers} || '';
      100        
80 2194 100       3321 $args{lex} = $Current_Lexer if defined $Current_Lexer;
81              
82 2194         2838 my $self = bless \%args, $class;
83              
84 2194 100       3444 if ($self->_debug(DEBUG_TIME)) {
85 1         4 $self->_init_time_func();
86 1         5 $self->{_begin_time} = $self->{_time_func}->();
87             }
88             $self->{input_record_separator} = delete $self->{rs}
89 2194 100       3216 if exists $self->{rs};
90 2194 100       2884 exists $self->{file} and $self->add_file($self->{file});
91              
92 2193         3714 return $self;
93             }
94              
95             sub _init_time_func {
96 9     9   10 my $self = shift;
97 9 100       19 return if exists $self->{_time_func};
98              
99             # attempt to improve accuracy
100 6 100       13 if (!defined($self->{_use_time_hires})) {
101 5         11 eval {require Time::HiRes};
  5         1251  
102 5         2313 $self->{_use_time_hires} = $@;
103             }
104             $self->{_time_func} = length($self->{_use_time_hires}) > 0
105 4     4   10 ? sub { time }
106 6 100       25 : \&Time::HiRes::time
107             ;
108             }
109              
110             sub clone {
111 55     55 1 257 my $self = shift;
112 55         41 my $clone;
113 55         187 my @attr = grep {$_ ne 'path'} keys %$self;
  1314         1117  
114 55         94 @{$clone}{@attr} = @{$self}{@attr};
  55         317  
  55         117  
115 55         107 $clone->{path} = _path_clone($self->_path);
116 55         219 bless $clone, ref($self);
117             }
118              
119             sub _fastlex {
120 884     884   654 my $self = shift;
121 884         612 my $record = shift;
122 884         610 my $len = 0;
123 884         784 my @path = ();
124 884         661 my $case = '';
125 884         544 my $qm = '';
126              
127 884         749 my $debug = $self->{debug} & DEBUG_LEX;
128 884         626 my $unroll_plus = $self->{unroll_plus};
129              
130 884         600 my $token;
131             my $qualifier;
132 884 100       1167 $debug and print "# _lex <$record>\n";
133 884         622 my $modifier = q{(?:[*+?]\\??|\\{(?:\\d+(?:,\d*)?|,\d+)\\}\\??)?};
134 884         1548 my $class_matcher = qr/\[(?:\[:[a-z]+:\]|\\?.)*?\]/;
135 884         2454 my $paren_matcher = qr/\(.*?(?
136 884         2038 my $misc_matcher = qr/(?:(c)(.)|(0)(\d{2}))($modifier)/;
137 884         1866 my $regular_matcher = qr/([^\\[(])($modifier)/;
138 884         1094 my $qm_matcher = qr/(\\?.)/;
139              
140 884         644 my $matcher = $regular_matcher;
141             {
142 884 100       609 if ($record =~ /\G$matcher/gc) {
  5960 100       18509  
    100          
    100          
143             # neither a \\ nor [ nor ( followed by a modifer
144 3344 100 100     7502 if ($1 eq '\\E') {
    100 66        
145 12 100       27 $debug and print "# E\n";
146 12         15 $case = $qm = '';
147 12         9 $matcher = $regular_matcher;
148 12         13 redo;
149             }
150             elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151 5 100       11 $debug and print "# ignore \\L, \\U\n";
152 5         7 redo;
153             }
154 3327         2661 $token = $1;
155 3327 100       3597 $qualifier = defined $2 ? $2 : '';
156 3327 100       3588 $debug and print "# token <$token> <$qualifier>\n";
157 3327 100       2774 if ($qm) {
158 90         76 $token = quotemeta($token);
159 90         99 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
160             }
161             else {
162 3237         3044 $token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163             }
164 3327 100 100     4686 if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165 22 100       44 $1 and $qualifier .= $1;
166 22 100       40 $debug and print " unroll <$token><$token><$qualifier>\n";
167 22 100       59 $case and $token = $case eq 'L' ? lc($token) : uc($token);
    100          
168 22         41 push @path, $token, "$token$qualifier";
169             }
170             else {
171 3305 100       3545 $debug and print " clean <$token>\n";
172 3305 100       5639 push @path,
    100          
173             $case eq 'L' ? lc($token).$qualifier
174             : $case eq 'U' ? uc($token).$qualifier
175             : $token.$qualifier
176             ;
177             }
178 3327         2442 redo;
179             }
180              
181             elsif ($record =~ /\G\\/gc) {
182 1680 100       2084 $debug and print "# backslash\n";
183             # backslash
184 1680 100       9908 if ($record =~ /\G([sdwSDW])($modifier)/gc) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
185 443         523 ($token, $qualifier) = ($1, $2);
186 443 100       551 $debug and print "# meta <$token> <$qualifier>\n";
187 443 100 100     955 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
188             ? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189             : "\\$token$qualifier";
190             }
191             elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192 9 100       27 $debug and print "# x $1\n";
193 9         29 $token = quotemeta(chr(hex($1)));
194 9         10 $qualifier = $2;
195 9 100       23 $debug and print "# cooked <$token>\n";
196 9         25 $token =~ s/^\\([^\w$()*+.?\[\\\]^|{\/])$/$1/; # } balance
197 9 100       21 $debug and print "# giving <$token>\n";
198 9 100 100     53 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
199             ? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200             : "$token$qualifier";
201             }
202             elsif ($record =~ /\GQ/gc) {
203 26 100       47 $debug and print "# Q\n";
204 26         28 $qm = 1;
205 26         26 $matcher = $qm_matcher;
206             }
207             elsif ($record =~ /\G([LU])/gc) {
208 15 100       40 $debug and print "# case $1\n";
209 15         26 $case = $1;
210             }
211             elsif ($record =~ /\GE/gc) {
212 6 100       16 $debug and print "# E\n";
213 6         15 $case = $qm = '';
214 6         6 $matcher = $regular_matcher;
215             }
216             elsif ($record =~ /\G([lu])(.)/gc) {
217 6 100       18 $debug and print "# case $1 to <$2>\n";
218 6 100       21 push @path, $1 eq 'l' ? lc($2) : uc($2);
219             }
220 30         37 elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221 6 50       14 if ($] < 5.007) {
222 0         0 my $len = 0;
223 0         0 $len += length($_) for @arg;
224 0 0       0 $debug and print "# pos ", pos($record), " fixup add $len\n";
225 0         0 pos($record) = pos($record) + $len;
226             }
227 6         6 my $directive = shift @arg;
228 6 100       36 if ($directive eq 'c') {
229 3 100       11 $debug and print "# ctrl <@arg>\n";
230 3         9 push @path, "\\c" . uc(shift @arg);
231             }
232             else { # elsif ($directive eq '0') {
233 3 100       10 $debug and print "# octal <@arg>\n";
234 3         5 my $ascii = oct(shift @arg);
235 3 100       11 push @path, ($ascii < 32)
236             ? "\\c" . chr($ascii+64)
237             : chr($ascii)
238             ;
239             }
240 6         9 $path[-1] .= join( '', @arg ); # if @arg;
241 6         10 redo;
242             }
243             elsif ($record =~ /\G(.)/gc) {
244 1165         1438 $token = $1;
245 1165         1716 $token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@|/^]}{\\$token};
246 1165 100       1462 $debug and print "# meta <$token>\n";
247 1165         1339 push @path, $token;
248             }
249             else {
250 4 100       11 $debug and print "# ignore char at ", pos($record), " of <$record>\n";
251             }
252 1674         1767 redo;
253             }
254              
255             elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256             # [class] followed by a modifer
257 39         64 my $class = $1;
258 39 50       68 my $qualifier = defined $2 ? $2 : '';
259 39 100       83 $debug and print "# class begin <$class> <$qualifier>\n";
260 39 100       98 if ($class =~ /\A\[\\?(.)]\Z/) {
261 12         20 $class = quotemeta $1;
262 12         21 $class =~ s{\A\\([!@%])\Z}{$1};
263 12 100       21 $debug and print "# class unwrap $class\n";
264             }
265 39 100       60 $debug and print "# class end <$class> <$qualifier>\n";
266 39 100 100     128 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
267             ? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268             : "$class$qualifier";
269 39         52 redo;
270             }
271              
272             elsif ($record =~ /\G($paren_matcher)/gc) {
273 13 100       23 $debug and print "# paren <$1>\n";
274             # (paren) followed by a modifer
275 13         24 push @path, $1;
276 13         16 redo;
277             }
278              
279             }
280 884         2579 return \@path;
281             }
282              
283             sub _lex {
284 211     211   224 my $self = shift;
285 211         176 my $record = shift;
286 211         146 my $len = 0;
287 211         189 my @path = ();
288 211         146 my $case = '';
289 211         152 my $qm = '';
290             my $re = defined $self->{lex} ? $self->{lex}
291 211 50       335 : defined $Current_Lexer ? $Current_Lexer
    100          
292             : $Default_Lexer;
293 211         178 my $debug = $self->{debug} & DEBUG_LEX;
294 211 100       336 $debug and print "# _lex <$record>\n";
295 211         149 my ($token, $next_token, $diff, $token_len);
296 211         2320 while( $record =~ /($re)/g ) {
297 480         583 $token = $1;
298 480         331 $token_len = length($token);
299 480 100       789 $debug and print "# lexed <$token> len=$token_len\n";
300 480 100       664 if( pos($record) - $len > $token_len ) {
301 15         13 $next_token = $token;
302 15         20 $token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303 15 100       35 $debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304 15         42 $len += $diff;
305             }
306 480         324 $len += $token_len;
307             TOKEN: {
308 480 100       315 if( substr( $token, 0, 1 ) eq '\\' ) {
  495         671  
309 226 100       610 if( $token =~ /^\\([ELQU])$/ ) {
    100          
    100          
310 51 100       104 if( $1 eq 'E' ) {
    100          
311             $qm and $re = defined $self->{lex} ? $self->{lex}
312 12 50       33 : defined $Current_Lexer ? $Current_Lexer
    100          
    100          
313             : $Default_Lexer;
314 12         17 $case = $qm = '';
315             }
316             elsif( $1 eq 'Q' ) {
317 19         19 $qm = $1;
318             # switch to a more precise lexer to quotemeta individual characters
319 19         49 $re = qr/\\?./;
320             }
321             else {
322 20         22 $case = $1;
323             }
324 51 100       101 $debug and print "# state change qm=<$qm> case=<$case>\n";
325 51         358 goto NEXT_TOKEN;
326             }
327             elsif( $token =~ /^\\([lu])(.)$/ ) {
328 3 100       11 $debug and print "# apply case=<$1> to <$2>\n";
329 3 100       10 push @path, $1 eq 'l' ? lc($2) : uc($2);
330 3         22 goto NEXT_TOKEN;
331             }
332             elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333 41         91 $token = quotemeta(chr(hex($1)));
334 41 100       58 $debug and print "# cooked <$token>\n";
335 41         70 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
336 41 100       70 $debug and print "# giving <$token>\n";
337             }
338             else {
339 131         155 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
340 131 100       181 $debug and print "# backslashed <$token>\n";
341             }
342             }
343             else {
344 269 100       329 $case and $token = $case eq 'U' ? uc($token) : lc($token);
    100          
345 269 100       352 $qm and $token = quotemeta($token);
346 269 100       362 $token = '\\/' if $token eq '/';
347             }
348             # undo quotemeta's brute-force escapades
349 441 100       519 $qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
350 441 100       577 $debug and print "# <$token> case=<$case> qm=<$qm>\n";
351 441         491 push @path, $token;
352              
353             NEXT_TOKEN:
354 495 100       2529 if( defined $next_token ) {
355 15 100       30 $debug and print "# redo <$next_token>\n";
356 15         12 $token = $next_token;
357 15         12 $next_token = undef;
358 15         15 redo TOKEN;
359             }
360             }
361             }
362 211 100       286 if( $len < length($record) ) {
363             # NB: the remainder only arises in the case of degenerate lexer,
364             # and if \Q is operative, the lexer will have been switched to
365             # /\\?./, which means there can never be a remainder, so we
366             # don't have to bother about quotemeta. In other words:
367             # $qm will never be true in this block.
368 7         9 my $remain = substr($record,$len);
369 7 100       12 $case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
    100          
370 7 100       22 $debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371 7         8 push @path, $remain;
372             }
373 211 100       283 $debug and print "# _lex out <@path>\n";
374 211         937 return \@path;
375             }
376              
377             sub add {
378 1059     1059 1 8089 my $self = shift;
379 1059         740 my $record;
380 1059         1508 my $debug = $self->{debug} & DEBUG_LEX;
381 1059         1928 while( defined( $record = shift @_ )) {
382 2263 100       3261 CORE::chomp($record) if $self->{chomp};
383 2263 100 100     3484 next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384 2262 100       2831 $debug and print "# add <$record>\n";
385 2262         2022 $self->{stats_raw} += length $record;
386             my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387 2262 100       7483 ? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
    100          
388             : [split //, $record]
389             ;
390 2262 100 100     4252 next if $self->{filter} and not $self->{filter}->(@$list);
391 2261         2930 $self->_insertr( $list );
392             }
393 1059         1674 return $self;
394             }
395              
396             sub add_file {
397 13     13 1 14 my $self = shift;
398 13         8 my $rs;
399             my @file;
400 13 100       23 if (ref($_[0]) eq 'HASH') {
401 6         6 my $arg = shift;
402             $rs = $arg->{rs}
403             || $arg->{input_record_separator}
404             || $self->{input_record_separator}
405 6   66     22 || $/;
406             @file = ref($arg->{file}) eq 'ARRAY'
407 4         7 ? @{$arg->{file}}
408 6 100       11 : $arg->{file};
409             }
410             else {
411 7   66     19 $rs = $self->{input_record_separator} || $/;
412 7         10 @file = @_;
413             }
414 13         29 local $/ = $rs;
415 13         9 my $file;
416 13         14 for $file (@file) {
417 15 100       514 open my $fh, '<', $file or do {
418 1         8 require Carp;
419 1         149 Carp::croak("cannot open $file for input: $!");
420             };
421 14         102 while (defined (my $rec = <$fh>)) {
422 55         73 $self->add($rec);
423             }
424 14         90 close $fh;
425             }
426 12         44 return $self;
427             }
428              
429             sub insert {
430 3398     3398 1 6941 my $self = shift;
431 3398 100 100     5196 return if $self->{filter} and not $self->{filter}->(@_);
432 3397         5954 $self->_insertr( [@_] );
433 3397         5032 return $self;
434             }
435              
436             sub _insertr {
437 7068     7068   4681 my $self = shift;
438 7068   100     15969 my $dup = $self->{stats_dup} || 0;
439 7068         7865 $self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440 7068 100 100     13511 if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
    50          
441 7063         5621 ++$self->{stats_add};
442 7063 100       4528 $self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
  7063         26398  
443             }
444             elsif( $self->{dup_warn} ) {
445 0 0       0 if( ref $self->{dup_warn} eq 'CODE' ) {
446 0         0 $self->{dup_warn}->($self, $_[0]);
447             }
448             else {
449 0         0 my $pattern = join( '', @{$_[0]} );
  0         0  
450 0         0 require Carp;
451 0         0 Carp::carp("duplicate pattern added: /$pattern/");
452             }
453             }
454 7068         11456 $self->{str} = $self->{re} = undef;
455             }
456              
457             sub lexstr {
458 2     2 1 7 return shift->_lex(shift);
459             }
460              
461             sub pre_filter {
462 3     3 1 540 my $self = shift;
463 3         4 my $pre_filter = shift;
464 3 100 100     24 if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465 1         4 require Carp;
466 1         60 Carp::croak("pre_filter method not passed a coderef");
467             }
468 2         3 $self->{pre_filter} = $pre_filter;
469 2         4 return $self;
470             }
471              
472              
473             sub filter {
474 4     4 1 242 my $self = shift;
475 4         3 my $filter = shift;
476 4 100 100     21 if( defined $filter and ref($filter) ne 'CODE' ) {
477 1         8 require Carp;
478 1         117 Carp::croak("filter method not passed a coderef");
479             }
480 3         5 $self->{filter} = $filter;
481 3         5 return $self;
482             }
483              
484             sub as_string {
485 800     800 1 982 my $self = shift;
486 800 100       1228 if( not defined $self->{str} ) {
487 798 100       928 if( $self->{track} ) {
488 8         9 $self->{m} = undef;
489 8         10 $self->{mcount} = 0;
490 8         11 $self->{mlist} = [];
491 8         14 $self->{str} = _re_path_track($self, $self->_path, '', '');
492             }
493             else {
494 790 100 100     3171 $self->_reduce unless ($self->{mutable} or not $self->{reduce});
495 790         812 my $arg = {@_};
496             $arg->{indent} = $self->{indent}
497 790 100 100     2526 if not exists $arg->{indent} and $self->{indent} > 0;
498 790 100 100     1807 if( exists $arg->{indent} and $arg->{indent} > 0 ) {
    100          
499 42         38 $arg->{depth} = 0;
500 42         72 $self->{str} = _re_path_pretty($self, $self->_path, $arg);
501             }
502             elsif( $self->{lookahead} ) {
503 35         52 $self->{str} = _re_path_lookahead($self, $self->_path);
504             }
505             else {
506 713         781 $self->{str} = _re_path($self, $self->_path);
507             }
508             }
509 798 100       1369 if (not length $self->{str}) {
510             # explicitly fail to match anything if no pattern was generated
511 9         14 $self->{str} = $Always_Fail;
512             }
513             else {
514             my $begin =
515             $self->{anchor_word_begin} ? '\\b'
516             : $self->{anchor_line_begin} ? '^'
517 789 100       1707 : $self->{anchor_string_begin} ? '\A'
    100          
    100          
518             : ''
519             ;
520             my $end =
521             $self->{anchor_word_end} ? '\\b'
522             : $self->{anchor_line_end} ? '$'
523             : $self->{anchor_string_end} ? '\Z'
524 789 100       1755 : $self->{anchor_string_end_absolute} ? '\z'
    100          
    100          
    100          
525             : ''
526             ;
527 789         1150 $self->{str} = "$begin$self->{str}$end";
528             }
529 798 100       1545 $self->{path} = [] unless $self->{mutable};
530             }
531 800         2850 return $self->{str};
532             }
533              
534             sub re {
535 122     122 1 880 my $self = shift;
536 122 100       306 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
537 122         424 return $self->{re};
538             }
539              
540             use overload '""' => sub {
541 2131     2131   380510 my $self = shift;
542 2131 100       11608 return $self->{re} if $self->{re};
543 489         648 $self->_build_re($self->as_string());
544 489         4033 return $self->{re};
545 11     11   11455 };
  11         8936  
  11         102  
546              
547             sub _build_re {
548 618     618   469 my $self = shift;
549 618         443 my $str = shift;
550 618 100       722 if( $self->{track} ) {
551 11     11   788 use re 'eval';
  11         16  
  11         89333  
552             $self->{re} = length $self->{flags}
553 8 100       992 ? qr/(?$self->{flags}:$str)/
554             : qr/$str/
555             ;
556             }
557             else {
558             # how could I not repeat myself?
559             $self->{re} = length $self->{flags}
560 610 100       9172 ? qr/(?$self->{flags}:$str)/
561             : qr/$str/
562             ;
563             }
564             }
565              
566             sub match {
567 29     29 1 3095 my $self = shift;
568 29         23 my $target = shift;
569 29 100       72 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
570 29         37 $self->{m} = undef;
571 29         42 $self->{mvar} = [];
572 29 100       647 if( not $target =~ /$self->{re}/ ) {
573 8         14 $self->{mbegin} = [];
574 8         19 $self->{mend} = [];
575 8         38 return undef;
576             }
577 21 50       45 $self->{m} = $^R if $] >= 5.009005;
578 21         84 $self->{mbegin} = _path_copy([@-]);
579 21         93 $self->{mend} = _path_copy([@+]);
580 21         37 my $n = 0;
581 21         40 for( my $n = 0; $n < @-; ++$n ) {
582 43 100 66     171 push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
  33         178  
583             if defined $-[$n] and defined $+[$n];
584             }
585 21 100       40 if( $self->{track} ) {
586 20 50       111 return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587             }
588             else {
589 1         6 return 1;
590             }
591             }
592              
593             sub source {
594 4     4 1 247 my $self = shift;
595 4 100       14 return unless $self->{track};
596 3 100       8 defined($_[0]) and return $self->{mlist}[$_[0]];
597 2 100       7 return unless defined $self->{m};
598 1         4 return $self->{mlist}[$self->{m}];
599             }
600              
601             sub mbegin {
602 3     3 1 7 my $self = shift;
603 3 100       25 return exists $self->{mbegin} ? $self->{mbegin} : [];
604             }
605              
606             sub mend {
607 3     3 1 20 my $self = shift;
608 3 100       18 return exists $self->{mend} ? $self->{mend} : [];
609             }
610              
611             sub mvar {
612 19     19 1 24 my $self = shift;
613 19 100       46 return undef unless exists $self->{mvar};
614 18 100       97 return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615             }
616              
617             sub capture {
618 5     5 1 12 my $self = shift;
619 5 100       13 if( $self->{mvar} ) {
620 4         5 my @capture = @{$self->{mvar}};
  4         12  
621 4         5 shift @capture;
622 4         16 return @capture;
623             }
624 1         3 return ();
625             }
626              
627             sub matched {
628 9     9 1 222 my $self = shift;
629 9 100       48 return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630             }
631              
632             sub stats_add {
633 2     2 1 8 my $self = shift;
634 2   100     11 return $self->{stats_add} || 0;
635             }
636              
637             sub stats_dup {
638 2     2 1 2 my $self = shift;
639 2   100     12 return $self->{stats_dup} || 0;
640             }
641              
642             sub stats_raw {
643 2     2 1 3 my $self = shift;
644 2   100     12 return $self->{stats_raw} || 0;
645             }
646              
647             sub stats_cooked {
648 2     2 1 4 my $self = shift;
649 2   100     11 return $self->{stats_cooked} || 0;
650             }
651              
652             sub stats_length {
653 6     6 1 1665 my $self = shift;
654 6 100 100     39 return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655             }
656              
657             sub dup_warn {
658 0     0 1 0 my $self = shift;
659 0 0       0 $self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660 0         0 return $self;
661             }
662              
663             sub anchor_word_begin {
664 5     5 1 3 my $self = shift;
665 5 100       10 $self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666 5         8 return $self;
667             }
668              
669             sub anchor_word_end {
670 4     4 1 4 my $self = shift;
671 4 100       7 $self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672 4         6 return $self;
673             }
674              
675             sub anchor_word {
676 2     2 1 2 my $self = shift;
677 2         2 my $state = shift;
678 2         5 $self->anchor_word_begin($state)->anchor_word_end($state);
679 2         5 return $self;
680             }
681              
682             sub anchor_line_begin {
683 4     4 1 3 my $self = shift;
684 4 100       8 $self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685 4         7 return $self;
686             }
687              
688             sub anchor_line_end {
689 2     2 1 1 my $self = shift;
690 2 100       5 $self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691 2         1 return $self;
692             }
693              
694             sub anchor_line {
695 2     2 1 3 my $self = shift;
696 2         1 my $state = shift;
697 2         4 $self->anchor_line_begin($state)->anchor_line_end($state);
698 2         7 return $self;
699             }
700              
701             sub anchor_string_begin {
702 277     277 1 188 my $self = shift;
703 277 100       357 $self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704 277         321 return $self;
705             }
706              
707             sub anchor_string_end {
708 276     276 1 166 my $self = shift;
709 276 100       315 $self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710 276         197 return $self;
711             }
712              
713             sub anchor_string_end_absolute {
714 3     3 1 3 my $self = shift;
715 3 100       5 $self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716 3         4 return $self;
717             }
718              
719             sub anchor_string {
720 274     274 1 193 my $self = shift;
721 274 100       367 my $state = defined($_[0]) ? $_[0] : 1;
722 274         417 $self->anchor_string_begin($state)->anchor_string_end($state);
723 274         578 return $self;
724             }
725              
726             sub anchor_string_absolute {
727 2     2 1 1 my $self = shift;
728 2 100       5 my $state = defined($_[0]) ? $_[0] : 1;
729 2         3 $self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730 2         4 return $self;
731             }
732              
733             sub debug {
734 605     605 1 1481 my $self = shift;
735 605 100       925 $self->{debug} = defined($_[0]) ? $_[0] : 0;
736 605 100       616 if ($self->_debug(DEBUG_TIME)) {
737             # hmm, debugging time was switched on after instantiation
738 4         28 $self->_init_time_func;
739 4         14 $self->{_begin_time} = $self->{_time_func}->();
740             }
741 605         495 return $self;
742             }
743              
744             sub dump {
745 9     9 1 514 return _dump($_[0]->_path);
746             }
747              
748             sub chomp {
749 22     22 1 498 my $self = shift;
750 22 100       32 $self->{chomp} = defined($_[0]) ? $_[0] : 1;
751 22         37 return $self;
752             }
753              
754             sub fold_meta_pairs {
755 5     5 1 4 my $self = shift;
756 5 100       14 $self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757 5         7 return $self;
758             }
759              
760             sub indent {
761 4     4 1 477 my $self = shift;
762 4 100       10 $self->{indent} = defined($_[0]) ? $_[0] : 0;
763 4         8 return $self;
764             }
765              
766             sub lookahead {
767 22     22 1 19 my $self = shift;
768 22 100       45 $self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769 22         34 return $self;
770             }
771              
772             sub flags {
773 24     24 1 956 my $self = shift;
774 24 100       47 $self->{flags} = defined($_[0]) ? $_[0] : '';
775 24         40 return $self;
776             }
777              
778             sub modifiers {
779 4     4 1 936 my $self = shift;
780 4         9 return $self->flags(@_);
781             }
782              
783             sub track {
784 5     5 1 922 my $self = shift;
785 5 100       11 $self->{track} = defined($_[0]) ? $_[0] : 1;
786 5         8 return $self;
787             }
788              
789             sub unroll_plus {
790 2     2 1 513 my $self = shift;
791 2 100       6 $self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792 2         4 return $self;
793             }
794              
795             sub lex {
796 1     1 1 1 my $self = shift;
797 1         39 $self->{lex} = qr($_[0]);
798 1         2 return $self;
799             }
800              
801             sub reduce {
802 19     19 1 721 my $self = shift;
803 19 100       50 $self->{reduce} = defined($_[0]) ? $_[0] : 1;
804 19         31 return $self;
805             }
806              
807             sub mutable {
808 5     5 1 737 my $self = shift;
809 5 100       16 $self->{mutable} = defined($_[0]) ? $_[0] : 1;
810 5         6 return $self;
811             }
812              
813             sub reset {
814             # reinitialise the internal state of the object
815 19     19 1 1402 my $self = shift;
816 19         30 $self->{path} = [];
817 19         33 $self->{re} = undef;
818 19         19 $self->{str} = undef;
819 19         52 return $self;
820             }
821              
822             sub Default_Lexer {
823 4 100   4 1 2230 if( $_[0] ) {
824 3 100       13 if( my $refname = ref($_[0]) ) {
825 1         8 require Carp;
826 1         138 Carp::croak("Cannot pass a $refname to Default_Lexer");
827             }
828 2         15 $Current_Lexer = $_[0];
829             }
830 3 100       11 return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831             }
832              
833             # --- no user serviceable parts below ---
834              
835             # -- debug helpers
836              
837             sub _debug {
838 12297     12297   7986 my $self = shift;
839 12297 100       24416 return $self->{debug} & shift() ? 1 : 0;
840             }
841              
842             # -- helpers
843              
844             sub _path {
845             # access the path
846 10011     10011   15380 return $_[0]->{path};
847             }
848              
849             # -- the heart of the matter
850              
851             $have_Storable = do {
852             eval {
853             require Storable;
854             import Storable 'dclone';
855             };
856             $@ ? 0 : 1;
857             };
858              
859             sub _path_clone {
860 55 100   55   1976 $have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861             }
862              
863             sub _path_copy {
864 80     80   403 my $path = shift;
865 80         65 my $new = [];
866 80         136 for( my $p = 0; $p < @$path; ++$p ) {
867 201 100       302 if( ref($path->[$p]) eq 'HASH' ) {
    100          
868 9         16 push @$new, _node_copy($path->[$p]);
869             }
870             elsif( ref($path->[$p]) eq 'ARRAY' ) {
871 3         3 push @$new, _path_copy($path->[$p]);
872             }
873             else {
874 189         319 push @$new, $path->[$p];
875             }
876             }
877 80         132 return $new;
878             }
879              
880             sub _node_copy {
881 11     11   8 my $node = shift;
882 11         9 my $new = {};
883 11         29 while( my( $k, $v ) = each %$node ) {
884 27 100       44 $new->{$k} = defined($v)
885             ? _path_copy($v)
886             : undef
887             ;
888             }
889 11         20 return $new;
890             }
891              
892             sub _insert_path {
893 7142     7142   5000 my $self = shift;
894 7142         4472 my $list = shift;
895 7142         4332 my $debug = shift;
896 7142         4589 my @in = @{shift()}; # create a new copy
  7142         11832  
897 7142 100       9983 if( @$list == 0 ) { # special case the first time
898 2030 100 100     6861 if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
      66        
      66        
899 33         81 return [{'' => undef}];
900             }
901             else {
902 1997         3256 return \@in;
903             }
904             }
905 5112 100       6219 $debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
  151         207  
  151         147  
906 5112         3535 my $path = $list;
907 5112         3170 my $offset = 0;
908 5112         3106 my $token;
909 5112 100       6348 if( not @in ) {
910 2 100       6 if( ref($list->[0]) ne 'HASH' ) {
911 1         2 return [ { '' => undef, $list->[0] => $list } ];
912             }
913             else {
914 1         1 $list->[0]{''} = undef;
915 1         2 return $list;
916             }
917             }
918 5110         8098 while( defined( $token = shift @in )) {
919 17713 100       20516 if( ref($token) eq 'HASH' ) {
920 282 100       409 $debug and print "# p0=", _dump($path), "\n";
921 282         492 $path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922 282 100       407 $debug and print "# p1=", _dump($path), "\n";
923 282         274 last;
924             }
925 17431 100       21085 if( ref($path->[$offset]) eq 'HASH' ) {
926 3703 100       4250 $debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
  54         171  
927 3703         2637 my $node = $path->[$offset];
928 3703 100       3967 if( exists( $node->{$token} )) {
929 2624 100       2721 if ($offset < $#$path) {
930             my $new = {
931             $token => [$token, @in],
932 1         3 _re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
  1         3  
933             };
934 1         3 splice @$path, $offset, @$path-$offset, $new;
935 1         1 last;
936             }
937             else {
938 2623 100       2985 $debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
  31         55  
939 2623         1893 $path = $node->{$token};
940 2623         1723 $offset = 0;
941 2623         2047 redo;
942             }
943             }
944             else {
945 1079 100       1303 $debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
  23         32  
  23         32  
  23         329  
946 1079 100       1237 if( $offset == $#$path ) {
947 1072         2084 $node->{$token} = [ $token, @in ];
948             }
949             else {
950             my $new = {
951             _node_key($token) => [ $token, @in ],
952 7         13 _node_key($node) => [@{$path}[$offset..$#{$path}]],
  7         19  
  7         10  
953             };
954 7         18 splice( @$path, $offset, @$path - $offset, $new );
955 7 100       17 $debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
  1         2  
  1         3  
956             }
957 1079         1031 last;
958             }
959             }
960              
961 13728 100       14830 if( $debug ) {
962 306         226 my $msg = '';
963 306         184 my $n;
964 306         447 for( $n = 0; $n < @$path; ++$n ) {
965 1093 100       1252 $msg .= ' ' if $n;
966             my $atom = ref($path->[$n]) eq 'HASH'
967 1093 100       1225 ? '{'.join( ' ', keys(%{$path->[$n]})).'}'
  81         173  
968             : $path->[$n]
969             ;
970 1093 100       1951 $msg .= $n == $offset ? "<$atom>" : $atom;
971             }
972 306         701 print "# at path ($msg)\n";
973             }
974              
975 13728 100       25093 if( $offset >= @$path ) {
    100          
    100          
976 732         1588 push @$path, { $token => [ $token, @in ], '' => undef };
977 732 100       1005 $debug and print "# added remaining @{[_dump($path)]}\n";
  21         32  
978 732         599 last;
979             }
980             elsif( $token ne $path->[$offset] ) {
981 2099 100       2583 $debug and print "# token $token not present\n";
982             splice @$path, $offset, @$path-$offset, {
983             length $token
984             ? ( _node_key($token) => [$token, @in])
985             : ( '' => undef )
986             ,
987 2099 100       4018 $path->[$offset] => [@{$path}[$offset..$#{$path}]],
  2099         5812  
  2099         2253  
988             };
989 2099 100       3433 $debug and print "# path=@{[_dump($path)]}\n";
  79         95  
990 2099         1792 last;
991             }
992             elsif( not @in ) {
993 917 100       1033 $debug and print "# last token to add\n";
994 917 100       1206 if( defined( $path->[$offset+1] )) {
995 912         665 ++$offset;
996 912 100       964 if( ref($path->[$offset]) eq 'HASH' ) {
997 118 100       145 $debug and print "# add sentinel to node\n";
998 118         140 $path->[$offset]{''} = undef;
999             }
1000             else {
1001 794 100       892 $debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002             splice @$path, $offset, @$path-$offset, {
1003             '' => undef,
1004 794         919 $path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
  794         1758  
  794         802  
1005             };
1006             }
1007             }
1008             else {
1009             # already seen this pattern
1010 5         7 ++$self->{stats_dup};
1011             }
1012 917         870 last;
1013             }
1014             # if we get here then @_ still contains a token
1015 9980         12880 ++$offset;
1016             }
1017 5110         6727 $list;
1018             }
1019              
1020             sub _insert_node {
1021 282     282   234 my $self = shift;
1022 282         208 my $path = shift;
1023 282         195 my $offset = shift;
1024 282         212 my $token = shift;
1025 282         204 my $debug = shift;
1026 282         257 my $path_end = [@{$path}[$offset..$#{$path}]];
  282         385  
  282         309  
1027             # NB: $path->[$offset] and $[path_end->[0] are equivalent
1028 282         552 my $token_key = _re_path($self, [$token]);
1029 282 100       534 $debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
  26         45  
  26         50  
1030 26         43 " at path=@{[_dump($path_end)]}\n";
1031 282 100       519 if( ref($path_end->[0]) eq 'HASH' ) {
1032 195 100       459 if( exists($path_end->[0]{$token_key}) ) {
    100          
1033 25 100       48 if( @$path_end > 1 ) {
1034 2         6 my $path_key = _re_path($self, [$path_end->[0]]);
1035 2         10 my $new = {
1036             $path_key => [ @$path_end ],
1037             $token_key => [ $token, @_ ],
1038             };
1039 2 100       6 $debug and print "# +bifurcate new=@{[_dump($new)]}\n";
  1         3  
1040 2         7 splice( @$path, $offset, @$path_end, $new );
1041             }
1042             else {
1043 23         28 my $old_path = $path_end->[0]{$token_key};
1044 23         29 my $new_path = [];
1045 23   100     92 while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046 30 100       80 $debug and print "# identical nodes in sub_path ",
    100          
1047             ref($token) ? _dump($token) : $token, "\n";
1048 30         45 push @$new_path, shift(@$old_path);
1049 30         76 $token = shift @_;
1050             }
1051 23 50       112 if( @$new_path ) {
1052 23         22 my $new;
1053 23         26 my $token_key = $token;
1054 23 100       36 if( @_ ) {
1055 6         11 $new = {
1056             _re_path($self, $old_path) => $old_path,
1057             $token_key => [$token, @_],
1058             };
1059 6 100       14 $debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
  1         4  
1060             }
1061             else {
1062 17 100       30 $debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
  5         7  
1063 17 100       29 if( @$old_path ) {
1064 11         38 $new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065             }
1066             else {
1067 6         15 $new = { '' => undef, $token => [$token] };
1068             }
1069             }
1070 23         35 push @$new_path, $new;
1071             }
1072 23         38 $path_end->[0]{$token_key} = $new_path;
1073 23 100       37 $debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
  6         11  
1074 23         66 splice( @$path, $offset, @$path_end, @$path_end );
1075             }
1076             }
1077             elsif( not _node_eq( $path_end->[0], $token )) {
1078 70 100       105 if( @$path_end > 1 ) {
1079 11         26 my $path_key = _re_path($self, [$path_end->[0]]);
1080 11         44 my $new = {
1081             $path_key => [ @$path_end ],
1082             $token_key => [ $token, @_ ],
1083             };
1084 11 100       27 $debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
  1         3  
1085 11         27 splice( @$path, $offset, @$path_end, $new );
1086             }
1087             else {
1088 59 100       142 $debug and print "# next in path is node, trivial insert at $token_key\n";
1089 59         139 $path_end->[0]{$token_key} = [$token, @_];
1090 59         114 splice( @$path, $offset, @$path_end, @$path_end );
1091             }
1092             }
1093             else {
1094 100   100     271 while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095 131 100       283 $debug and print "# identical nodes @{[_dump([$token])]}\n";
  9         24  
1096 131         114 shift @$path_end;
1097 131         134 $token = shift @_;
1098 131         252 ++$offset;
1099             }
1100 100 100       140 if( @$path_end ) {
1101 57 100       82 $debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
  4         10  
  4         7  
1102 57         180 $path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103 57 100       102 $debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
  4         12  
  4         6  
1104 57         99 splice( @$path, $offset, @$path - $offset, @$path_end );
1105 57 100       97 $debug and print "# got final=@{[_dump($path)]}\n";
  4         8  
1106             }
1107             else {
1108 43         69 $token_key = _node_key($token);
1109 43         124 my $new = {
1110             '' => undef,
1111             $token_key => [ $token, @_ ],
1112             };
1113 43 100       74 $debug and print "# convert opt @{[_dump($new)]}\n";
  3         6  
1114 43         55 push @$path, $new;
1115             }
1116             }
1117             }
1118             else {
1119 87 100       126 if( @$path_end ) {
1120 74         262 my $new = {
1121             $path_end->[0] => [ @$path_end ],
1122             $token_key => [ $token, @_ ],
1123             };
1124 74 100       121 $debug and print "# atom->node @{[_dump($new)]}\n";
  5         10  
1125 74         131 splice( @$path, $offset, @$path_end, $new );
1126 74 100       131 $debug and print "# out=@{[_dump($path)]}\n";
  5         8  
1127             }
1128             else {
1129 13 100       31 $debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
  4         12  
1130 13         42 push @$path, {
1131             '' => undef,
1132             $token_key => [ $token, @_ ],
1133             };
1134             }
1135             }
1136 282         509 $path;
1137             }
1138              
1139             sub _reduce {
1140 810     810   654 my $self = shift;
1141 810         1000 my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142              
1143 810 100       977 if ($self->_debug(DEBUG_TIME)) {
1144 4         7 $self->_init_time_func;
1145 4         11 my $now = $self->{_time_func}->();
1146 4 100       8 if (exists $self->{_begin_time}) {
1147 3         104 printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148             }
1149             else {
1150 1         41 printf "# load-epoch=%0.6f\n", $now;
1151             }
1152 4         14 $self->{_begin_time} = $self->{_time_func}->();
1153             }
1154              
1155 810         974 my ($head, $tail) = _reduce_path( $self->_path, $context );
1156 810 100       1221 $context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157 810 100       957 if( !@$head ) {
1158 651         649 $self->{path} = $tail;
1159             }
1160             else {
1161             $self->{path} = [
1162 159         237 @{_unrev_path( $tail, $context )},
1163 159         123 @{_unrev_path( $head, $context )},
  159         187  
1164             ];
1165             }
1166              
1167 810 100       1334 if ($self->_debug(DEBUG_TIME)) {
1168 4         9 my $now = $self->{_time_func}->();
1169 4 50       7 if (exists $self->{_begin_time}) {
1170 4         102 printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171             }
1172             else {
1173 0         0 printf "# reduce-epoch=%0.6f\n", $now;
1174             }
1175 4         12 $self->{_begin_time} = $self->{_time_func}->();
1176             }
1177              
1178 810 100       1104 $context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179 810         1153 return $self;
1180             }
1181              
1182             sub _remove_optional {
1183 1710 100   1710   2420 if( exists $_[0]->{''} ) {
1184 350         364 delete $_[0]->{''};
1185 350         350 return 1;
1186             }
1187 1360         1860 return 0;
1188             }
1189              
1190             sub _reduce_path {
1191 810     810   749 my ($path, $ctx) = @_;
1192 810         1091 my $indent = ' ' x $ctx->{depth};
1193 810         646 my $debug = $ctx->{debug};
1194 810 100       1074 $debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195 810         636 my $new;
1196 810         714 my $head = [];
1197 810         621 my $tail = [];
1198 810         1474 while( defined( my $p = pop @$path )) {
1199 1159 100       1536 if( ref($p) eq 'HASH' ) {
1200 514         706 my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201 514 100       1102 $debug and print "#$indent| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202 514 100       822 push @$head, @$node_head if scalar @$node_head;
1203 514 100       2700 push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204             }
1205             else {
1206 645 100       695 if( @$head ) {
1207 125 100       179 $debug and print "#$indent| push $p leaves @{[_dump($path)]}\n";
  7         12  
1208 125         271 push @$tail, $p;
1209             }
1210             else {
1211 520 100       613 $debug and print "#$indent| unshift $p\n";
1212 520         1214 unshift @$tail, $p;
1213             }
1214             }
1215             }
1216 20         108 $debug and print "#$indent| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217 810 100       1036 (ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
  18 100       73  
1218             "\n";
1219 810 100 100     1955 if( @$tail > 1
      100        
1220             and ref($tail->[0]) eq 'HASH'
1221 97         298 and keys %{$tail->[0]} == 2
1222             ) {
1223 72         70 my $opt;
1224             my $fixed;
1225 72         61 while( my ($key, $path) = each %{$tail->[0]} ) {
  216         428  
1226 144 100       204 $debug and print "#$indent| scan k=$key p=@{[_dump($path)]}\n";
  14         15  
1227 144 100       200 next unless $path;
1228 123 100 100     317 if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229 8         14 $opt = $path->[0];
1230             }
1231             else {
1232 115         125 $fixed = $path;
1233             }
1234             }
1235 72 100       145 if( exists $tail->[0]{''} ) {
1236 21         28 my $path = [@{$tail}[1..$#{$tail}]];
  21         37  
  21         25  
1237 21         28 $tail = $tail->[0];
1238 21         42 ($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239 21         60 $tail = [$tail, @$path];
1240             }
1241             }
1242 810 100       1023 $debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243 810         1083 return ($head, $tail);
1244             }
1245              
1246             sub _reduce_node {
1247 1080     1080   937 my ($node, $ctx) = @_;
1248 1080         1193 my $indent = ' ' x $ctx->{depth};
1249 1080         830 my $debug = $ctx->{debug};
1250 1080         1083 my $optional = _remove_optional($node);
1251 1080 100       1480 $debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
  57         66  
1252 1080 100 100     1954 if( $optional and scalar keys %$node == 1 ) {
1253 70         78 my $path = (values %$node)[0];
1254 70 100       69 if( not grep { ref($_) eq 'HASH' } @$path ) {
  117         209  
1255             # if we have removed an optional, and there is only one path
1256             # left then there is nothing left to compare. Because of the
1257             # optional it cannot participate in any further reductions.
1258             # (unless we test for equality among sub-trees).
1259 61         108 my $result = {
1260             '' => undef,
1261             $path->[0] => $path
1262             };
1263 61 100       90 $debug and print "#$indent| fast fail @{[_dump($result)]}\n";
  2         4  
1264 61         97 return [], $result;
1265             }
1266             }
1267              
1268 1019         1073 my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269              
1270 1019 100       1871 $debug and print "#$indent|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
  55         75  
  55         81  
1271              
1272             # We now perform tail reduction on each of the nodes in the reduce
1273             # hash. If we have only one key, we know we will have a successful
1274             # reduction (since everything that was inserted into the node based
1275             # on the value of the last token of each path all mapped to the same
1276             # value).
1277              
1278 1019 100 100     4378 if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
      100        
1279             # every path shares a common path
1280 506         626 my $path = (values %$reduce)[0];
1281 506         600 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282 506 100       919 $debug and print "#$indent|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
  50         76  
1283 506         1534 return( $common, $tail );
1284             }
1285              
1286             # this node resulted in a list of paths, game over
1287 513         643 $ctx->{indent} = $indent;
1288 513         583 return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289             }
1290              
1291             sub _reduce_fail {
1292 513     513   450 my( $reduce, $fail, $optional, $ctx ) = @_;
1293 513         400 my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
  513         659  
1294 513         377 my %result;
1295 513 100       727 $result{''} = undef if $optional;
1296 513         324 my $p;
1297 513         705 for $p (keys %$reduce) {
1298 1181         861 my $path = $reduce->{$p};
1299 1181 100       1205 if( scalar @$path == 1 ) {
1300 1114         762 $path = $path->[0];
1301 1114 100       1340 $debug and print "#$indent| -simple opt=$optional unrev @{[_dump($path)]}\n";
  7         10  
1302 1114         1094 $path = _unrev_path($path, _descend($ctx) );
1303 1114         1634 $result{_node_key($path->[0])} = $path;
1304             }
1305             else {
1306 67 100       114 $debug and print "#$indent| _do_reduce(@{[_dump($path)]})\n";
  1         2  
1307 67         84 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308             $path = [
1309             (
1310             ref($tail) eq 'HASH'
1311             ? _unrev_node($tail, _descend($ctx) )
1312             : _unrev_path($tail, _descend($ctx) )
1313             ),
1314 67 100       197 @{_unrev_path($common, _descend($ctx) )}
  67         88  
1315             ];
1316 67 100       172 $debug and print "#$indent| +reduced @{[_dump($path)]}\n";
  1         3  
1317 67         83 $result{_node_key($path->[0])} = $path;
1318             }
1319             }
1320 513         443 my $f;
1321 513         573 for $f( @$fail ) {
1322 219 100       272 $debug and print "#$indent| +fail @{[_dump($f)]}\n";
  3         6  
1323 219         257 $result{$f->[0]} = $f;
1324             }
1325 513 100       665 $debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
  5         9  
1326 513         1533 return ( [], \%result );
1327             }
1328              
1329             sub _scan_node {
1330 1019     1019   816 my( $node, $ctx ) = @_;
1331 1019         1111 my $indent = ' ' x $ctx->{depth};
1332 1019         757 my $debug = $ctx->{debug};
1333              
1334             # For all the paths in the node, reverse them. If the first token
1335             # of the path is a scalar, push it onto an array in a hash keyed by
1336             # the value of the scalar.
1337             #
1338             # If it is a node, call _reduce_node on this node beforehand. If we
1339             # get back a common head, all of the paths in the subnode shared a
1340             # common tail. We then store the common part and the remaining node
1341             # of paths (which is where the paths diverged from the end and install
1342             # this into the same hash. At this point both the common and the tail
1343             # are in reverse order, just as simple scalar paths are.
1344             #
1345             # On the other hand, if there were no common path returned then all
1346             # the paths of the sub-node diverge at the end character. In this
1347             # case the tail cannot participate in any further reductions and will
1348             # appear in forward order.
1349             #
1350             # certainly the hurgliest function in the whole file :(
1351              
1352             # $debug = 1 if $depth >= 8;
1353 1019         730 my @fail;
1354             my %reduce;
1355              
1356 0         0 my $n;
1357 1019         1585 for $n(
1358 2743         3943 map { substr($_, index($_, '#')+1) }
1359             sort
1360             map {
1361             join( '|' =>
1362 9029         9533 scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
  2743         2824  
1363             _node_offset($node->{$_}),
1364 2743         1858 scalar @{$node->{$_}},
  2743         7481  
1365             )
1366             . "#$_"
1367             }
1368             keys %$node ) {
1369 2743         1773 my( $end, @path ) = reverse @{$node->{$n}};
  2743         4631  
1370 2743 100       3429 if( ref($end) ne 'HASH' ) {
1371 2137 100       2597 $debug and print "# $indent|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
  87         167  
1372 2137         1375 push @{$reduce{$end}}, [ $end, @path ];
  2137         5573  
1373             }
1374             else {
1375 606 100       840 $debug and print "# $indent|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376 606         441 my $new_path;
1377             # deal with sing, singing => s(?:ing)?ing
1378 606 100 66     1501 if( keys %$end == 2 and exists $end->{''} ) {
1379 94         115 my ($key, $opt_path) = each %$end;
1380 94 100       164 ($key, $opt_path) = each %$end if $key eq '';
1381 94         73 $opt_path = [reverse @{$opt_path}];
  94         134  
1382 94 100       137 $debug and print "# $indent| check=", _dump($opt_path), "\n";
1383 94         259 my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384 94         81 my $head = [];
1385 94         111 my $path = [@path];
1386 94         118 ($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387 94 100       221 if( @$head ) {
1388 40         111 $new_path = [ @$head, $slide, @$path ];
1389             }
1390             }
1391 606 100       673 if( $new_path ) {
1392 40 100       58 $debug and print "# $indent|_scan_node slid=", _dump($new_path), "\n";
1393 40         24 push @{$reduce{$new_path->[0]}}, $new_path;
  40         80  
1394             }
1395             else {
1396 566         744 my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397 566 100       1044 if( not @$common ) {
1398 219 100       287 $debug and print "# $indent| +failed $n\n";
1399 219         488 push @fail, [reverse(@path), $tail];
1400             }
1401             else {
1402 347         473 my $path = [@path];
1403 347 100       501 $debug and print "# $indent|_scan_node ++recovered common=@{[_dump($common)]} tail=",
  34         45  
1404 34         45 _dump($tail), " path=@{[_dump($path)]}\n";
1405 347 100 100     1311 if( ref($tail) eq 'HASH'
1406             and keys %$tail == 2
1407             ) {
1408 287 100       462 if( exists $tail->{''} ) {
1409 121         184 ($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410             }
1411             }
1412 347 100       296 push @{$reduce{$common->[0]}}, [
  347         1622  
1413             @$common,
1414             (ref($tail) eq 'HASH' ? $tail : @$tail ),
1415             @$path
1416             ];
1417             }
1418             }
1419             }
1420             }
1421 1019 100       1841 $debug and print
1422 55         140 "# $indent|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
  55         162  
1423 1019         1566 return( \@fail, \%reduce );
1424             }
1425              
1426             sub _do_reduce {
1427 573     573   488 my ($path, $ctx) = @_;
1428 573         664 my $indent = ' ' x $ctx->{depth};
1429 573         442 my $debug = $ctx->{debug};
1430 573         1004 my $ra = Regexp::Assemble->new(chomp=>0);
1431 573         823 $ra->debug($debug);
1432 573 100       741 $debug and print "# $indent| do @{[_dump($path)]}\n";
  51         77  
1433 573         1189 $ra->_insertr( $_ ) for
1434             # When nodes come into the picture, we have to be careful
1435             # about how we insert the paths into the assembly.
1436             # Paths with nodes first, then closest node to front
1437             # then shortest path. Merely because if we can control
1438             # order in which paths containing nodes get inserted,
1439             # then we can make a couple of assumptions that simplify
1440             # the code in _insert_node.
1441             sort {
1442 6000         5442 scalar(grep {ref($_) eq 'HASH'} @$a)
1443 1113 50 100     1071 <=> scalar(grep {ref($_) eq 'HASH'} @$b)
  6578         6599  
1444             ||
1445             _node_offset($b) <=> _node_offset($a)
1446             ||
1447             scalar @$a <=> scalar @$b
1448             }
1449             @$path
1450             ;
1451 573         767 $path = $ra->_path;
1452 573         527 my $common = [];
1453 573         2467 push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454 573 100       902 my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455 573 100       752 $debug and print "# $indent| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
  51         76  
  51         61  
1456 573         1913 return ($common, $tail);
1457             }
1458              
1459             sub _node_offset {
1460             # return the offset that the first node is found, or -ve
1461             # optimised for speed
1462 4422     4422   2584 my $nr = @{$_[0]};
  4422         3550  
1463 4422         2803 my $atom = -1;
1464 4422   100     26670 ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465 3458         4677 return -1;
1466             }
1467              
1468             sub _slide_tail {
1469 240     240   17165 my $head = shift;
1470 240         181 my $tail = shift;
1471 240         179 my $path = shift;
1472 240         165 my $ctx = shift;
1473 240         291 my $indent = ' ' x $ctx->{depth};
1474 240         186 my $debug = $ctx->{debug};
1475 240 100       353 $debug and print "# $indent| slide in h=", _dump($head),
1476             ' t=', _dump($tail), ' p=', _dump($path), "\n";
1477 240         314 my $slide_path = (each %$tail)[-1];
1478 240 100       435 $slide_path = (each %$tail)[-1] unless defined $slide_path;
1479 240 100       365 $debug and print "# $indent| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480 240   100     829 while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481 154 100       237 $debug and print "# $indent| slide=tail=$slide_path->[0]\n";
1482 154         150 my $slide = shift @$path;
1483 154         114 shift @$slide_path;
1484 154         167 push @$slide_path, $slide;
1485 154         398 push @$head, $slide;
1486             }
1487 240 100       344 $debug and print "# $indent| slide path ", _dump($slide_path), "\n";
1488 240         318 my $slide_node = {
1489             '' => undef,
1490             _node_key($slide_path->[0]) => $slide_path,
1491             };
1492 240 100       411 $debug and print "# $indent| slide out h=", _dump($head),
1493             ' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494 240         444 return ($head, $slide_node, $path);
1495             }
1496              
1497             sub _unrev_path {
1498 2690     2690   1997 my ($path, $ctx) = @_;
1499 2690         2498 my $indent = ' ' x $ctx->{depth};
1500 2690         1887 my $debug = $ctx->{debug};
1501 2690         1549 my $new;
1502 2690 100       2187 if( not grep { ref($_) } @$path ) {
  6383         7591  
1503 2196 100       2575 $debug and print "# ${indent}_unrev path fast ", _dump($path);
1504 2196         2665 $new = [reverse @$path];
1505 2196 100       2582 $debug and print "# -> ", _dump($new), "\n";
1506 2196         2467 return $new;
1507             }
1508 494 100       666 $debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509 494         822 while( defined( my $p = pop @$path )) {
1510 1483 100       3184 push @$new,
    100          
1511             ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512             : ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513             : $p
1514             ;
1515             }
1516 494 100       613 $debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517 494         534 return $new;
1518             }
1519              
1520             sub _unrev_node {
1521 630     630   531 my ($node, $ctx ) = @_;
1522 630         630 my $indent = ' ' x $ctx->{depth};
1523 630         436 my $debug = $ctx->{debug};
1524 630         636 my $optional = _remove_optional($node);
1525 630 100       826 $debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526 630         421 my $new;
1527 630 100       845 $new->{''} = undef if $optional;
1528 630         403 my $n;
1529 630         909 for $n( keys %$node ) {
1530 1167         1213 my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531 1167         1751 $new->{_node_key($path->[0])} = $path;
1532             }
1533 630 100       885 $debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534 630         1766 return $new;
1535             }
1536              
1537             sub _node_key {
1538 4653     4653   3856 my $node = shift;
1539 4653 100       5684 return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540 4642 100       10996 return $node unless ref($node) eq 'HASH';
1541 205         162 my $key = '';
1542 205         135 my $k;
1543 205         318 for $k( keys %$node ) {
1544 452 100       553 next if $k eq '';
1545 374 100 100     987 $key = $k if $key eq '' or $key gt $k;
1546             }
1547 205         456 return $key;
1548             }
1549              
1550             sub _descend {
1551             # Take a context object, and increase the depth by one.
1552             # By creating a fresh hash each time, we don't have to
1553             # bother adding make-work code to decrease the depth
1554             # when we return from what we called.
1555 6190     6190   4064 my $ctx = shift;
1556 6190         15825 return {%$ctx, depth => $ctx->{depth}+1};
1557             }
1558              
1559             #####################################################################
1560              
1561             sub _make_class {
1562 650     650   536 my $self = shift;
1563 650         585 my %set = map { ($_,1) } @_;
  1675         2287  
1564 650 100       1107 delete $set{'\\d'} if exists $set{'\\w'};
1565 650 100       856 delete $set{'\\D'} if exists $set{'\\W'};
1566             return '.' if exists $set{'.'}
1567             or ($self->{fold_meta_pairs} and (
1568             (exists $set{'\\d'} and exists $set{'\\D'})
1569             or (exists $set{'\\s'} and exists $set{'\\S'})
1570 650 100 66     3377 or (exists $set{'\\w'} and exists $set{'\\W'})
      66        
      66        
1571             ))
1572             ;
1573 632         655 for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574 3792 100       4570 if( exists $set{$meta} ) {
1575 28         155 my $re = qr/$meta/;
1576 28         21 my @delete;
1577 28   66     326 $_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578 28 100       79 delete @set{@delete} if @delete;
1579             }
1580             }
1581 632 100       937 return (keys %set)[0] if keys %set == 1;
1582 625         594 for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583 6875 100       8503 exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584             }
1585 625 100       811 my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
  20         36  
1586 625 100       667 my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
  7         10  
1587 625         1461 my $class = join( '' => sort keys %set );
1588 625 100 100     1240 $class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589 622         2828 return "[$dash$class$caret]";
1590             }
1591              
1592             sub _re_sort {
1593 1009   100 1009   3481 return length $b <=> length $a || $a cmp $b
1594             }
1595              
1596             sub _combine {
1597 140     140   104 my $self = shift;
1598 140         107 my $type = shift;
1599             # print "c in = @{[_dump(\@_)]}\n";
1600             # my $combine =
1601             return '('
1602             . $type
1603 140         118 . do {
1604 140         91 my( @short, @long );
1605 140 100       160 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  377         1441  
1606 140 100       228 if( @short == 1 ) {
    100          
1607 31         59 @long = sort _re_sort @long, @short;
1608             }
1609             elsif( @short > 1 ) {
1610             # yucky but true
1611 77         96 my @combine = (_make_class($self, @short), sort _re_sort @long);
1612 77         112 @long = @combine;
1613             }
1614             else {
1615 32         60 @long = sort _re_sort @long;
1616             }
1617 140         326 join( '|', @long );
1618             }
1619             . ')';
1620             # print "combine <$combine>\n";
1621             # $combine;
1622             }
1623              
1624             sub _combine_new {
1625 1738     1738   1333 my $self = shift;
1626 1738         1098 my( @short, @long );
1627 1738 100       1881 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  3244         13702  
1628 1738 100 100     5895 if( @short == 1 and @long == 0 ) {
    100 100        
1629 365         1384 return $short[0];
1630             }
1631             elsif( @short > 1 and @short == @_ ) {
1632 494         642 return _make_class($self, @short);
1633             }
1634             else {
1635 879 100       3457 return '(?:'
1636             . join( '|' =>
1637             @short > 1
1638             ? ( _make_class($self, @short), sort _re_sort @long)
1639             : ( (sort _re_sort( @long )), @short )
1640             )
1641             . ')';
1642             }
1643             }
1644              
1645             sub _re_path {
1646 4772     4772   3214 my $self = shift;
1647             # in shorter assemblies, _re_path() is the second hottest
1648             # routine. after insert(), so make it fast.
1649              
1650 4772 100       6182 if ($self->{unroll_plus}) {
1651             # but we can't easily make this blockless
1652 72         55 my @arr = @{$_[0]};
  72         118  
1653 72         52 my $str = '';
1654 72         55 my $skip = 0;
1655 72         104 for my $i (0..$#arr) {
1656 127 100 100     983 if (ref($arr[$i]) eq 'ARRAY') {
    100          
    100          
    100          
1657 1         3 $str .= _re_path($self, $arr[$i]);
1658             }
1659             elsif (ref($arr[$i]) eq 'HASH') {
1660             $str .= exists $arr[$i]->{''}
1661             ? _combine_new( $self,
1662 7         19 map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
  14         18  
  7         15  
1663             ) . '?'
1664 28 100       41 : _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
  42         60  
  21         44  
1665             ;
1666             }
1667             elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668 7 50       25 $str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669 7         14 ++$skip;
1670             }
1671             elsif ($skip) {
1672 7         10 $skip = 0;
1673             }
1674             else {
1675 84         119 $str .= $arr[$i];
1676             }
1677             }
1678 72         166 return $str;
1679             }
1680              
1681 4700 50       3620 return join( '', @_ ) unless grep { length ref $_ } @_;
  4700         7211  
1682 4700         2932 my $p;
1683             return join '', map {
1684             ref($_) eq '' ? $_
1685 9506 100       16153 : ref($_) eq 'HASH' ? do {
    100          
1686             # In the case of a node, see whether there's a '' which
1687             # indicates that the whole thing is optional and thus
1688             # requires a trailing ?
1689             # Unroll the two different paths to avoid the needless
1690             # grep when it isn't necessary.
1691 1710         1090 $p = $_;
1692             exists $_->{''}
1693             ? _combine_new( $self,
1694 801         984 map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
  1508         1946  
1695             ) . '?'
1696 1710 100       3241 : _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
  2394         2710  
1697             }
1698             : _re_path($self, $_) # ref($_) eq 'ARRAY'
1699 4700         2888 } @{$_[0]}
  4700         4167  
1700             }
1701              
1702             sub _lookahead {
1703 132     132   101 my $in = shift;
1704 132         90 my %head;
1705             my $path;
1706 132         171 for $path( keys %$in ) {
1707 328 100       431 next unless defined $in->{$path};
1708             # print "look $path: ", ref($in->{$path}[0]), ".\n";
1709 267 100       396 if( ref($in->{$path}[0]) eq 'HASH' ) {
    100          
1710 15         15 my $next = 0;
1711 15   100     36 while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
  16         60  
1712 11 100       20 if( exists $in->{$path}[$next]{''} ) {
1713 5         12 ++$head{$in->{$path}[$next+1]};
1714             }
1715 11         22 ++$next;
1716             }
1717 15         30 my $inner = _lookahead( $in->{$path}[0] );
1718 15         46 @head{ keys %$inner } = (values %$inner);
1719             }
1720             elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721 2         3 my $subpath = $in->{$path}[0];
1722 2         6 for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723 3 100       5 if( ref($subpath->[$sp]) eq 'HASH' ) {
1724 2         3 my $follow = _lookahead( $subpath->[$sp] );
1725 2         5 @head{ keys %$follow } = (values %$follow);
1726 2 100       7 last unless exists $subpath->[$sp]{''};
1727             }
1728             else {
1729 1         2 ++$head{$subpath->[$sp]};
1730 1         2 last;
1731             }
1732             }
1733             }
1734             else {
1735 250         306 ++$head{ $in->{$path}[0] };
1736             }
1737             }
1738             # print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739 132         184 return \%head;
1740             }
1741              
1742             sub _re_path_lookahead {
1743 265     265   181 my $self = shift;
1744 265         165 my $in = shift;
1745             # print "_re_path_la in ", _dump($in), "\n";
1746 265         172 my $out = '';
1747 265         359 for( my $p = 0; $p < @$in; ++$p ) {
1748 573 100       674 if( ref($in->[$p]) eq '' ) {
    100          
1749 462         332 $out .= $in->[$p];
1750 462         610 next;
1751             }
1752             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753 2         3 $out .= _re_path_lookahead($self, $in->[$p]);
1754 2         4 next;
1755             }
1756             # print "$p ", _dump($in->[$p]), "\n";
1757             my $path = [
1758 228         278 map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759 275         284 grep { $_ ne '' }
1760 109         84 keys %{$in->[$p]}
  109         182  
1761             ];
1762 109         176 my $ahead = _lookahead($in->[$p]);
1763 109         86 my $more = 0;
1764 109 100 100     257 if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765 12         11 my $next = 1;
1766 12         30 while( $p + $next < @$in ) {
1767 14 100       25 if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768 2         4 my $follow = _lookahead( $in->[$p+$next] );
1769 2         6 @{$ahead}{ keys %$follow } = (values %$follow);
  2         5  
1770             }
1771             else {
1772 12         17 ++$ahead->{$in->[$p+$next]};
1773 12         12 last;
1774             }
1775 2         4 ++$next;
1776             }
1777 12         9 $more = 1;
1778             }
1779 109         99 my $nr_one = grep { /^$Single_Char$/ } @$path;
  228         891  
1780 109         87 my $nr = @$path;
1781 109 100 100     216 if( $nr_one > 1 and $nr_one == $nr ) {
1782 18         29 $out .= _make_class($self, @$path);
1783 18 100       66 $out .= '?' if exists $in->[$p]{''};
1784             }
1785             else {
1786             my $zwla = keys(%$ahead) > 1
1787 91 100       188 ? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
  191         152  
  191         212  
1788             : '';
1789 91 100       165 my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790             # print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791 91 100       117 if( exists $in->[$p]{''} ) {
1792 44 100       166 $out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793             }
1794             else {
1795 47         157 $out .= "$zwla$patt";
1796             }
1797             }
1798             }
1799 265         397 return $out;
1800             }
1801              
1802             sub _re_path_track {
1803 33     33   24 my $self = shift;
1804 33         23 my $in = shift;
1805 33         25 my $normal = shift;
1806 33         23 my $augmented = shift;
1807 33         22 my $o;
1808 33         19 my $simple = '';
1809 33         28 my $augment = '';
1810 33         48 for( my $n = 0; $n < @$in; ++$n ) {
1811 114 100       126 if( ref($in->[$n]) eq '' ) {
1812 104         73 $o = $in->[$n];
1813 104         68 $simple .= $o;
1814 104         65 $augment .= $o;
1815 104 100 100     555 if( (
      66        
      100        
1816             $n < @$in - 1
1817             and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818             )
1819             or $n == @$in - 1
1820             ) {
1821 24         20 push @{$self->{mlist}}, $normal . $simple ;
  24         42  
1822 24 50       64 $augment .= $] < 5.009005
1823             ? "(?{\$self->{m}=$self->{mcount}})"
1824             : "(?{$self->{mcount}})"
1825             ;
1826 24         40 ++$self->{mcount};
1827             }
1828             }
1829             else {
1830             my $path = [
1831 25         64 map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832 26         39 grep { $_ ne '' }
1833 10         9 keys %{$in->[$n]}
  10         23  
1834             ];
1835 10         50 $o = '(?:' . join( '|' => sort _re_sort @$path ) . ')';
1836 10 100       17 $o .= '?' if exists $in->[$n]{''};
1837 10         11 $simple .= $o;
1838 10         24 $augment .= $o;
1839             }
1840             }
1841 33         65 return $augment;
1842             }
1843              
1844             sub _re_path_pretty {
1845 411     411   267 my $self = shift;
1846 411         253 my $in = shift;
1847 411         252 my $arg = shift;
1848 411         454 my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849 411         384 my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850 411         251 my $out = '';
1851 411         262 $arg->{depth}++;
1852 411         291 my $prev_was_paren = 0;
1853 411         530 for( my $p = 0; $p < @$in; ++$p ) {
1854 1084 100       1129 if( ref($in->[$p]) eq '' ) {
    100          
1855 910 100       953 $out .= "\n$pre" if $prev_was_paren;
1856 910         636 $out .= $in->[$p];
1857 910         1137 $prev_was_paren = 0;
1858             }
1859             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860 3         6 $out .= _re_path($self, $in->[$p]);
1861             }
1862             else {
1863             my $path = [
1864 369         448 map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865 419         409 grep { $_ ne '' }
1866 171         128 keys %{$in->[$p]}
  171         276  
1867             ];
1868 171         174 my $nr = @$path;
1869 171         110 my( @short, @long );
1870 171 100       180 push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
  369         1549  
1871 171 100       202 if( @short == $nr ) {
1872 37 100       64 $out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873 37 100       122 $out .= '?' if exists $in->[$p]{''};
1874             }
1875             else {
1876 134 100       200 $out .= "\n" if length $out;
1877 134 100       163 $out .= $pre if $p;
1878 134         138 $out .= "(?:\n$indent";
1879 134 100       134 if( @short < 2 ) {
1880 133         93 my $r = 0;
1881             $out .= join( "\n$indent|" => map {
1882 133 100       278 $r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
  298         412  
1883 298         410 $_
1884             }
1885             sort _re_sort @$path
1886             );
1887             }
1888             else {
1889 1         6 $out .= join( "\n$indent|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890             }
1891 134         163 $out .= "\n$pre)";
1892 134 100       153 if( exists $in->[$p]{''} ) {
1893 37         46 $out .= "\n$pre?";
1894 37         82 $prev_was_paren = 0;
1895             }
1896             else {
1897 97         237 $prev_was_paren = 1;
1898             }
1899             }
1900             }
1901             }
1902 411         283 $arg->{depth}--;
1903 411         653 return $out;
1904             }
1905              
1906             sub _node_eq {
1907 425 100 66 425   950 return 0 if not defined $_[0] or not defined $_[1];
1908 422 100       703 return 0 if ref $_[0] ne ref $_[1];
1909             # Now that we have determined that the reference of each
1910             # argument are the same, we only have to test the first
1911             # one, which gives us a nice micro-optimisation.
1912 381 100       501 if( ref($_[0]) eq 'HASH' ) {
    100          
1913 305         388 keys %{$_[0]} == keys %{$_[1]}
  305         585  
1914             and
1915             # does this short-circuit to avoid _re_path() cost more than it saves?
1916 305 100 100     199 join( '|' => sort keys %{$_[0]}) eq join( '|' => sort keys %{$_[1]})
  272         563  
  272         1114  
1917             and
1918             _re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919             }
1920             elsif( ref($_[0]) eq 'ARRAY' ) {
1921 9 100       6 scalar @{$_[0]} == scalar @{$_[1]}
  9         10  
  9         25  
1922             and
1923             _re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924             }
1925             else {
1926 67         183 $_[0] eq $_[1];
1927             }
1928             }
1929              
1930             sub _pretty_dump {
1931 7     7   19 return sprintf "\\x%02x", ord(shift);
1932             }
1933              
1934             sub _dump {
1935 5579     5579   4217 my $path = shift;
1936 5579 100       7376 return _dump_node($path) if ref($path) eq 'HASH';
1937 5057         3507 my $dump = '[';
1938 5057         2715 my $d;
1939 5057         3209 my $nr = 0;
1940 5057         4163 for $d( @$path ) {
1941 11036 100       12707 $dump .= ' ' if $nr++;
1942 11036 100       16790 if( ref($d) eq 'HASH' ) {
    100          
    100          
1943 1340         1324 $dump .= _dump_node($d);
1944             }
1945             elsif( ref($d) eq 'ARRAY' ) {
1946 242         242 $dump .= _dump($d);
1947             }
1948             elsif( defined $d ) {
1949             # D::C indicates the second test is redundant
1950             # $dump .= ( $d =~ /\s/ or not length $d )
1951 9453 100       18142 $dump .= (
    100          
1952             $d =~ /\s/ ? qq{'$d'} :
1953             $d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954             $d
1955             );
1956             }
1957             else {
1958 1         2 $dump .= '*';
1959             }
1960             }
1961 5057         16911 return $dump . ']';
1962             }
1963              
1964             sub _dump_node {
1965 1862     1862   1307 my $node = shift;
1966 1862         1290 my $dump = '{';
1967 1862         1153 my $nr = 0;
1968 1862         1126 my $n;
1969 1862         3372 for $n (sort keys %$node) {
1970 3899 100       4746 $dump .= ' ' if $nr++;
1971             # Devel::Cover shows this to test to be redundant
1972             # $dump .= ( $n eq '' and not defined $node->{$n} )
1973             $dump .= $n eq ''
1974             ? '*'
1975             : ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976 3899 100       8440 . "=>" . _dump($node->{$n})
    100          
1977             ;
1978             }
1979 1862         6620 return $dump . '}';
1980             }
1981              
1982             =pod
1983              
1984             =head1 NAME
1985              
1986             Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987              
1988             =head1 SYNOPSIS
1989              
1990             use Regexp::Assemble;
1991              
1992             my $ra = Regexp::Assemble->new;
1993             $ra->add( 'ab+c' );
1994             $ra->add( 'ab+-' );
1995             $ra->add( 'a\w\d+' );
1996             $ra->add( 'a\d+' );
1997             print $ra->re; # prints a(?:\w?\d+|b+[-c])
1998              
1999             =head1 DESCRIPTION
2000              
2001             Regexp::Assemble takes an arbitrary number of regular expressions
2002             and assembles them into a single regular expression (or RE) that
2003             matches all that the individual REs match.
2004              
2005             As a result, instead of having a large list of expressions to loop
2006             over, a target string only needs to be tested against one expression.
2007             This is interesting when you have several thousand patterns to deal
2008             with. Serious effort is made to produce the smallest pattern possible.
2009              
2010             It is also possible to track the original patterns, so that you can
2011             determine which, among the source patterns that form the assembled
2012             pattern, was the one that caused the match to occur.
2013              
2014             You should realise that large numbers of alternations are processed
2015             in perl's regular expression engine in O(n) time, not O(1). If you
2016             are still having performance problems, you should look at using a
2017             trie. Note that Perl's own regular expression engine will implement
2018             trie optimisations in perl 5.10 (they are already available in
2019             perl 5.9.3 if you want to try them out). C will
2020             do the right thing when it knows it's running on a trie'd perl.
2021             (At least in some version after this one).
2022              
2023             Some more examples of usage appear in the accompanying README. If
2024             that file is not easy to access locally, you can find it on a web
2025             repository such as
2026             L or
2027             L.
2028              
2029             See also L.
2030              
2031             =head1 Methods
2032              
2033             =head2 add(LIST)
2034              
2035             Takes a string, breaks it apart into a set of tokens (respecting
2036             meta characters) and inserts the resulting list into the C
2037             object. It uses a naive regular expression to lex the string
2038             that may be fooled complex expressions (specifically, it will
2039             fail to lex nested parenthetical expressions such as
2040             C correctly). If this is the case, the end of
2041             the string will not be tokenised correctly and returned as one
2042             long string.
2043              
2044             On the one hand, this may indicate that the patterns you are
2045             trying to feed the C object are too complex. Simpler
2046             patterns might allow the algorithm to work more effectively and
2047             perform more reductions in the resulting pattern.
2048              
2049             On the other hand, you can supply your own pattern to perform the
2050             lexing if you need. The test suite contains an example of a lexer
2051             pattern that will match one level of nested parentheses.
2052              
2053             Note that there is an internal optimisation that will bypass a
2054             much of the lexing process. If a string contains no C<\>
2055             (backslash), C<[> (open square bracket), C<(> (open paren),
2056             C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057             curly), a character split will be performed directly.
2058              
2059             A list of strings may be supplied, thus you can pass it a file
2060             handle of a file opened for reading:
2061              
2062             $re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063             $re->add( );
2064              
2065             If the file is very large, it may be more efficient to use a
2066             C loop, to read the file line-by-line:
2067              
2068             $re->add($_) while ;
2069              
2070             The C method will chomp the lines automatically. If you
2071             do not want this to occur (you want to keep the record
2072             separator), then disable Cing.
2073              
2074             $re->chomp(0);
2075             $re->add($_) while ;
2076              
2077             This method is chainable.
2078              
2079             =head2 add_file(FILENAME [...])
2080              
2081             Takes a list of file names. Each file is opened and read
2082             line by line. Each line is added to the assembly.
2083              
2084             $r->add_file( 'file.1', 'file.2' );
2085              
2086             If a file cannot be opened, the method will croak. If you cannot
2087             afford to let this happen then you should wrap the call in a C
2088             block.
2089              
2090             Chomping happens automatically unless you the C method
2091             to disable it. By default, input lines are read according to the
2092             value of the C attribute (if defined), and
2093             will otherwise fall back to the current setting of the system C<$/>
2094             variable. The record separator may also be specified on each
2095             call to C. Internally, the routine Cises the
2096             value of C<$/> to whatever is required, for the duration of the
2097             call.
2098              
2099             An alternate calling mechanism using a hash reference is
2100             available. The recognised keys are:
2101              
2102             =over 4
2103              
2104             =item file
2105              
2106             Reference to a list of file names, or the name of a single
2107             file.
2108              
2109             $r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110             $r->add_file({file => 'file.n'});
2111              
2112             =item input_record_separator
2113              
2114             If present, indicates what constitutes a line
2115              
2116             $r->add_file({file => 'data.txt', input_record_separator => ':' });
2117              
2118             =item rs
2119              
2120             An alias for input_record_separator (mnemonic: same as the
2121             English variable names).
2122              
2123             =back
2124              
2125             $r->add_file( {
2126             file => [ 'pattern.txt', 'more.txt' ],
2127             input_record_separator => "\r\n",
2128             });
2129              
2130             =head2 clone()
2131              
2132             Clones the contents of a Regexp::Assemble object and creates a new
2133             object (in other words it performs a deep copy).
2134              
2135             If the Storable module is installed, its dclone method will be used,
2136             otherwise the cloning will be performed using a pure perl approach.
2137              
2138             You can use this method to take a snapshot of the patterns that have
2139             been added so far to an object, and generate an assembly from the
2140             clone. Additional patterns may to be added to the original object
2141             afterwards.
2142              
2143             my $re = $main->clone->re();
2144             $main->add( 'another-pattern-\\d+' );
2145              
2146             =head2 insert(LIST)
2147              
2148             Takes a list of tokens representing a regular expression and
2149             stores them in the object. Note: you should not pass it a bare
2150             regular expression, such as C. You must pass it as
2151             a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152              
2153             This method is chainable, I:
2154              
2155             my $ra = Regexp::Assemble->new
2156             ->insert( qw[ a b+ c? d* e ] )
2157             ->insert( qw[ a c+ d+ e* f ] );
2158              
2159             Lexing complex patterns with metacharacters and so on can consume
2160             a significant proportion of the overall time to build an assembly.
2161             If you have the information available in a tokenised form, calling
2162             C directly can be a big win.
2163              
2164             =head2 lexstr
2165              
2166             Use the C method if you are curious to see how a pattern
2167             gets tokenised. It takes a scalar on input, representing a pattern,
2168             and returns a reference to an array, containing the tokenised
2169             pattern. You can recover the original pattern by performing a
2170             C:
2171              
2172             my @token = $re->lexstr($pattern);
2173             my $new_pattern = join( '', @token );
2174              
2175             If the original pattern contains unnecessary backslashes, or C<\x4b>
2176             escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177             may not be identical.
2178              
2179             Call C does not add the pattern to the object, it is merely
2180             for exploratory purposes. It will, however, update various statistical
2181             counters.
2182              
2183             =head2 pre_filter(CODE)
2184              
2185             Allows you to install a callback to check that the pattern being
2186             loaded contains valid input. It receives the pattern as a whole to
2187             be added, before it been tokenised by the lexer. It may to return
2188             0 or C to indicate that the pattern should not be added, any
2189             true value indicates that the contents are fine.
2190              
2191             A filter to strip out trailing comments (marked by #):
2192              
2193             $re->pre_filter( sub { $_[0] =~ s/\s*#.*$//; 1 } );
2194              
2195             A filter to ignore blank lines:
2196              
2197             $re->pre_filter( sub { length(shift) } );
2198              
2199             If you want to remove the filter, pass C as a parameter.
2200              
2201             $ra->pre_filter(undef);
2202              
2203             This method is chainable.
2204              
2205             =head2 filter(CODE)
2206              
2207             Allows you to install a callback to check that the pattern being
2208             loaded contains valid input. It receives a list on input, after it
2209             has been tokenised by the lexer. It may to return 0 or undef to
2210             indicate that the pattern should not be added, any true value
2211             indicates that the contents are fine.
2212              
2213             If you know that all patterns you expect to assemble contain
2214             a restricted set of of tokens (e.g. no spaces), you could do
2215             the following:
2216              
2217             $ra->filter(sub { not grep { / / } @_ });
2218              
2219             or
2220              
2221             sub only_spaces_and_digits {
2222             not grep { ![\d ] } @_
2223             }
2224             $ra->filter( \&only_spaces_and_digits );
2225              
2226             These two examples will silently ignore faulty patterns, If you
2227             want the user to be made aware of the problem you should raise an
2228             error (via C or C), log an error message, whatever is
2229             best. If you want to remove a filter, pass C as a parameter.
2230              
2231             $ra->filter(undef);
2232              
2233             This method is chainable.
2234              
2235             =head2 as_string
2236              
2237             Assemble the expression and return it as a string. You may want to do
2238             this if you are writing the pattern to a file. The following arguments
2239             can be passed to control the aspect of the resulting pattern:
2240              
2241             B, the number of spaces used to indent nested grouping of
2242             a pattern. Use this to produce a pretty-printed pattern (for some
2243             definition of "pretty"). The resulting output is rather verbose. The
2244             reason is to ensure that the metacharacters C<(?:> and C<)> always
2245             occur on otherwise empty lines. This allows you grep the result for an
2246             even more synthetic view of the pattern:
2247              
2248             egrep -v '^ *[()]'
2249              
2250             The result of the above is quite readable. Remember to backslash the
2251             spaces appearing in your own patterns if you wish to use an indented
2252             pattern in an C construct. Indenting is ignored if tracking
2253             is enabled.
2254              
2255             The B argument takes precedence over the C
2256             method/attribute of the object.
2257              
2258             Calling this
2259             method will drain the internal data structure. Large numbers of patterns
2260             can eat a significant amount of memory, and this lets perl recover the
2261             memory used for other purposes.
2262              
2263             If you want to reduce the pattern I continue to add new patterns,
2264             clone the object and reduce the clone, leaving the original object intact.
2265              
2266             =head2 re
2267              
2268             Assembles the pattern and return it as a compiled RE, using the
2269             C operator.
2270              
2271             As with C, calling this method will reset the internal data
2272             structures to free the memory used in assembling the RE.
2273              
2274             The B attribute, documented in the C method, can be
2275             used here (it will be ignored if tracking is enabled).
2276              
2277             With method chaining, it is possible to produce a RE without having
2278             a temporary C object lying around, I:
2279              
2280             my $re = Regexp::Assemble->new
2281             ->add( q[ab+cd+e] )
2282             ->add( q[ac\\d+e] )
2283             ->add( q[c\\d+e] )
2284             ->re;
2285              
2286             The C<$re> variable now contains a Regexp object that can be used
2287             directly:
2288              
2289             while( <> ) {
2290             /$re/ and print "Something in [$_] matched\n";
2291             )
2292              
2293             The C method is called when the object is used in string context
2294             (hence, within an C operator), so by and large you do not even
2295             need to save the RE in a separate variable. The following will work
2296             as expected:
2297              
2298             my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299             while( ) {
2300             if( /($re)/ ) {
2301             print "Here be giants: $1\n";
2302             }
2303             }
2304              
2305             This approach does not work with tracked patterns. The
2306             C and C methods must be used instead, see below.
2307              
2308             =head2 match(SCALAR)
2309              
2310             The following information applies to Perl 5.8 and below. See
2311             the section that follows for information on Perl 5.10.
2312              
2313             If pattern tracking is in use, you must C in order
2314             to make things work correctly. At a minimum, this will make your
2315             code look like this:
2316              
2317             my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318             if( $did_match ) {
2319             print "matched ", $ra->matched, "\n";
2320             }
2321              
2322             (The main reason is that the C<$^R> variable is currently broken
2323             and an ugly workaround that runs some Perl code during the match
2324             is required, in order to simulate what C<$^R> should be doing. See
2325             Perl bug #32840 for more information if you are curious. The README
2326             also contains more information). This bug has been fixed in 5.10.
2327              
2328             The important thing to note is that with C, THERE
2329             ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330             is this: if you do not have strict control over the patterns being
2331             fed to C when tracking is enabled, and someone
2332             slips you a pattern such as C and you
2333             attempt to match a string against the resulting pattern, you will
2334             know Fear and Loathing.
2335              
2336             What is more, the C<$^R> workaround means that that tracking does
2337             not work if you perform a bare C pattern match as shown
2338             above. You have to instead call the C method, in order to
2339             supply the necessary context to take care of the tracking housekeeping
2340             details.
2341              
2342             if( defined( my $match = $ra->match($_)) ) {
2343             print " $_ matched by $match\n";
2344             }
2345              
2346             In the case of a successful match, the original matched pattern
2347             is returned directly. The matched pattern will also be available
2348             through the C method.
2349              
2350             (Except that the above is not true for 5.6.0: the C method
2351             returns true or undef, and the C method always returns
2352             undef).
2353              
2354             If you are capturing parts of the pattern I C
2355             you will want to get at the captures. See the C, C,
2356             C and C methods. If you are not using captures
2357             then you may safely ignore this section.
2358              
2359             In 5.10, since the bug concerning C<$^R> has been resolved, there
2360             is no need to use C and the assembled pattern does
2361             not require any Perl code to be executed during the match.
2362              
2363             =head2 new()
2364              
2365             Creates a new C object. The following optional
2366             key/value parameters may be employed. All keys have a corresponding
2367             method that can be used to change the behaviour later on. As a
2368             general rule, especially if you're just starting out, you don't
2369             have to bother with any of these.
2370              
2371             B, a family of optional attributes that allow anchors
2372             (C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373              
2374             B, sets the C flags to add to the assembled regular
2375             expression. Warning: no error checking is done, you should ensure
2376             that the flags you pass are understood by the version of Perl you
2377             are using. B exists as an alias, for users familiar
2378             with L.
2379              
2380             B, controls whether the pattern should be chomped before being
2381             lexed. Handy if you are reading patterns from a file. By default,
2382             Cing is performed (this behaviour changed as of version 0.24,
2383             prior versions did not chomp automatically).
2384             See also the C attribute and the C method.
2385              
2386             B, slurp the contents of the specified file and add them
2387             to the assembly. Multiple files may be processed by using a list.
2388              
2389             my $r = Regexp::Assemble->new(file => 're.list');
2390              
2391             my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392              
2393             If you really don't want chomping to occur, you will have to set
2394             the C attribute to 0 (zero). You may also want to look at
2395             the C attribute, as well.
2396              
2397             B, controls what constitutes a record
2398             separator when using the C attribute or the C
2399             method. May be abbreviated to B. See the C<$/> variable in
2400             L.
2401              
2402             B, controls whether the pattern should contain zero-width
2403             lookahead assertions (For instance: (?=[abc])(?:bob|alice|charles).
2404             This is not activated by default, because in many circumstances the
2405             cost of processing the assertion itself outweighs the benefit of
2406             its faculty for short-circuiting a match that will fail. This is
2407             sensitive to the probability of a match succeeding, so if you're
2408             worried about performance you'll have to benchmark a sample population
2409             of targets to see which way the benefits lie.
2410              
2411             B, controls whether you want know which of the initial
2412             patterns was the one that matched. See the C method for
2413             more details. Note for version 5.8 of Perl and below, in this mode
2414             of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415             this entails. Perl 5.10 does not suffer from any such restriction.
2416              
2417             B, the number of spaces used to indent nested grouping of
2418             a pattern. Use this to produce a pretty-printed pattern. See the
2419             C method for a more detailed explanation.
2420              
2421             B, allows you to add a callback to enable sanity checks
2422             on the pattern being loaded. This callback is triggered before the
2423             pattern is split apart by the lexer. In other words, it operates
2424             on the entire pattern. If you are loading patterns from a file,
2425             this would be an appropriate place to remove comments.
2426              
2427             B, allows you to add a callback to enable sanity checks on
2428             the pattern being loaded. This callback is triggered after the
2429             pattern has been split apart by the lexer.
2430              
2431             B, controls whether to unroll, for example, C into
2432             C, C, which may allow additional reductions in the
2433             resulting assembled pattern.
2434              
2435             B, controls whether tail reduction occurs or not. If set,
2436             patterns like C will be reduced to C.
2437             That is, the end of the pattern in each part of the b... and d...
2438             alternations is identical, and hence is hoisted out of the alternation
2439             and placed after it. On by default. Turn it off if you're really
2440             pressed for short assembly times.
2441              
2442             B, specifies the pattern used to lex the input lines into
2443             tokens. You could replace the default pattern by a more sophisticated
2444             version that matches arbitrarily nested parentheses, for example.
2445              
2446             B, controls whether copious amounts of output is produced
2447             during the loading stage or the reducing stage of assembly.
2448              
2449             my $ra = Regexp::Assemble->new;
2450             my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451              
2452             B, controls whether new patterns can be added to the object
2453             after the assembled pattern is generated. DEPRECATED.
2454              
2455             This method/attribute will be removed in a future release. It doesn't
2456             really serve any purpose, and may be more effectively replaced by
2457             cloning an existing C object and spinning out a
2458             pattern from that instead.
2459              
2460             =head2 source()
2461              
2462             When using tracked mode, after a successful match is made, returns
2463             the original source pattern that caused the match. In Perl 5.10,
2464             the C<$^R> variable can be used to as an index to fetch the correct
2465             pattern from the object.
2466              
2467             If no successful match has been performed, or the object is not in
2468             tracked mode, this method returns C.
2469              
2470             my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471              
2472             for my $w (qw(this food is rather barren)) {
2473             if ($w =~ /$r/) {
2474             print "$w matched by ", $r->source($^R), $/;
2475             }
2476             else {
2477             print "$w no match\n";
2478             }
2479             }
2480              
2481             =head2 mbegin()
2482              
2483             This method returns a copy of C<@-> at the moment of the
2484             last match. You should ordinarily not need to bother with
2485             this, C should be able to supply all your needs.
2486              
2487             =head2 mend()
2488              
2489             This method returns a copy of C<@+> at the moment of the
2490             last match.
2491              
2492             =head2 mvar(NUMBER)
2493              
2494             The C method returns the captures of the last match.
2495             C corresponds to $1, C to $2, and so on.
2496             C happens to return the target string matched,
2497             as a byproduct of walking down the C<@-> and C<@+> arrays
2498             after the match.
2499              
2500             If called without a parameter, C will return a
2501             reference to an array containing all captures.
2502              
2503             =head2 capture
2504              
2505             The C method returns the the captures of the last
2506             match as an array. Unlink C, this method does not
2507             include the matched string. It is equivalent to getting an
2508             array back that contains C<$1, $2, $3, ...>.
2509              
2510             If no captures were found in the match, an empty array is
2511             returned, rather than C. You are therefore guaranteed
2512             to be able to use C<< for my $c ($re->capture) { ... >>
2513             without have to check whether anything was captured.
2514              
2515             =head2 matched()
2516              
2517             If pattern tracking has been set, via the C attribute,
2518             or through the C method, this method will return the
2519             original pattern of the last successful match. Returns undef
2520             match has yet been performed, or tracking has not been enabled.
2521              
2522             See below in the NOTES section for additional subtleties of
2523             which you should be aware of when tracking patterns.
2524              
2525             Note that this method is not available in 5.6.0, due to
2526             limitations in the implementation of C<(?{...})> at the time.
2527              
2528             =head2 Statistics/Reporting routines
2529              
2530             =head2 stats_add
2531              
2532             Returns the number of patterns added to the assembly (whether
2533             by C or C). Duplicate patterns are not included
2534             in this total.
2535              
2536             =head2 stats_dup
2537              
2538             Returns the number of duplicate patterns added to the assembly.
2539             If non-zero, this may be a sign that something is wrong with
2540             your data (or at the least, some needless redundancy). This may
2541             occur when you have two patterns (for instance, C and
2542             C) which map to the same result.
2543              
2544             =head2 stats_raw()
2545              
2546             Returns the raw number of bytes in the patterns added to the
2547             assembly. This includes both original and duplicate patterns.
2548             For instance, adding the two patterns C and C will
2549             count as 4 bytes.
2550              
2551             =head2 stats_cooked()
2552              
2553             Return the true number of bytes added to the assembly. This
2554             will not include duplicate patterns. Furthermore, it may differ
2555             from the raw bytes due to quotemeta treatment. For instance,
2556             C will count as 7 (not 8) bytes, because C<\,> will
2557             be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558             after the quotemeta directives are processed, C will be
2559             stored, for a total of 4 bytes.
2560              
2561             =head2 stats_length()
2562              
2563             Returns the length of the resulting assembled expression.
2564             Until C or C have been called, the length
2565             will be 0 (since the assembly will have not yet been
2566             performed). The length includes only the pattern, not the
2567             additional (C<(?-xism...>) fluff added by the compilation.
2568              
2569             =head2 dup_warn(NUMBER|CODEREF)
2570              
2571             Turns warnings about duplicate patterns on or off. By
2572             default, no warnings are emitted. If the method is
2573             called with no parameters, or a true parameter,
2574             the object will carp about patterns it has
2575             already seen. To turn off the warnings, use 0 as a
2576             parameter.
2577              
2578             $r->dup_warn();
2579              
2580             The method may also be passed a code block. In this case
2581             the code will be executed and it will receive a reference
2582             to the object in question, and the lexed pattern.
2583              
2584             $r->dup_warn(
2585             sub {
2586             my $self = shift;
2587             print $self->stats_add, " patterns added at line $.\n",
2588             join( '', @_ ), " added previously\n";
2589             }
2590             )
2591              
2592             =head2 Anchor routines
2593              
2594             Suppose you wish to assemble a series of patterns that all begin
2595             with C<^> and end with C<$> (anchor pattern to the beginning and
2596             end of line). Rather than add the anchors to each and every pattern
2597             (and possibly forget to do so when a new entry is added), you may
2598             specify the anchors in the object, and they will appear in the
2599             resulting pattern, and you no longer need to (or should) put them
2600             in your source patterns. For example, the two following snippets
2601             will produce identical patterns:
2602              
2603             $r->add(qw(^this ^that ^them))->as_string;
2604              
2605             $r->add(qw(this that them))->anchor_line_begin->as_string;
2606              
2607             # both techniques will produce ^th(?:at|em|is)
2608              
2609             All anchors are possible word (C<\b>) boundaries, line
2610             boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611             and C<\Z> (or C<\z> if you absolutely need it)).
2612              
2613             The shortcut C> implies both
2614             C_begin> C_end>
2615             is also available. If different anchors are specified
2616             the most specific anchor wins. For instance, if both
2617             C and C are
2618             specified, C takes precedence.
2619              
2620             All the anchor methods are chainable.
2621              
2622             =head2 anchor_word_begin
2623              
2624             The resulting pattern will be prefixed with a C<\b>
2625             word boundary assertion when the value is true. Set
2626             to 0 to disable.
2627              
2628             $r->add('pre')->anchor_word_begin->as_string;
2629             # produces '\bpre'
2630              
2631             =head2 anchor_word_end
2632              
2633             The resulting pattern will be suffixed with a C<\b>
2634             word boundary assertion when the value is true. Set
2635             to 0 to disable.
2636              
2637             $r->add(qw(ing tion))
2638             ->anchor_word_end
2639             ->as_string; # produces '(?:tion|ing)\b'
2640              
2641             =head2 anchor_word
2642              
2643             The resulting pattern will be have C<\b>
2644             word boundary assertions at the beginning and end
2645             of the pattern when the value is true. Set
2646             to 0 to disable.
2647              
2648             $r->add(qw(cat carrot)
2649             ->anchor_word(1)
2650             ->as_string; # produces '\bca(?:rro)t\b'
2651              
2652             =head2 anchor_line_begin
2653              
2654             The resulting pattern will be prefixed with a C<^>
2655             line boundary assertion when the value is true. Set
2656             to 0 to disable.
2657              
2658             $r->anchor_line_begin;
2659             # or
2660             $r->anchor_line_begin(1);
2661              
2662             =head2 anchor_line_end
2663              
2664             The resulting pattern will be suffixed with a C<$>
2665             line boundary assertion when the value is true. Set
2666             to 0 to disable.
2667              
2668             # turn it off
2669             $r->anchor_line_end(0);
2670              
2671             =head2 anchor_line
2672              
2673             The resulting pattern will be have the C<^> and C<$>
2674             line boundary assertions at the beginning and end
2675             of the pattern, respectively, when the value is true. Set
2676             to 0 to disable.
2677              
2678             $r->add(qw(cat carrot)
2679             ->anchor_line
2680             ->as_string; # produces '^ca(?:rro)t$'
2681              
2682             =head2 anchor_string_begin
2683              
2684             The resulting pattern will be prefixed with a C<\A>
2685             string boundary assertion when the value is true. Set
2686             to 0 to disable.
2687              
2688             $r->anchor_string_begin(1);
2689              
2690             =head2 anchor_string_end
2691              
2692             The resulting pattern will be suffixed with a C<\Z>
2693             string boundary assertion when the value is true. Set
2694             to 0 to disable.
2695              
2696             # disable the string boundary end anchor
2697             $r->anchor_string_end(0);
2698              
2699             =head2 anchor_string_end_absolute
2700              
2701             The resulting pattern will be suffixed with a C<\z>
2702             string boundary assertion when the value is true. Set
2703             to 0 to disable.
2704              
2705             # disable the string boundary absolute end anchor
2706             $r->anchor_string_end_absolute(0);
2707              
2708             If you don't understand the difference between
2709             C<\Z> and C<\z>, the former will probably do what
2710             you want.
2711              
2712             =head2 anchor_string
2713              
2714             The resulting pattern will be have the C<\A> and C<\Z>
2715             string boundary assertions at the beginning and end
2716             of the pattern, respectively, when the value is true. Set
2717             to 0 to disable.
2718              
2719             $r->add(qw(cat carrot)
2720             ->anchor_string
2721             ->as_string; # produces '\Aca(?:rro)t\Z'
2722              
2723             =head2 anchor_string_absolute
2724              
2725             The resulting pattern will be have the C<\A> and C<\z>
2726             string boundary assertions at the beginning and end
2727             of the pattern, respectively, when the value is true. Set
2728             to 0 to disable.
2729              
2730             $r->add(qw(cat carrot)
2731             ->anchor_string_absolute
2732             ->as_string; # produces '\Aca(?:rro)t\z'
2733              
2734             =head2 debug(NUMBER)
2735              
2736             Turns debugging on or off. Statements are printed
2737             to the currently selected file handle (STDOUT by default).
2738             If you are already using this handle, you will have to
2739             arrange to select an output handle to a file of your own
2740             choosing, before call the C, C or C)
2741             functions, otherwise it will scribble all over your
2742             carefully formatted output.
2743              
2744             =over 4
2745              
2746             =item * 0
2747              
2748             Off. Turns off all debugging output.
2749              
2750             =item * 1
2751              
2752             Add. Trace the addition of patterns.
2753              
2754             =item * 2
2755              
2756             Reduce. Trace the process of reduction and assembly.
2757              
2758             =item * 4
2759              
2760             Lex. Trace the lexing of the input patterns into its constituent
2761             tokens.
2762              
2763             =item * 8
2764              
2765             Time. Print to STDOUT the time taken to load all the patterns. This is
2766             nothing more than the difference between the time the object was
2767             instantiated and the time reduction was initiated.
2768              
2769             # load=
2770              
2771             Any lengthy computation performed in the client code will be reflected
2772             in this value. Another line will be printed after reduction is
2773             complete.
2774              
2775             # reduce=
2776              
2777             The above output lines will be changed to C and
2778             C if the internal state of the object is corrupted
2779             and the initial timestamp is lost.
2780              
2781             The code attempts to load L in order to report fractional
2782             seconds. If this is not successful, the elapsed time is displayed
2783             in whole seconds.
2784              
2785             =back
2786              
2787             Values can be added (or or'ed together) to trace everything
2788              
2789             $r->debug(7)->add( '\\d+abc' );
2790              
2791             Calling C with no arguments turns debugging off.
2792              
2793             =head2 dump()
2794              
2795             Produces a synthetic view of the internal data structure. How
2796             to interpret the results is left as an exercise to the reader.
2797              
2798             print $r->dump;
2799              
2800             =head2 chomp(0|1)
2801              
2802             Turns chomping on or off.
2803              
2804             IMPORTANT: As of version 0.24, chomping is now on by default as it
2805             makes C Just Work. The only time you may run into trouble
2806             is with C. So don't do that, or else explicitly turn
2807             off chomping.
2808              
2809             To avoid incorporating (spurious)
2810             record separators (such as "\n" on Unix) when reading from a file,
2811             C Cs its input. If you don't want this to happen,
2812             call C with a false value.
2813              
2814             $re->chomp(0); # really want the record separators
2815             $re->add();
2816              
2817             =head2 fold_meta_pairs(NUMBER)
2818              
2819             Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820             are folded into a C<.> (dot). Folding happens by default (for
2821             reasons of backwards compatibility, even though it is wrong when
2822             the C expression modifier is active).
2823              
2824             Call this method with a false value to prevent this behaviour (which
2825             is only a problem when dealing with C<\n> if the C expression
2826             modifier is also set).
2827              
2828             $re->add( '\\w', '\\W' );
2829             my $clone = $re->clone;
2830              
2831             $clone->fold_meta_pairs(0);
2832             print $clone->as_string; # prints '.'
2833             print $re->as_string; # print '[\W\w]'
2834              
2835             =head2 indent(NUMBER)
2836              
2837             Sets the level of indent for pretty-printing nested groups
2838             within a pattern. See the C method for more details.
2839             When called without a parameter, no indenting is performed.
2840              
2841             $re->indent( 4 );
2842             print $re->as_string;
2843              
2844             =head2 lookahead(0|1)
2845              
2846             Turns on zero-width lookahead assertions. This is usually
2847             beneficial when you expect that the pattern will usually fail.
2848             If you expect that the pattern will usually match you will
2849             probably be worse off.
2850              
2851             =head2 flags(STRING)
2852              
2853             Sets the flags that govern how the pattern behaves (for
2854             versions of Perl up to 5.9 or so, these are C). By
2855             default no flags are enabled.
2856              
2857             =head2 modifiers(STRING)
2858              
2859             An alias of the C method, for users familiar with
2860             C.
2861              
2862             =head2 track(0|1)
2863              
2864             Turns tracking on or off. When this attribute is enabled,
2865             additional housekeeping information is inserted into the
2866             assembled expression using C<({...}> embedded code
2867             constructs. This provides the necessary information to
2868             determine which, of the original patterns added, was the
2869             one that caused the match.
2870              
2871             $re->track( 1 );
2872             if( $target =~ /$re/ ) {
2873             print "$target matched by ", $re->matched, "\n";
2874             }
2875              
2876             Note that when this functionality is enabled, no
2877             reduction is performed and no character classes are
2878             generated. In other words, C is not
2879             reduced down to C<(?:br|t)ag> and C is not
2880             reduced to C.
2881              
2882             =head2 unroll_plus(0|1)
2883              
2884             Turns the unrolling of plus metacharacters on or off. When
2885             a pattern is broken up, C becomes C, C (and
2886             C becomes C, C. This may allow the freed C
2887             to assemble with other patterns. Not enabled by default.
2888              
2889             =head2 lex(SCALAR)
2890              
2891             Change the pattern used to break a string apart into tokens.
2892             You can examine the C script as a starting point.
2893              
2894             =head2 reduce(0|1)
2895              
2896             Turns pattern reduction on or off. A reduced pattern may
2897             be considerably shorter than an unreduced pattern. Consider
2898             C I C. An unreduced
2899             pattern will be very similar to those produced by
2900             C. Reduction is on by default. Turning
2901             it off speeds assembly (but assembly is pretty fast -- it's
2902             the breaking up of the initial patterns in the lexing stage
2903             that can consume a non-negligible amount of time).
2904              
2905             =head2 mutable(0|1)
2906              
2907             This method has been marked as DEPRECATED. It will be removed
2908             in a future release. See the C method for a technique
2909             to replace its functionality.
2910              
2911             =head2 reset()
2912              
2913             Empties out the patterns that have been Ced or C-ed
2914             into the object. Does not modify the state of controller attributes
2915             such as C, C, C and the like.
2916              
2917             =head2 Default_Lexer
2918              
2919             B the C function is a class method, not
2920             an object method. It is a fatal error to call it as an object
2921             method.
2922              
2923             The C method lets you replace the default pattern
2924             used for all subsequently created C objects. It
2925             will not have any effect on existing objects. (It is also possible
2926             to override the lexer pattern used on a per-object basis).
2927              
2928             The parameter should be an ordinary scalar, not a compiled
2929             pattern. If the pattern fails to match all parts of the string,
2930             the missing parts will be returned as single chunks. Therefore
2931             the following pattern is legal (albeit rather cork-brained):
2932              
2933             Regexp::Assemble::Default_Lexer( '\\d' );
2934              
2935             The above pattern will split up input strings digit by digit, and
2936             all non-digit characters as single chunks.
2937              
2938             =head1 DIAGNOSTICS
2939              
2940             "Cannot pass a C to Default_Lexer"
2941              
2942             You tried to replace the default lexer pattern with an object
2943             instead of a scalar. Solution: You probably tried to call
2944             C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945             C.
2946              
2947             "filter method not passed a coderef"
2948              
2949             "pre_filter method not passed a coderef"
2950              
2951             A reference to a subroutine (anonymous or otherwise) was expected.
2952             Solution: read the documentation for the C method.
2953              
2954             "duplicate pattern added: /.../"
2955              
2956             The C attribute is active, and a duplicate pattern was
2957             added (well duh!). Solution: clean your data.
2958              
2959             "cannot open [file] for input: [reason]"
2960              
2961             The C method was unable to open the specified file for
2962             whatever reason. Solution: make sure the file exists and the script
2963             has the required privileges to read it.
2964              
2965             =head1 NOTES
2966              
2967             This module has been tested successfully with a range of versions
2968             of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969              
2970             The expressions produced by this module can be used with the PCRE
2971             library.
2972              
2973             Remember to "double up" your backslashes if the patterns are
2974             hard-coded as constants in your program. That is, you should
2975             literally C rather than C. It
2976             usually will work either way, but it's good practice to do so.
2977              
2978             Where possible, supply the simplest tokens possible. Don't add
2979             C when C will do. The reason is that
2980             if you also add C the resulting assembly changes
2981             dramatically: C I
2982             C. Since R::A doesn't perform enough analysis,
2983             it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984             the divergence after the first C<-d\d+>.
2985              
2986             Furthermore, when the string 'X-123000P' is matched against the
2987             first assembly, the regexp engine will have to backtrack over each
2988             alternation (the one that ends in Y B the one that ends in Z)
2989             before determining that there is no match. No such backtracking
2990             occurs in the second pattern: as soon as the engine encounters the
2991             'P' in the target string, neither of the alternations at that point
2992             (C<-\d+Y> or C) could succeed and so the match fails.
2993              
2994             C does, however, know how to build character
2995             classes. Given C, C and C, it will assemble these
2996             into C. When C<-> (dash) appears as a candidate for a
2997             character class it will be the first character in the class. When
2998             C<^> (circumflex) appears as a candidate for a character class it
2999             will be the last character in the class.
3000              
3001             It also knows about meta-characters than can "absorb" regular
3002             characters. For instance, given C and C, it knows that
3003             C<5> can be represented by C<\d> and so the assembly is just C.
3004             The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005             and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006             C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007             if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008              
3009             C deals correctly with C's propensity
3010             to backslash many characters that have no need to be. Backslashes on
3011             non-metacharacters will be removed. Similarly, in character classes,
3012             a number of characters lose their magic and so no longer need to be
3013             backslashed within a character class. Two common examples are C<.>
3014             (dot) and C<$>. Such characters will lose their backslash.
3015              
3016             At the same time, it will also process C<\Q...\E> sequences. When
3017             such a sequence is encountered, the inner section is extracted and
3018             C is applied to the section. The resulting quoted text
3019             is then used in place of the original unquoted text, and the C<\Q>
3020             and C<\E> metacharacters are thrown away. Similar processing occurs
3021             with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022             effects when using a dispatch table. In this case, you will need
3023             to know exactly what the module makes of your input. Use the C
3024             method to find out what's going on:
3025              
3026             $pattern = join( '', @{$re->lexstr($pattern)} );
3027              
3028             If all the digits 0..9 appear in a character class, C
3029             will replace them by C<\d>. I'd do it for letters as well, but
3030             thinking about accented characters and other glyphs hurts my head.
3031              
3032             In an alternation, the longest paths are chosen first (for example,
3033             C). When two paths have the same length, the path
3034             with the most subpaths will appear first. This aims to put the
3035             "busiest" paths to the front of the alternation. For example, the
3036             list C, C, C, C and C will produce the
3037             pattern C<(?:f(?:ew|ig|un)|b(?:ad|it))>. See F for a
3038             real-world example of how alternations are sorted. Once you have
3039             looked at that, everything should be crystal clear.
3040              
3041             When tracking is in use, no reduction is performed. nor are
3042             character classes formed. The reason is that it is
3043             too difficult to determine the original pattern afterwards. Consider the
3044             two patterns C and C. These should be reduced to
3045             C. The final character matches one of two possibilities.
3046             To resolve whether it matched an C<'e'> or C<'m'> would require
3047             keeping track of the fact that the pattern finished up in a character
3048             class, which would the require a whole lot more work to figure out
3049             which character of the class matched. Without character classes
3050             it becomes much easier. Instead, C is produced, which
3051             lets us find out more simply where we ended up.
3052              
3053             Similarly, C and C should form C<(?:dog|sea)food>.
3054             When the pattern is being assembled, the tracking decision needs
3055             to be made at the end of the grouping, but the tail of the pattern
3056             has not yet been visited. Deferring things to make this work correctly
3057             is a vast hassle. In this case, the pattern becomes merely
3058             C<(?:dogfood|seafood>. Tracked patterns will therefore be bulkier than
3059             simple patterns.
3060              
3061             There is an open bug on this issue:
3062              
3063             L
3064              
3065             If this bug is ever resolved, tracking would become much easier to
3066             deal with (none of the C hassle would be required - you could
3067             just match like a regular RE and it would Just Work).
3068              
3069             =head1 SEE ALSO
3070              
3071             =over 4
3072              
3073             =item L
3074              
3075             General information about Perl's regular expressions.
3076              
3077             =item L
3078              
3079             Specific information about C.
3080              
3081             =item Regex::PreSuf
3082              
3083             C takes a string and chops it itself into tokens of
3084             length 1. Since it can't deal with tokens of more than one character,
3085             it can't deal with meta-characters and thus no regular expressions.
3086             Which is the main reason why I wrote this module.
3087              
3088             =item Regexp::Optimizer
3089              
3090             C produces regular expressions that are similar to
3091             those produced by R::A with reductions switched off. It's biggest
3092             drawback is that it is exponentially slower than Regexp::Assemble on
3093             very large sets of patterns.
3094              
3095             =item Regexp::Parser
3096              
3097             Fine grained analysis of regular expressions.
3098              
3099             =item Regexp::Trie
3100              
3101             Funnily enough, this was my working name for C
3102             during its development. I changed the name because I thought it
3103             was too obscure. Anyway, C does much the same as
3104             C and C except that it runs
3105             much faster (according to the author). It does not recognise
3106             meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107              
3108             =item Text::Trie
3109              
3110             C is well worth investigating. Tries can outperform very
3111             bushy (read: many alternations) patterns.
3112              
3113             =item Tree::Trie
3114              
3115             C is another module that builds tries. The algorithm that
3116             C uses appears to be quite similar to the
3117             algorithm described therein, except that C solves its
3118             end-marker problem without having to rewrite the leaves.
3119              
3120             =back
3121              
3122             =head1 See Also
3123              
3124             For alternatives to this module, consider one of:
3125              
3126             =over 4
3127              
3128             =item o L
3129              
3130             =item o L
3131              
3132             =item o L
3133              
3134             =back
3135              
3136             =head1 LIMITATIONS
3137              
3138             Some mildly complex cases are not handled well. See examples/failure.01.pl
3139             and L.
3140              
3141             See also L for a discussion
3142             of some of the issues arising with the use of a huge number of alterations. Thanx to
3143             Slaven Rezic for the details of trie 'v' non-trie operations within Perl which influence
3144             regexp handling of alternations.
3145              
3146             does not attempt to find common substrings. For
3147             instance, it will not collapse C down to C.
3148             If there's a module out there that performs this sort of string
3149             analysis I'd like to know about it. But keep in mind that the
3150             algorithms that do this are very expensive: quadratic or worse.
3151              
3152             C does not interpret meta-character modifiers.
3153             For instance, if the following two patterns are
3154             given: C and C, it will not determine that C<\d> can be
3155             matched by C<\d+>. Instead, it will produce C. Along
3156             a similar line of reasoning, it will not determine that C and
3157             C is equivalent to C (It will produce C
3158             instead).
3159              
3160             You cannot remove a pattern that has been added to an object. You'll
3161             just have to start over again. Adding a pattern is difficult enough,
3162             I'd need a solid argument to convince me to add a C method.
3163             If you need to do this you should read the documentation for the
3164             C method.
3165              
3166             C does not (yet)? employ the C<(?E...)>
3167             construct.
3168              
3169             The module does not produce POSIX-style regular expressions. This
3170             would be quite easy to add, if there was a demand for it.
3171              
3172             =head1 BUGS
3173              
3174             Patterns that generate look-ahead assertions sometimes produce
3175             incorrect patterns in certain obscure corner cases. If you
3176             suspect that this is occurring in your pattern, disable
3177             lookaheads.
3178              
3179             Tracking doesn't really work at all with 5.6.0. It works better
3180             in subsequent 5.6 releases. For maximum reliability, the use of
3181             a 5.8 release is strongly recommended. Tracking barely works with
3182             5.005_04. Of note, using C<\d>-style meta-characters invariably
3183             causes panics. Tracking really comes into its own in Perl 5.10.
3184              
3185             If you feed C patterns with nested parentheses,
3186             there is a chance that the resulting pattern will be uncompilable
3187             due to mismatched parentheses (not enough closing parentheses). This
3188             is normal, so long as the default lexer pattern is used. If you want
3189             to find out which pattern among a list of 3000 patterns are to blame
3190             (speaking from experience here), the F script offers
3191             a strategy for pinpointing the pattern at fault. While you may not
3192             be able to use the script directly, the general approach is easy to
3193             implement.
3194              
3195             The algorithm used to assemble the regular expressions makes extensive
3196             use of mutually-recursive functions (that is, A calls B, B calls
3197             A, ...) For deeply similar expressions, it may be possible to provoke
3198             "Deep recursion" warnings.
3199              
3200             The module has been tested extensively, and has an extensive test
3201             suite (that achieves close to 100% statement coverage), but you
3202             never know... A bug may manifest itself in two ways: creating a
3203             pattern that cannot be compiled, such as C, or a pattern
3204             that compiles correctly but that either matches things it shouldn't,
3205             or doesn't match things it should. It is assumed that Such problems
3206             will occur when the reduction algorithm encounters some sort of
3207             edge case. A temporary work-around is to disable reductions:
3208              
3209             my $pattern = $assembler->reduce(0)->re;
3210              
3211             A discussion about implementation details and where bugs might lurk
3212             appears in the README file. If this file is not available locally,
3213             you should be able to find a copy on the Web at your nearest CPAN
3214             mirror.
3215              
3216             Seriously, though, a number of people have been using this module to
3217             create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3218             be working according to spec. Thus, I don't think there are any serious
3219             bugs remaining.
3220              
3221             If you are feeling brave, extensive debugging traces are available to
3222             figure out where assembly goes wrong.
3223              
3224             Please report all bugs at
3225             L
3226              
3227             Make sure you include the output from the following two commands:
3228              
3229             perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3230             perl -V
3231              
3232             There is a mailing list for the discussion of C.
3233             Subscription details are available at
3234             L.
3235              
3236             =head1 ACKNOWLEDGEMENTS
3237              
3238             This module grew out of work I did building access maps for Postfix,
3239             a modern SMTP mail transfer agent. See L
3240             for more information. I used Perl to build large regular expressions
3241             for blocking dynamic/residential IP addresses to cut down on spam
3242             and viruses. Once I had the code running for this, it was easy to
3243             start adding stuff to block really blatant spam subject lines, bogus
3244             HELO strings, spammer mailer-ids and more...
3245              
3246             I presented the work at the French Perl Workshop in 2004, and the
3247             thing most people asked was whether the underlying mechanism for
3248             assembling the REs was available as a module. At that time it was
3249             nothing more that a twisty maze of scripts, all different. The
3250             interest shown indicated that a module was called for. I'd like to
3251             thank the people who showed interest. Hey, it's going to make I
3252             messy scripts smaller, in any case.
3253              
3254             Thomas Drugeon was a valuable sounding board for trying out
3255             early ideas. Jean Forget and Philippe Blayo looked over an early
3256             version. H.Merijn Brandt stopped over in Paris one evening, and
3257             discussed things over a few beers.
3258              
3259             Nicholas Clark pointed out that while what this module does
3260             (?:c|sh)ould be done in perl's core, as per the 2004 TODO, he
3261             encouraged me to continue with the development of this module. In
3262             any event, this module allows one to gauge the difficulty of
3263             undertaking the endeavour in C. I'd rather gouge my eyes out with
3264             a blunt pencil.
3265              
3266             Paul Johnson settled the question as to whether this module should
3267             live in the Regex:: namespace, or Regexp:: namespace. If you're
3268             not convinced, try running the following one-liner:
3269              
3270             perl -le 'print ref qr//'
3271              
3272             Philippe Bruhat found a couple of corner cases where this module
3273             could produce incorrect results. Such feedback is invaluable,
3274             and only improves the module's quality.
3275              
3276             =head1 Machine-Readable Change Log
3277              
3278             The file Changes was converted into Changelog.ini by L.
3279              
3280             =head1 AUTHOR
3281              
3282             David Landgren
3283              
3284             Copyright (C) 2004-2011. All rights reserved.
3285              
3286             http://www.landgren.net/perl/
3287              
3288             If you use this module, I'd love to hear about what you're using
3289             it for. If you want to be informed of updates, send me a note.
3290              
3291             Ron Savage is co-maint of the module, starting with V 0.36.
3292              
3293             =head1 Repository
3294              
3295             L
3296              
3297             =head1 TODO
3298              
3299             1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3300             produces (?:conten[dt]|resend[dt]) but it is possible to produce
3301             (?:cont|res)en[dt] if one can spot the common tail nodes (and walk back
3302             the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3303              
3304             To do this requires a certain amount of restructuring of the code.
3305             Currently, the algorithm uses a two-phase approach. In the first
3306             phase, the trie is traversed and reductions are performed. In the
3307             second phase, the reduced trie is traversed and the pattern is
3308             emitted.
3309              
3310             What has to occur is that the reduction and emission have to occur
3311             together. As a node is completed, it is replaced by its string
3312             representation. This then allows child nodes to be compared for
3313             equality with a simple 'eq'. Since there is only a single traversal,
3314             the overall generation time might drop, even though the context
3315             baggage required to delve through the tree will be more expensive
3316             to carry along (a hash rather than a couple of scalars).
3317              
3318             Actually, a simpler approach is to take on a secret sentinel
3319             atom at the end of every pattern, which gives the reduction
3320             algorithm sufficient traction to create a perfect trie.
3321              
3322             I'm rewriting the reduction code using this technique.
3323              
3324             2. Investigate how (?>foo) works. Can it be applied?
3325              
3326             5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3327              
3328             6. Store callbacks per tracked pattern.
3329              
3330             12. utf-8... hmmmm...
3331              
3332             14. Adding qr//'ed patterns. For example, consider
3333             $r->add ( qr/^abc/i )
3334             ->add( qr/^abd/ )
3335             ->add( qr/^ab e/x );
3336             this should admit abc abC aBc aBC abd abe as matches
3337              
3338             16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3339             can't happen. (Possibly carp if it does appear during traversal)?
3340              
3341             17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3342             along the lines of (.{4))(\1+) would let the regexp engine
3343             itself be brought to bear on the matter, which is a rather
3344             appealing idea. Consider
3345              
3346             while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3347              
3348             as a starting point.
3349              
3350             19. The reduction code has become unbelievably baroque. Adding code
3351             to handle (sting,singing,sing) => s(?:(?:ing)?|t)ing was far
3352             too difficult. Adding more stuff just breaks existing behaviour.
3353             And fixing the ^abcd$ ... bug broke stuff all over again.
3354             Now that the corner cases are more clearly identified, a full
3355             rewrite of the reduction code is needed. And would admit the
3356             possibility of implementing items 1 and 17.
3357              
3358             20. Handle debug unrev with a separate bit
3359              
3360             23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3361             regexp
3362              
3363             24. Lookahead assertions contain serious bugs (as shown by
3364             assembling powersets. Need to save more context during reduction,
3365             which in turn will simplify the preparation of the lookahead
3366             classes. See also 19.
3367              
3368             26. _lex() swamps the overall run-time. It stems from the decision
3369             to use a single regexp to pull apart any pattern. A suite of
3370             simpler regexp to pick of parens, char classes, quantifiers
3371             and bare tokens may be faster. (This has been implemented as
3372             _fastlex(), but it's only marginally faster. Perhaps split-by-
3373             char and lex a la C?
3374              
3375             27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3376              
3377             28. We don't reroll unrolled a a* to a+ in indented or tracked
3378             output
3379              
3380             29. Use (*MARK n) in blead for tracked patterns, and use (*FAIL) for
3381             the unmatchable pattern.
3382              
3383             =head1 LICENSE
3384              
3385             This library is free software; you can redistribute it and/or modify
3386             it under the same terms as Perl itself.
3387              
3388             =cut
3389              
3390             # Return a +ve value to tell Perl the module is ready to go.
3391              
3392             'The Lusty Decadent Delights of Imperial Pompeii';