File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
Criterion Covered Total %
statement 381 502 75.9
branch 160 276 57.9
condition 88 141 62.4
subroutine 22 26 84.6
pod 3 14 21.4
total 654 959 68.2


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Copyright 2006 Apache Software Foundation
3             #
4             # Licensed under the Apache License, Version 2.0 (the "License");
5             # you may not use this file except in compliance with the License.
6             # You may obtain a copy of the License at
7             #
8             # http://www.apache.org/licenses/LICENSE-2.0
9             #
10             # Unless required by applicable law or agreed to in writing, software
11             # distributed under the License is distributed on an "AS IS" BASIS,
12             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13             # See the License for the specific language governing permissions and
14             # limitations under the License.
15             # </@LICENSE>
16              
17             =head1 NAME
18              
19             Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20              
21             =head1 SYNOPSIS
22              
23             This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24             suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25              
26             =cut
27              
28             package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
29              
30 1     1   9 use Mail::SpamAssassin::Plugin;
  1         6  
  1         29  
31 1     1   6 use Mail::SpamAssassin::Logger;
  1         2  
  1         66  
32 1     1   9 use Mail::SpamAssassin::Util qw(untaint_var);
  1         2  
  1         51  
33 1     1   354 use Mail::SpamAssassin::Util::Progress;
  1         3  
  1         48  
34              
35 1     1   10 use Errno qw(ENOENT EACCES EEXIST);
  1         2  
  1         120  
36 1     1   7 use Data::Dumper;
  1         3  
  1         70  
37              
38 1     1   7 use strict;
  1         1  
  1         19  
39 1     1   5 use warnings;
  1         1  
  1         24  
40             # use bytes;
41 1     1   5 use re 'taint';
  1         2  
  1         55  
42              
43             our @ISA = qw(Mail::SpamAssassin::Plugin);
44              
45 1     1   6 use constant DEBUG_RE_PARSING => 0; # noisy!
  1         2  
  1         5410  
46              
47             # a few settings that control what kind of bases are output.
48              
49             # treat all rules as lowercase for purposes of term extraction?
50             # $main->{bases_must_be_casei} = 1;
51             # $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
52             # $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
53             # $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
54             # $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
55             # $main->{base_quiet} = 0; # silences progress output
56              
57             # TODO: it would be nice to have a clean API to pass such settings
58             # through to plugins instead of hanging them off $main
59              
60             ##############################################################################
61              
62             # testing purposes only
63             my $fixup_re_test;
64             #$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;
65             #$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
66             #$fixup_re_test = 1; fixup_re("\\33\$b"); die;
67             #$fixup_re_test = 1; fixup_re("[link]"); die;
68             #$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
69              
70             ###########################################################################
71              
72             sub new {
73 15     15 1 115 my $class = shift;
74 15         97 my $mailsaobject = shift;
75 15   33     135 $class = ref($class) || $class;
76 15         89 my $self = $class->SUPER::new($mailsaobject);
77 15         51 bless ($self, $class);
78              
79 15         118 $self->{show_progress} = !$mailsaobject->{base_quiet};
80              
81             # $self->test(); exit;
82 15         180 return $self;
83             }
84              
85             ###########################################################################
86              
87             sub finish_parsing_end {
88 15     15 1 53 my ($self, $params) = @_;
89 15         40 my $conf = $params->{conf};
90 15         49 $self->extract_bases($conf);
91             }
92              
93             sub extract_bases {
94 15     15 0 32 my ($self, $conf) = @_;
95              
96 15         31 my $main = $conf->{main};
97 15 50       44 if (!$main->{base_extract}) { return; }
  0         0  
98              
99             $self->{show_progress} and
100 15 50       50 info("base extraction starting. this can take a while...");
101              
102 15         46 $self->extract_set($conf, $conf->{body_tests}, 'body');
103             }
104              
105             sub extract_set {
106 15     15 0 41 my ($self, $conf, $test_set, $ruletype) = @_;
107              
108 15         24 foreach my $pri (keys %{$test_set}) {
  15         57  
109 15         28 my $nicepri = $pri; $nicepri =~ s/-/neg/g;
  15         103  
110 15         100 $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
111             }
112             }
113              
114             ###########################################################################
115              
116             sub extract_set_pri {
117 15     15 0 49 my ($self, $conf, $rules, $ruletype) = @_;
118              
119 15         36 my @good_bases;
120             my @failed;
121 15         37 my $yes = 0;
122 15         32 my $no = 0;
123 15         23 my $count = 0;
124 15         60 my $start = time;
125 15         33 $self->{main} = $conf->{main}; # for use in extract_hints()
126 15 50       49 $self->{show_progress} and info ("extracting from rules of type $ruletype");
127 15         37 my $tflags = $conf->{tflags};
128              
129             # attempt to find good "base strings" (simplified regexp subsets) for each
130             # regexp. We try looking at the regexp from both ends, since there
131             # may be a good long string of text at the end of the rule.
132              
133             # require this many chars in a base string + delimiters for it to be viable
134 15         28 my $min_chars = 5;
135              
136 15         19 my $progress;
137             $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
138 15 50 0     44 total => (scalar keys %{$rules} || 1),
139             itemtype => 'rules',
140             });
141              
142 15         33 my $cached = { };
143 15         34 my $cachefile;
144              
145 15 50       42 if ($self->{main}->{bases_cache_dir}) {
146 0         0 $cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
147 0         0 dbg("zoom: reading cache file $cachefile");
148 0         0 $cached = $self->read_cachefile($cachefile);
149             }
150              
151             NEXT_RULE:
152 15         36 foreach my $name (keys %{$rules}) {
  15         59  
153 42 50 33     320 $self->{show_progress} and $progress and $progress->update(++$count);
154              
155 42         181 my $rule = $rules->{$name};
156 42         179 my $cachekey = join "#", $name, $rule;
157              
158 42         139 my $cent = $cached->{rule_bases}->{$cachekey};
159 42 50       126 if (defined $cent) {
160 0 0       0 if (defined $cent->{g}) {
161 0         0 dbg("zoom: YES (cached) $rule $name");
162 0         0 foreach my $ent (@{$cent->{g}}) {
  0         0  
163             # note: we have to copy these, since otherwise later
164             # modifications corrupt the cached data
165             push @good_bases, {
166             base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
167 0         0 };
168             }
169 0         0 $yes++;
170             }
171             else {
172 0         0 dbg("zoom: NO (cached) $rule $name");
173 0         0 push @failed, { orig => $rule }; # no need to cache this
174 0         0 $no++;
175             }
176 0         0 next NEXT_RULE;
177             }
178              
179             # ignore ReplaceTags rules
180 42         121 my $is_a_replacetags_rule = $conf->{rules_to_replace}->{$name};
181 42         80 my ($minlen, $lossy, @bases);
182              
183 42 50       146 if (!$is_a_replacetags_rule) {
184             eval { # catch die()s
185 42         446 my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
186 41         205 ($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
187             # dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
188 40         554 1;
189 42 100       87 } or do {
190 2 50       23 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  2         24  
191 2         57 dbg("zoom: giving up on regexp: $eval_stat");
192             };
193              
194 42 50 50     903 if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {
      66        
195 0         0 warn "\nzoom: rule $name will loop on SpamAssassin older than 3.3.2 ".
196             "running under Perl 5.12 or older, Bug 6558\n";
197             }
198              
199             # if any of the extracted hints in a set are too short, the entire
200             # set is invalid; this is because each set of N hints represents just
201             # 1 regexp.
202 42         339 foreach my $str (@bases) {
203 135         656 my $len = length fixup_re($str); # bug 6143: count decoded characters
204 135 100 100     766 if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
  3 100       44  
  3         34  
  3         23  
205 62         178 elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
206             }
207             }
208              
209 42 100 66     1036 if ($is_a_replacetags_rule || !$minlen || !@bases) {
      66        
210 5 50       138 dbg("zoom: ignoring rule %s, %s", $name,
    50          
211             $is_a_replacetags_rule ? 'is a replace rule'
212             : !@bases ? 'no bases' : 'no minlen');
213 5         80 push @failed, { orig => $rule };
214 5         67 $cached->{rule_bases}->{$cachekey} = { };
215 5         43 $no++;
216             }
217             else {
218             # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
219              
220             # figure out if we have e.g. ["foo", "foob", "foobar"]; in this
221             # case, we only need to track ["foo"].
222 37         120 my %subsumed;
223 37         100 foreach my $base1 (@bases) {
224 132         302 foreach my $base2 (@bases) {
225 2524 100 100     19328 if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
226 4         83 $subsumed{$base1} = 1; # base2 is inside base1; discard the longer
227             }
228             }
229             }
230              
231 37         105 my @forcache;
232 37         96 foreach my $base (@bases) {
233 132 100       413 next if $subsumed{$base};
234 128         1326 push @good_bases, {
235             base => $base, orig => $rule, name => "$name,[l=$lossy]"
236             };
237             # *separate* copies for cache -- we modify the @good_bases entry
238 128         869 push @forcache, {
239             base => $base, orig => $rule, name => "$name,[l=$lossy]"
240             };
241             }
242              
243 37         1046 $cached->{rule_bases}->{$cachekey} = { g => \@forcache };
244 37         438 $yes++;
245             }
246             }
247              
248 15 50 33     331 $self->{show_progress} and $progress and $progress->final();
249              
250 15         862 dbg("zoom: $ruletype: found ".(scalar @good_bases).
251             " usable base strings in $yes rules, skipped $no rules");
252              
253             # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
254             # ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
255             # subsumes other patterns, we need to return hits for all of them. We also
256             # need to take care of the case where multiple regexps wind up sharing the
257             # same base.
258             #
259             # Another gotcha, an exception to the subsumption rule; if one pattern isn't
260             # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
261             # returned as two hits, correctly. So we only have to be smart about the
262             # full-subsumption case; overlapping is taken care of for us, by re2c.
263             #
264             # TODO: there's a bug here. Since the code in extract_hints() has been
265             # modified to support more complex regexps, we can no longer simply assume
266             # that if pattern A is not contained in pattern B, that means that pattern B
267             # doesn't subsume it. Consider, for example, A="foo*bar" and
268             # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
269             # that without running the A RE match itself somehow against B.
270             # same issue remains with:
271             #
272             # "foo?bar" / "fobar"
273             # "fo(?:o|oo|)bar" / "fobar"
274             # "fo(?:o|oo)?bar" / "fobar"
275             # "fo(?:o*|baz)bar" / "fobar"
276             # "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
277             #
278             # it's worse with this:
279             #
280             # "fo(?:o|oo|)bar" / "foo*bar"
281             #
282             # basically, this is impossible to compute without reimplementing most of
283             # re2c, and it appears the re2c developers don't plan to offer this:
284             # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
285              
286 15         305 $conf->{base_orig}->{$ruletype} = { };
287 15         174 $conf->{base_string}->{$ruletype} = { };
288              
289 15         53 $count = 0;
290 15 50 0     56 $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
291             total => (scalar @good_bases || 1),
292             itemtype => 'bases',
293             });
294              
295             # this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
296             # array -- into a more efficient format, using arrays and with a little
297             # bit of precomputation, to go (quite a bit) faster
298              
299 15         62 my @rewritten;
300 15         42 foreach my $set1 (@good_bases) {
301 128         376 my $base = $set1->{base};
302 128 50 33     611 next if (!$base || !$set1->{name});
303             push @rewritten, [
304             $base, # 0
305             $set1->{name}, # 1
306             $set1->{orig}, # 2
307 128         2275 length $base, # 3
308             qr/\Q$base\E/, # 4
309             0 # 5, has_multiple flag
310             ];
311             }
312 15         226 @good_bases = @rewritten;
313              
314 15         82 foreach my $set1 (@good_bases) {
315 128 50 33     501 $self->{show_progress} and $progress and $progress->update(++$count);
316              
317 128 100       271 my $base1 = $set1->[0]; next unless $base1;
  128         273  
318 104         226 my $name1 = $set1->[1];
319 104         210 my $orig1 = $set1->[2];
320 104         470 $conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
321 104         316 my $len1 = $set1->[3];
322              
323 104         233 foreach my $set2 (@good_bases) {
324 3247 100       6224 next if ($set1 == $set2);
325              
326 3143 100       4737 my $base2 = $set2->[0]; next unless $base2;
  3143         5372  
327 2660         3895 my $name2 = $set2->[1];
328              
329             # clobber exact dups; this can happen if a regexp outputs the
330             # same base string multiple times
331 2660 100 100     5090 if ($base1 eq $base2 &&
      66        
332             $name1 eq $name2 &&
333             $orig1 eq $set2->[2])
334             {
335 24         82 $set2->[0] = ''; # clobber
336 24         56 next;
337             }
338              
339             # skip if it's too short to contain the other base string
340 2636 100       4776 next if ($len1 < $set2->[3]);
341              
342             # skip if either already contains the other rule's name
343             # optimize: this can only happen if the base has more than
344             # one rule already attached, ie [5]
345 1510 100 100     3813 next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);
346              
347             # don't use $name1 here, since another base in the set2 loop
348             # may have added $name2 since we set that
349 1423 100 100     3781 next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);
350              
351             # and finally check to see if it *does* contain the other base string
352 1369 100       4749 next if ($base1 !~ $set2->[4]);
353              
354             # base2 is just a subset of base1
355             # dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");
356 27         137 $set1->[1] .= " ".$name2;
357 27         103 $set1->[5] = 1;
358             }
359             }
360              
361             # we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
362             # both contain "killed" for example, pointing at different rules, which
363             # the above search hasn't found. Collapse them here with a hash
364 15         57 my %bases;
365 15         67 foreach my $set (@good_bases) {
366 128         330 my $base = $set->[0];
367 128 100       330 next unless $base;
368              
369 104 100       270 if (defined $bases{$base}) {
370 12         71 $bases{$base} .= " ".$set->[1];
371             } else {
372 92         783 $bases{$base} = $set->[1];
373             }
374             }
375 15         81 undef @good_bases;
376              
377 15         100 foreach my $base (keys %bases) {
378             # uniq the list, since there are probably dup rules listed
379 92         200 my %u;
380 92         338 for my $i (split ' ', $bases{$base}) {
381 137 100       322 next if exists $u{$i}; undef $u{$i};
  120         315  
382             }
383 92         661 $conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;
384             }
385 15 50 33     144 $self->{show_progress} and $progress and $progress->final();
386              
387 15 50       90 if ($cachefile) {
388 0         0 $self->write_cachefile ($cachefile, $cached);
389             }
390              
391 15         55 my $elapsed = time - $start;
392             $self->{show_progress} and info ("$ruletype: ".
393 15 50       1399 (scalar keys %{$conf->{base_string}->{$ruletype}}).
  0         0  
394             " base strings extracted in $elapsed seconds\n");
395             }
396              
397             ###########################################################################
398              
399             # TODO:
400             # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
401             # => should extract 'scription' somehow
402             # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
403             # => should understand alternations; tricky
404              
405             sub simplify_and_qr_regexp {
406 42     42 0 137 my $self = shift;
407 42         311 my $rule = shift;
408              
409 42         106 my $main = $self->{main};
410 42         940 $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
411              
412             # remove the regexp modifiers, keep for later
413 42         160 my $mods = '';
414 42         453 while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }
  20         129  
415              
416             # modifier removal
417 42         135 while ($rule =~ s/^\(\?-([a-z]*)\)//) {
418 0         0 foreach my $modchar (split '', $mods) {
419 0         0 $mods =~ s/$modchar//g;
420             }
421             }
422              
423 42         96 my $lossy = 0;
424              
425             # now: simplify aspects of the regexp. Bear in mind that we can
426             # simplify as long as we cause the regexp to become more general;
427             # more hits is OK, since false positives will be discarded afterwards
428             # anyway. Simplification that causes the regexp to *not* hit
429             # stuff that the "real" rule would hit, however, is a bad thing.
430              
431 42 100       262 if ($main->{bases_must_be_casei}) {
432 37         148 $rule = lc $rule;
433              
434 37         66 $lossy = 1;
435 37 100       290 $mods =~ s/i// and $lossy = 0;
436              
437             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
438 37 100       384 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++;
439              
440             # always case-i: /A(?-i:ct)/ => /Act/
441 37 50       168 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++;
442              
443             # remove (?i)
444 37         86 $rule =~ s/\(\?i\)//gs;
445             }
446             else {
447 5 50       65 die "case-i" if $rule =~ /\(\?i\)/;
448 5 100       43 die "case-i" if $mods =~ /i/;
449              
450             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
451 4 50       13 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i";
452              
453             # we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
454 4         16 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
455             }
456              
457             # remove /m and /s modifiers
458 41 50       122 $mods =~ s/m// and $lossy++;
459 41 100       135 $mods =~ s/s// and $lossy++;
460              
461             # remove (^|\b)'s
462             # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
463 41 50       260 $rule =~ s/\(\^\|\\b\)//gs and $lossy++;
464 41 50       142 $rule =~ s/\(\$\|\\b\)//gs and $lossy++;
465 41 50       116 $rule =~ s/\(\\b\|\^\)//gs and $lossy++;
466 41 50       152 $rule =~ s/\(\\b\|\$\)//gs and $lossy++;
467              
468             # remove (?!credit)
469 41 100       114 $rule =~ s/\(\?\![^\)]+\)//gs and $lossy++;
470              
471             # remove \b's
472 41 100       187 $rule =~ s/(?<!\\)\\b//gs and $lossy++;
473              
474             # remove the "?=" trick
475             # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
476 41         101 $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
477              
478 41 100       144 $mods .= "L" if $lossy;
479 41         297 ($rule, $mods);
480             }
481              
482             sub extract_hints {
483 41     41 0 89 my $self = shift;
484 41         275 my $rawrule = shift;
485 41         150 my $rule = shift;
486 41         170 my $mods = shift;
487              
488 41         105 my $main = $self->{main};
489 41         185 my $orig = $rule;
490              
491 41         109 my $lossy = 0;
492 41 100       257 $mods =~ s/L// and $lossy++;
493              
494             # if there are anchors, give up; we can't get much
495             # faster than these anyway
496 41 50       416 die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;
497              
498             # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
499             # just remove end-of-string anchors; they're slow so could gain
500             # from our speedup
501 41 50       412 $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;
502              
503             # simplify (?:..) to (..)
504             $main->{bases_allow_noncapture_groups} or
505 41 50       280 $rule =~ s/\(\?:/\(/g;
506              
507             # simplify some grouping arrangements so they're easier for us to parse
508             # (foo)? => (foo|)
509 41         228 $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
510             # r? => (r|)
511 41         185 $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
512              
513 41         508 my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
514 41 50       142 $tmpfh or die "failed to create a temporary file";
515 41         173 untaint_var(\$tmpf);
516              
517 41 50       586 print $tmpfh "use bytes; m{" . $rule . "}" . $mods
518             or die "error writing to $tmpf: $!";
519 41 50       1631 close $tmpfh or die "error closing $tmpf: $!";
520              
521 41         450 my $perl = $self->get_perl();
522 41         196 local *IN;
523 41 50       155031 open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")
524             or die "cannot run $perl: ".exit_status_str($?,$!);
525              
526 41         704 my($inbuf,$nread,$fullstr); $fullstr = '';
  41         395  
527 41         448656 while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
  41         1146  
528 41 50       221 defined $nread or die "error reading from pipe: $!";
529              
530 41 50       3679 unlink $tmpf or die "cannot unlink $tmpf: $!";
531 41 50       1966 close IN or die "error closing pipe: $!";
532 41 50       201 defined $fullstr or warn "empty result from a pipe";
533              
534             # now parse the -Mre=debug output.
535             # perl 5.10 format
536 41         2130 $fullstr =~ s/^.*\nFinal program:\n//gs;
537             # perl 5.6/5.8 format
538 41         301 $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
539 41         215 $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
540             # common to all
541 41         130 $fullstr =~ s/\nOffsets:.*$//gs;
542              
543             # clean up every other line that doesn't start with a space
544 41         1057 $fullstr =~ s/^\S.*$//gm;
545              
546 41 50       936 if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
547 0         0 die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
548             }
549 41         1121 my $opsstr = $1;
550              
551             # what's left looks like this:
552             # 1: EXACTF <v>(3)
553             # 3: ANYOF[1ILil](14)
554             # 14: EXACTF <a>(16)
555             # 16: CURLY {2,7}(29)
556             # 18: ANYOF[A-Za-z](0)
557             # 29: SPACE(30)
558             # 30: EXACTF <http://>(33)
559             # 33: END(0)
560             #
561 41         133 DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
562              
563 41         302 my @ops;
564 41         1470 foreach my $op (split(/\n/s, $opsstr)) {
565 439 50       1306 next unless $op;
566              
567 439 100       3767 if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
    100          
    50          
    0          
568             # perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
569             # perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
570 321         4271 push @ops, [ $1, $2, $3 ];
571             }
572             elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
573             # 5: TRIE-EXACT[im](44)
574             # <message contained attachments that have been blocked by guin>...
575 4         35 my $spcs = $1;
576             # we could use the entire length here, but it's easier to trim to
577             # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
578             # suite results will match, since the sa-update --list extraction will
579             # be the same for all versions. (The "..." trailer is important btw)
580 4         38 my $str = substr ($2, 0, 55);
581 4         67 push @ops, [ $spcs, '_moretrie', "<$str...>" ];
582             }
583             elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
584             # 5: TRIE-EXACT[am](21)
585             # <am> (21)
586             # <might> (12)
587 114         859 push @ops, [ $1, '_moretrie', $2 ];
588             }
589             elsif ($op =~ /^ at .+ line \d+$/) {
590 0         0 next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
591             }
592             else {
593 0         0 warn "cannot parse '$op': $opsstr";
594 0         0 next;
595             }
596             }
597              
598             # unroll the branches; returns a list of versions.
599             # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
600 41         196 my @unrolled;
601 41 100       269 if ($main->{bases_split_out_alternations}) {
602 35         668 @unrolled = $self->unroll_branches(0, \@ops);
603             } else {
604 6         58 @unrolled = ( \@ops );
605             }
606              
607             # now find the longest DFA-friendly string in each unrolled version
608 41         178 my @longests;
609 41         136 foreach my $opsarray (@unrolled) {
610 136         353 my $longestexact = '';
611 136         328 my $buf = '';
612              
613             # use a closure to keep the code succinct
614             my $add_candidate = sub {
615 430 100   430   1017 if (length $buf > length $longestexact) { $longestexact = $buf; }
  159         335  
616 430         746 $buf = '';
617 136         1408 };
618              
619 136         274 my $prevop;
620 136         234 foreach my $op (@{$opsarray}) {
  136         357  
621 635         1152 my ($spcs, $item, $args) = @{$op};
  635         1962  
622              
623 635 100       1834 next if ($item eq 'NOTHING');
624              
625             # EXACT == case-sensitive
626             # EXACTF == case-i
627             # we can do both, since we canonicalize to lc.
628 626 100 100     7674 if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
    100 66        
    50 100        
    100 66        
      66        
      100        
      66        
      33        
      33        
      0        
      66        
      100        
      66        
      66        
      33        
      33        
629             {
630 142         653 my $str = $1;
631 142         335 $buf .= $str;
632 142 100       509 if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
633             # a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
634 1         97 $add_candidate->();
635             }
636 142 50 33     514 if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
637             # perl 5.8.x truncates with a "..." here! cut and stop
638 0         0 $add_candidate->();
639             }
640             }
641             # _moretrie == a TRIE-EXACT entry
642             elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
643             {
644 190         484 $buf .= $1;
645 190 100 66     531 if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
646             # perl 5.8.x truncates with a "..." here! cut and stop
647 4         28 $add_candidate->();
648             }
649             }
650             # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
651             elsif ($item =~ /^EXACT/ &&
652             $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
653             $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
654             $args =~ /<(.*)>/)
655             {
656 0         0 $buf .= $1;
657 0 0 0     0 if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
658             # perl 5.8.x truncates with a "..." here! cut and stop
659 0         0 $add_candidate->();
660             }
661             }
662             # CURLYX, for perl >= 5.9.5
663             elsif ($item =~ /^_moretrie/ &&
664             $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
665             $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
666             $args =~ /<(.*)>/)
667             {
668 5         26 $buf .= $1;
669 5 50 33     34 if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
670             # perl 5.8.x truncates with a "..." here! cut and stop
671 0         0 $add_candidate->();
672             }
673             }
674             else {
675             # not an /^EXACT/; clear the buffer
676 289         821 $add_candidate->();
677 289 100       1458 if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
678             {
679 173         312 $lossy = 1;
680 173         288 DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
681             }
682             }
683 626         1318 $prevop = $op;
684             }
685 136         479 $add_candidate->();
686              
687 136 100       299 if (!$longestexact) {
688 1         102 die "no long-enough string found in $rawrule";
689             # all unrolled versions must have a long string, otherwise
690             # we cannot reliably match all variants of the rule
691             } else {
692 135 100       1265 push @longests, ($main->{bases_must_be_casei}) ?
693             lc $longestexact : $longestexact;
694             }
695             }
696              
697 40         108 DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
698 40         2537 return ($lossy, @longests);
699             }
700              
701             ###########################################################################
702              
703             sub unroll_branches {
704 187     187 0 605 my ($self, $depth, $opslist) = @_;
705              
706 187 50       530 die "too deep" if ($depth++ > 5);
707              
708 187         302 my @ops = (@{$opslist}); # copy
  187         558  
709 187         744 my @pre_branch_ops;
710             my $branch_spcs;
711 187         0 my $trie_spcs;
712 187         0 my $open_spcs;
713              
714             # our input looks something like this 2-level structure:
715             # 1: BOUND(2)
716             # 2: EXACT <Dear >(5)
717             # 5: BRANCH(9)
718             # 6: EXACT <IT>(8)
719             # 8: NALNUM(24)
720             # 9: BRANCH(23)
721             # 10: EXACT <Int>(12)
722             # 12: BRANCH(14)
723             # 13: NOTHING(21)
724             # 14: BRANCH(17)
725             # 15: EXACT <a>(21)
726             # 17: BRANCH(20)
727             # 18: EXACT <er>(21)
728             # 20: TAIL(21)
729             # 21: EXACT <net>(24)
730             # 23: TAIL(24)
731             # 24: EXACT < shop>(27)
732             # 27: END(0)
733             #
734             # or:
735             #
736             # 1: OPEN1(3)
737             # 3: BRANCH(6)
738             # 4: EXACT <v>(9)
739             # 6: BRANCH(9)
740             # 7: EXACT <\\/>(9)
741             # 9: CLOSE1(11)
742             # 11: CURLY {2,5}(14)
743             # 13: REG_ANY(0)
744             # 14: EXACT < g r a >(17)
745             # 17: ANYOF[a-z](28)
746             # 28: END(0)
747             #
748             # or:
749             #
750             # 1: EXACT <i >(3)
751             # 3: OPEN1(5)
752             # 5: TRIE-EXACT[am](21)
753             # <am> (21)
754             # <might> (12)
755             # 12: OPEN2(14)
756             # 14: TRIE-EXACT[ ](19)
757             # < be>
758             # <>
759             # 19: CLOSE2(21)
760             # 21: CLOSE1(23)
761             # 23: EXACT < c>(25)
762              
763 187         323 DEBUG_RE_PARSING and warn "starting parse";
764              
765             # this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
766             # it into the latter. bit of a kludge to do this before the loop, but hey.
767             # note that it doesn't fix the CLOSE1/END ordering to be correct
768 187 50 66     1317 if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
769 0         0 my @newops = ([ "", "OPEN1", "" ]);
770 0         0 foreach my $op (@ops) {
771 0         0 push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
772             }
773 0         0 push @newops, [ "", "CLOSE1", "" ];
774 0         0 @ops = @newops;
775             }
776              
777             # iterate until we start a branch set. using
778             # /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
779             # just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
780             # or a BRANCH, *then* unroll the most recent OPEN set.
781 187         390 while (1) {
782 968         1652 my $op = shift @ops;
783 968 100       2061 last unless defined $op;
784              
785 838         1227 my ($spcs, $item, $args) = @{$op};
  838         2801  
786 838         1360 DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
787              
788 838 100 100     4602 if ($item =~ /^OPEN/) {
    100          
    100          
    100          
    100          
789 54         158 $open_spcs = $spcs;
790 54         127 next; # next will be a BRANCH or TRIE
791              
792             } elsif ($item =~ /^TRIE/) {
793 49         121 $trie_spcs = $spcs;
794 49         120 last;
795              
796             } elsif ($item =~ /^BRANCH/) {
797 8         90 $branch_spcs = $spcs;
798 8         53 last;
799              
800             } elsif ($item =~ /^EXACT/ && defined $open_spcs) {
801             # perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
802 7         75 push @pre_branch_ops, [ $open_spcs, $item, $args ];
803 7         34 next;
804              
805             } elsif (defined $open_spcs) {
806             # OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
807             # ignore this OPEN block entirely and don't try to unroll it
808 2         23 undef $open_spcs;
809              
810             } else {
811 718         1515 push @pre_branch_ops, $op;
812             }
813             }
814              
815             # no branches found? we're done unrolling on this one!
816 187 100       537 if (scalar @ops == 0) {
817 130         733 return [ @pre_branch_ops ];
818             }
819              
820             # otherwise we're at the start of a new branch set
821             # /(foo|bar(baz|argh)boo)gab/
822 57         151 my @alts;
823             my @in_this_branch;
824              
825 57         141 DEBUG_RE_PARSING and warn "entering branch: ".
826             "open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
827             "branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
828             "trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
829              
830             # indentation level to remove from "normal" ops (using a s///)
831 57 100       319 my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
832 57         181 my $trie_sub_spcs = "";
833 57         129 while (1) {
834 387         683 my $op = shift @ops;
835 387 50       846 last unless defined $op;
836 387         641 my ($spcs, $item, $args) = @{$op};
  387         1155  
837 387         648 DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
838              
839 387 100 100     4072 if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
    50 66        
    100 66        
    100 33        
    100 100        
      100        
      100        
      66        
840 9         59 push @alts, [ @pre_branch_ops, @in_this_branch ];
841 9         38 @in_this_branch = ();
842 9         65 $open_sub_spcs = $branch_spcs." ";
843 9         38 $trie_sub_spcs = "";
844 9         35 next;
845             }
846             elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
847 0         0 push @alts, [ @pre_branch_ops, @in_this_branch ];
848 0         0 undef $branch_spcs;
849 0         0 $open_sub_spcs = "";
850 0         0 $trie_sub_spcs = "";
851 0         0 last;
852             }
853             elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
854 135 100       387 if (scalar @in_this_branch > 0) {
855 86         355 push @alts, [ @pre_branch_ops, @in_this_branch ];
856             }
857             # use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
858 135         679 @in_this_branch = ( [ $open_spcs, $item, $args ] );
859 135 50       331 $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
860 135         311 $trie_sub_spcs = " ";
861 135         261 next;
862             }
863             elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
864 52         215 push @alts, [ @pre_branch_ops, @in_this_branch ];
865 52         165 undef $branch_spcs;
866 52         121 undef $open_spcs;
867 52         97 undef $trie_spcs;
868 52         90 $open_sub_spcs = "";
869 52         111 $trie_sub_spcs = "";
870 52         116 last;
871             }
872             elsif ($item eq 'END') { # of string
873 5         29 push @alts, [ @pre_branch_ops, @in_this_branch ];
874 5         24 undef $branch_spcs;
875 5         16 undef $open_spcs;
876 5         16 undef $trie_spcs;
877 5         19 $open_sub_spcs = "";
878 5         29 $trie_sub_spcs = "";
879 5         22 last;
880             }
881             else {
882 186 50       467 if ($open_sub_spcs) {
883             # deindent the space-level to match the opening brace
884 186         1151 $spcs =~ s/^$open_sub_spcs//;
885             # tries also add one more indent level in
886 186         906 $spcs =~ s/^$trie_sub_spcs//;
887             }
888 186         796 push @in_this_branch, [ $spcs, $item, $args ];
889             # note that we ignore ops at a deeper $spcs level entirely (until later!)
890             }
891             }
892              
893 57 50       203 if (defined $branch_spcs) {
894 0         0 die "fell off end of string with a branch open: '$branch_spcs'";
895             }
896              
897             # we're now after the branch set: /gab/
898             # @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
899 57         257 foreach my $alt (@alts) {
900 152         279 push @{$alt}, @ops; # add all remaining ops to each one
  152         435  
901             # note that this could include more (?:...); we don't care, since
902             # those can be handled by recursing
903             }
904              
905             # ok, parsed the entire ops list
906             # @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]
907              
908 57         114 if (DEBUG_RE_PARSING) {
909             print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
910             }
911              
912             # now recurse, to unroll the remaining branches (if any exist)
913 57         134 my @rets;
914 57         155 foreach my $alt (@alts) {
915 152         745 push @rets, $self->unroll_branches($depth, $alt);
916             }
917              
918 57         125 if (DEBUG_RE_PARSING) {
919             print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
920             }
921              
922 57         413 return @rets;
923             }
924              
925             ###########################################################################
926              
927             sub test {
928 0     0 0 0 my ($self) = @_;
929              
930 0         0 $self->test_split_alt("foo", "/foo/");
931 0         0 $self->test_split_alt("(foo)", "/foo/");
932 0         0 $self->test_split_alt("foo(bar)baz", "/foobarbaz/");
933 0         0 $self->test_split_alt("x(foo|)", "/xfoo/ /x/");
934 0         0 $self->test_split_alt("fo(o|)", "/foo/ /fo/");
935 0         0 $self->test_split_alt("(foo|bar)", "/foo/ /bar/");
936 0         0 $self->test_split_alt("foo|bar", "/foo/ /bar/");
937 0         0 $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
938 0         0 $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
939 0         0 $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
940 0         0 $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
941             }
942              
943             sub test_split_alt {
944 0     0 0 0 my ($self, $in, $out) = @_;
945              
946 0         0 my @got = $self->split_alt($in);
947 0         0 $out =~ s/^\///;
948 0         0 $out =~ s/\/$//;
949 0         0 my @want = split(/\/ \//, $out);
950              
951 0         0 my $failed = 0;
952 0 0       0 if (scalar @want != scalar @got) {
953 0         0 warn "FAIL: results count don't match";
954 0         0 $failed++;
955             }
956             else {
957 0         0 my %got = map { $_ => 1 } @got;
  0         0  
958 0         0 foreach my $w (@want) {
959 0 0       0 if (!$got{$w}) {
960 0         0 warn "FAIL: '$w' not found";
961 0         0 $failed++;
962             }
963             }
964             }
965              
966 0 0       0 if ($failed) {
967 0 0       0 print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
968 0 0       0 print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
969 0         0 return 0;
970             } else {
971 0 0       0 print "ok\n" or die "error writing: $!";
972 0         0 return 1;
973             }
974             }
975              
976             ###########################################################################
977              
978             sub get_perl {
979 41     41 0 132 my ($self) = @_;
980 41         85 my $perl;
981              
982             # allow user override of the perl interpreter to use when
983             # extracting base strings.
984             # TODO: expose this via sa-compile command-line option
985 41         173 my $fromconf = $self->{main}->{conf}->{re_parser_perl};
986              
987 41 50       493 if ($fromconf) {
    50          
988 0         0 $perl = $fromconf;
989             } elsif ($^X =~ m|^/|) {
990 41         116 $perl = $^X;
991             } else {
992 1     1   12 use Config;
  1         9  
  1         1126  
993 0         0 $perl = $Config{perlpath};
994 0         0 $perl =~ s|/[^/]*$|/$^X|;
995             }
996 41         266 untaint_var(\$perl);
997 41         116 return $perl;
998             }
999              
1000             ###########################################################################
1001              
1002             sub read_cachefile {
1003 0     0 0 0 my ($self, $cachefile) = @_;
1004 0         0 local *IN;
1005 0 0       0 if (open(IN, "<".$cachefile)) {
1006 0         0 my($inbuf,$nread,$str); $str = '';
  0         0  
1007 0         0 while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
  0         0  
1008 0 0       0 defined $nread or die "error reading from $cachefile: $!";
1009 0 0       0 close IN or die "error closing $cachefile: $!";
1010              
1011 0         0 untaint_var(\$str);
1012 0         0 my $VAR1; # Data::Dumper
1013 0 0       0 if (eval $str) {
1014 0         0 return $VAR1; # Data::Dumper's naming
1015             }
1016             }
1017 0         0 return { };
1018             }
1019              
1020             sub write_cachefile {
1021 0     0 0 0 my ($self, $cachefile, $cached) = @_;
1022              
1023 0         0 my $dump = Data::Dumper->new ([ $cached ]);
1024 0         0 $dump->Deepcopy(1);
1025 0         0 $dump->Purity(1);
1026 0         0 $dump->Indent(1);
1027 0 0       0 if (mkdir($self->{main}->{bases_cache_dir})) {
    0          
1028             # successfully created
1029             } elsif ($! == EEXIST) {
1030 0         0 dbg("zoom: ok, cache directory already existed");
1031             } else {
1032 0         0 warn "cannot create a directory: $!";
1033             }
1034 0 0       0 open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1035 0 0       0 print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1036 0 0       0 close CACHE or die "error closing $cachefile: $!";
1037             }
1038              
1039             =over 4
1040              
1041             =item my ($cleanregexp) = fixup_re($regexp);
1042              
1043             Converts encoded characters in a regular expression pattern into their
1044             equivalent characters
1045              
1046             =back
1047              
1048             =cut
1049              
1050             sub fixup_re {
1051 135     135 1 372 my $re = shift;
1052            
1053 135 0       298 if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
  0 50       0  
1054            
1055 135         562 my $output = "";
1056 135         1373 my $TOK = qr([\"\\]);
1057              
1058 135         396 my $STATE;
1059 135         1029 local ($1,$2);
1060 135         1398 while ($re =~ /\G(.*?)($TOK)/gcs) {
1061 0         0 my $pre = $1;
1062 0         0 my $tok = $2;
1063              
1064 0 0       0 if (length($pre)) {
1065 0         0 $output .= "\"$pre\"";
1066             }
1067              
1068 0 0       0 if ($tok eq '"') {
    0          
1069 0         0 $output .= '"\\""';
1070             }
1071             elsif ($tok eq '\\') {
1072 0 0       0 $re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!";
1073 0         0 my $esc = $1;
1074 0 0       0 if ($esc eq '"') {
    0          
    0          
    0          
1075 0         0 $output .= '"\\""';
1076             } elsif ($esc eq '\\') {
1077 0         0 $output .= '"**BACKSLASH**"'; # avoid hairy escape-parsing
1078             } elsif ($esc =~ /^x\{(\S+)\}\z/) {
1079 0         0 $output .= '"'.chr(hex($1)).'"';
1080             } elsif ($esc =~ /^[0-7]{1,3}\z/) {
1081 0         0 $output .= '"'.chr(oct($esc)).'"';
1082             } else {
1083 0         0 $output .= "\"$esc\"";
1084             }
1085             }
1086             else {
1087 0 0       0 print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1088             }
1089             }
1090            
1091 135 50       398 if (!defined(pos($re))) {
    0          
1092             # no matches
1093 135         673 $output .= "\"$re\"";
1094             # Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1095 135         529 $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
  0         0  
1096             }
1097             elsif (pos($re) <= length($re)) {
1098 0         0 $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
  0         0  
1099 0         0 $output .= fixup_re(substr($re, pos($re)));
1100             }
1101              
1102 135         298 $output =~ s/^""/"/; # protect start and end quotes
1103 135         348 $output =~ s/(?<!\\)""\z/"/;
1104 135         266 $output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1105 135         243 $output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;
1106              
1107 135 0       314 if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
  0 50       0  
1108 135         604 return $output;
1109             }
1110              
1111             1;