File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
Criterion Covered Total %
statement 400 523 76.4
branch 152 272 55.8
condition 92 151 60.9
subroutine 29 33 87.8
pod 3 14 21.4
total 676 993 68.0


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Copyright 2006 Apache Software Foundation
3             #
4             # Licensed under the Apache License, Version 2.0 (the "License");
5             # you may not use this file except in compliance with the License.
6             # You may obtain a copy of the License at
7             #
8             # http://www.apache.org/licenses/LICENSE-2.0
9             #
10             # Unless required by applicable law or agreed to in writing, software
11             # distributed under the License is distributed on an "AS IS" BASIS,
12             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13             # See the License for the specific language governing permissions and
14             # limitations under the License.
15             # </@LICENSE>
16              
17             =head1 NAME
18              
19             Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20              
21             =head1 SYNOPSIS
22              
23             This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24             suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25              
26             =cut
27              
28              
29             use Mail::SpamAssassin::Plugin;
30 1     1   8 use Mail::SpamAssassin::Logger;
  1         2  
  1         27  
31 1     1   4 use Mail::SpamAssassin::Util qw(untaint_var qr_to_string);
  1         2  
  1         60  
32 1     1   7 use Mail::SpamAssassin::Util::Progress;
  1         2  
  1         52  
33 1     1   306  
  1         3  
  1         39  
34             use Errno qw(ENOENT EACCES EEXIST);
35 1     1   6 use Data::Dumper;
  1         2  
  1         97  
36 1     1   598  
  1         5591  
  1         53  
37             use strict;
38 1     1   7 use warnings;
  1         2  
  1         18  
39 1     1   12 # use bytes;
  1         3  
  1         27  
40             use re 'taint';
41 1     1   5  
  1         2  
  1         26  
42             # Not a constant hashref for 5.6 compat
43             use constant SLOT_BASE => 0;
44 1     1   4 use constant SLOT_NAME => 1;
  1         2  
  1         46  
45 1     1   5 use constant SLOT_ORIG => 2;
  1         2  
  1         36  
46 1     1   6 use constant SLOT_LEN_BASE => 3;
  1         2  
  1         46  
47 1     1   6 use constant SLOT_BASE_INITIAL => 4;
  1         2  
  1         37  
48 1     1   5 use constant SLOT_HAS_MULTIPLE => 5;
  1         2  
  1         49  
49 1     1   6  
  1         2  
  1         51  
50             use constant CLOBBER => '';
51 1     1   7  
  1         2  
  1         65  
52             our @ISA = qw(Mail::SpamAssassin::Plugin);
53              
54             use constant DEBUG_RE_PARSING => 0; # noisy!
55 1     1   7  
  1         2  
  1         4608  
56             # a few settings that control what kind of bases are output.
57              
58             # treat all rules as lowercase for purposes of term extraction?
59             # $main->{bases_must_be_casei} = 1;
60             # $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
61             # $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
62             # $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
63             # $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
64             # $main->{base_quiet} = 0; # silences progress output
65              
66             # TODO: it would be nice to have a clean API to pass such settings
67             # through to plugins instead of hanging them off $main
68              
69             ##############################################################################
70              
71             # testing purposes only
72             my $fixup_re_test;
73             #$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;
74             #$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
75             #$fixup_re_test = 1; fixup_re("\\33\$b"); die;
76             #$fixup_re_test = 1; fixup_re("[link]"); die;
77             #$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
78              
79             ###########################################################################
80              
81             my $class = shift;
82             my $mailsaobject = shift;
83 15     15 1 122 $class = ref($class) || $class;
84 15         111 my $self = $class->SUPER::new($mailsaobject);
85 15   33     244 bless ($self, $class);
86 15         112  
87 15         40 $self->{show_progress} = !$mailsaobject->{base_quiet};
88              
89 15         79 # $self->test(); exit;
90             return $self;
91             }
92 15         136  
93             ###########################################################################
94              
95             my ($self, $params) = @_;
96             my $conf = $params->{conf};
97             $self->extract_bases($conf);
98 15     15 1 46 }
99 15         43  
100 15         63 my ($self, $conf) = @_;
101              
102             my $main = $conf->{main};
103             if (!$main->{base_extract}) { return; }
104 15     15 0 47  
105             $self->{show_progress} and
106 15         40 info("base extraction starting. this can take a while...");
107 15 50       56  
  0         0  
108             $self->extract_set($conf, $conf->{body_tests}, 'body');
109             }
110 15 50       53  
111             my ($self, $conf, $test_set, $ruletype) = @_;
112 15         102  
113             foreach my $pri (keys %{$test_set}) {
114             my $nicepri = $pri; $nicepri =~ s/-/neg/g;
115             $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
116 15     15 0 99 }
117             }
118 15         32  
  15         119  
119 15         40 ###########################################################################
  15         137  
120 15         113  
121             my ($self, $conf, $rules, $ruletype) = @_;
122              
123             my @good_bases;
124             my @failed;
125             my $yes = 0;
126             my $no = 0;
127 15     15 0 53 my $count = 0;
128             my $start = time;
129 15         33 $self->{main} = $conf->{main}; # for use in extract_hints()
130             $self->{show_progress} and info ("extracting from rules of type $ruletype");
131 15         32 my $tflags = $conf->{tflags};
132 15         75  
133 15         25 # attempt to find good "base strings" (simplified regexp subsets) for each
134 15         28 # regexp. We try looking at the regexp from both ends, since there
135 15         42 # may be a good long string of text at the end of the rule.
136 15 50       40  
137 15         54 # require this many chars in a base string + delimiters for it to be viable
138             my $min_chars = 5;
139              
140             my $progress;
141             $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
142             total => (scalar keys %{$rules} || 1),
143             itemtype => 'rules',
144 15         32 });
145              
146 15         40 my $cached = { };
147             my $cachefile;
148 15 50 0     52  
149             if ($self->{main}->{bases_cache_dir}) {
150             $cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
151             dbg("zoom: reading cache file $cachefile");
152 15         35 $cached = $self->read_cachefile($cachefile);
153 15         32 }
154              
155 15 50       55 NEXT_RULE:
156 0         0 foreach my $name (keys %{$rules}) {
157 0         0 $self->{show_progress} and $progress and $progress->update(++$count);
158 0         0  
159             #my $rule = $rules->{$name};
160             my $rule = qr_to_string($conf->{test_qrs}->{$name});
161             if (!defined $rule) {
162 15         31 die "zoom: error: regexp for $rule not found\n";
  15         66  
163 42 50 33     325 }
164             my $cachekey = $name.'#'.$rule;
165              
166 42         965 my $cent = $cached->{rule_bases}->{$cachekey};
167 42 50       148 if (defined $cent) {
168 0         0 if (defined $cent->{g}) {
169             dbg("zoom: YES (cached) $rule $name");
170 42         511 foreach my $ent (@{$cent->{g}}) {
171             # note: we have to copy these, since otherwise later
172 42         175 # modifications corrupt the cached data
173 42 50       128 push @good_bases, {
174 0 0       0 base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
175 0         0 };
176 0         0 }
  0         0  
177             $yes++;
178             }
179             else {
180             dbg("zoom: NO (cached) $rule $name");
181 0         0 push @failed, { orig => $rule }; # no need to cache this
182             $no++;
183 0         0 }
184             next NEXT_RULE;
185             }
186 0         0  
187 0         0 # ignore ReplaceTags rules
188 0         0 my $is_a_replacetags_rule = $conf->{replace_rules}->{$name};
189             my ($minlen, $lossy, @bases);
190 0         0  
191             if (!$is_a_replacetags_rule) {
192             eval { # catch die()s
193             my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
194 42         116 ($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
195 42         99 # dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
196             1;
197 42 50       150 } or do {
198             my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
199 42         385 dbg("zoom: giving up on regexp: $eval_stat");
200 41         205 };
201              
202 40         578 #if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {
203 42 100       81 # warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ".
204 2 50       36 # "running under Perl 5.12 or older, Bug 6558\n";
  2         17  
205 2         57 #}
206              
207             # if any of the extracted hints in a set are too short, the entire
208             # set is invalid; this is because each set of N hints represents just
209             # 1 regexp.
210             foreach my $str (@bases) {
211             my $len = length fixup_re($str); # bug 6143: count decoded characters
212             if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
213             elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
214             }
215             }
216 42         413  
217 135         941 if ($is_a_replacetags_rule || !$minlen || !@bases) {
218 135 100 100     1316 dbg("zoom: ignoring rule %s, %s", $name,
  3 100       56  
  3         27  
  3         24  
219 62         267 $is_a_replacetags_rule ? 'is a replace rule'
220             : !@bases ? 'no bases' : 'no minlen');
221             push @failed, { orig => $rule };
222             $cached->{rule_bases}->{$cachekey} = { };
223 42 100 66     1012 $no++;
      66        
224 5 50       132 }
    50          
225             else {
226             # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
227 5         81  
228 5         70 # figure out if we have e.g. ["foo", "foob", "foobar"]; in this
229 5         38 # case, we only need to track ["foo"].
230             my %subsumed;
231             foreach my $base1 (@bases) {
232             foreach my $base2 (@bases) {
233             if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
234             $subsumed{$base1} = 1; # base2 is inside base1; discard the longer
235             }
236 37         113 }
237 37         106 }
238 132         431  
239 2524 100 100     22754 my @forcache;
240 4         61 foreach my $base (@bases) {
241             next if $subsumed{$base};
242             push @good_bases, {
243             base => $base, orig => $rule, name => "$name,[l=$lossy]"
244             };
245 37         305 # *separate* copies for cache -- we modify the @good_bases entry
246 37         138 push @forcache, {
247 132 100       636 base => $base, orig => $rule, name => "$name,[l=$lossy]"
248 128         1736 };
249             }
250              
251             $cached->{rule_bases}->{$cachekey} = { g => \@forcache };
252 128         1049 $yes++;
253             }
254             }
255              
256             $self->{show_progress} and $progress and $progress->final();
257 37         645  
258 37         399 dbg("zoom: $ruletype: found ".(scalar @good_bases).
259             " usable base strings in $yes rules, skipped $no rules");
260              
261             # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
262 15 50 33     279 # ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
263             # subsumes other patterns, we need to return hits for all of them. We also
264 15         638 # need to take care of the case where multiple regexps wind up sharing the
265             # same base.
266             #
267             # Another gotcha, an exception to the subsumption rule; if one pattern isn't
268             # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
269             # returned as two hits, correctly. So we only have to be smart about the
270             # full-subsumption case; overlapping is taken care of for us, by re2c.
271             #
272             # TODO: there's a bug here. Since the code in extract_hints() has been
273             # modified to support more complex regexps, we can no longer simply assume
274             # that if pattern A is not contained in pattern B, that means that pattern B
275             # doesn't subsume it. Consider, for example, A="foo*bar" and
276             # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
277             # that without running the A RE match itself somehow against B.
278             # same issue remains with:
279             #
280             # "foo?bar" / "fobar"
281             # "fo(?:o|oo|)bar" / "fobar"
282             # "fo(?:o|oo)?bar" / "fobar"
283             # "fo(?:o*|baz)bar" / "fobar"
284             # "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
285             #
286             # it's worse with this:
287             #
288             # "fo(?:o|oo|)bar" / "foo*bar"
289             #
290             # basically, this is impossible to compute without reimplementing most of
291             # re2c, and it appears the re2c developers don't plan to offer this:
292             # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
293              
294             $conf->{base_orig}->{$ruletype} = { };
295             $conf->{base_string}->{$ruletype} = { };
296              
297             $count = 0;
298             $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
299             total => (scalar @good_bases || 1),
300 15         232 itemtype => 'bases',
301 15         144 });
302              
303 15         43 # this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
304 15 50 0     60 # array -- into a more efficient format, using arrays and with a little
305             # bit of precomputation, to go (quite a bit) faster
306             my @rewritten;
307             foreach my $set1 (@good_bases) {
308             my $base = $set1->{base};
309             next if (!$base || !$set1->{name});
310             push @rewritten, [
311             $base, # 0 - SLOT_BASE
312 15         62 $set1->{name}, # 1 - SLOT_NAME
313 15         61 $set1->{orig}, # 2 - SLOT_ORIG
314 128         462 length $base, # 3 - SLOT_LEN_BASE
315 128 50 33     702 $base, # 4 - SLOT_BASE_INITIAL
316             0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
317             ];
318             }
319              
320 128         1017 @good_bases = sort {
321             $b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] ||
322             $a->[SLOT_BASE] cmp $b->[SLOT_BASE] ||
323             $a->[SLOT_NAME] cmp $b->[SLOT_NAME] ||
324             $a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
325             } @rewritten;
326              
327 15 50 100     248  
  418   100     2105  
328             my $base_orig = $conf->{base_orig}->{$ruletype};
329             my $next_base_position = 0;
330             for my $set1 (@good_bases) {
331             $next_base_position++;
332             $self->{show_progress} and $progress and $progress->update(++$count);
333             my $base1 = $set1->[SLOT_BASE] or next; # got clobbered
334 15         77 my $name1 = $set1->[SLOT_NAME];
335 15         48 my $orig1 = $set1->[SLOT_ORIG];
336 15         70 my $len1 = $set1->[SLOT_LEN_BASE];
337 128         297 $base_orig->{$name1} = $orig1;
338 128 50 33     342  
339 128 100       481 foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest
340 104         270 # clobber false and exact dups; this can happen if a regexp outputs the
341 104         142 # same base string multiple times
342 104         209 if (!$set2->[SLOT_BASE] ||
343 104         353 (
344             $base1 eq $set2->[SLOT_BASE] &&
345 104         440 $name1 eq $set2->[SLOT_NAME] &&
346             $orig1 eq $set2->[SLOT_ORIG]
347             )
348 1685 100 100     6466 )
      66        
      66        
349             {
350             #dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
351             $set2->[SLOT_BASE] = CLOBBER; # clobber
352             next;
353             }
354              
355             # Cannot be a subset if it does not contain the other base string
356             next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
357 24         75  
358 24         130 # skip if either already contains the other rule's name
359             # optimize: this can only happen if the base has more than
360             # one rule already attached, ie [5]
361             next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/);
362 1661 100       6358  
363             # don't use $name1 here, since another base in the set2 loop
364             # may have added $name2 since we set that
365             next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/);
366              
367 33 0 33     107 # $set2->[SLOT_BASE] is just a subset of base1
      33        
368             #dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
369             $set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
370             $set1->[SLOT_HAS_MULTIPLE] = 1;
371 33 50 66     104 }
      33        
372             }
373              
374             # we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
375 33         141 # both contain "killed" for example, pointing at different rules, which
376 33         102 # the above search hasn't found. Collapse them here with a hash
377             my %bases;
378             foreach my $set (@good_bases) {
379             my $base = $set->[0];
380             next unless $base;
381              
382             if (defined $bases{$base}) {
383 15         63 $bases{$base} .= " ".$set->[1];
384 15         82 } else {
385 128         399 $bases{$base} = $set->[1];
386 128 100       355 }
387             }
388 104 100       251 undef @good_bases;
389 12         66  
390             my $base_string = $conf->{base_string}->{$ruletype};
391 92         709 foreach my $base (keys %bases) {
392             # uniq the list, since there are probably dup rules listed
393             my %u;
394 15         94 for my $i (split ' ', $bases{$base}) {
395             next if exists $u{$i}; undef $u{$i};
396 15         56 }
397 15         139 $base_string->{$base} = join ' ', sort keys %u;
398             }
399 92         202  
400 92         287 $self->{show_progress} and $progress and $progress->final();
401 137 100       387  
  120         307  
402             if ($cachefile) {
403 92         478 $self->write_cachefile ($cachefile, $cached);
404             }
405              
406 15 50 33     96 my $elapsed = time - $start;
407             $self->{show_progress} and info ("$ruletype: ".
408 15 50       48 (scalar keys %{$conf->{base_string}->{$ruletype}}).
409 0         0 " base strings extracted in $elapsed seconds\n");
410             }
411              
412 15         51 ###########################################################################
413              
414 15 50       1310 # TODO:
  0         0  
415             # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
416             # => should extract 'scription' somehow
417             # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
418             # => should understand alternations; tricky
419              
420             my $self = shift;
421             my $rule = shift;
422              
423             my $main = $self->{main};
424              
425              
426             my $mods = '';
427 42     42 0 279  
428 42         176 # remove the regexp modifiers, keep for later
429             while ($rule =~ s/^\(\?([a-z]*)\)//) {
430 42         138 $mods .= $1;
431             }
432              
433 42         163 # modifier removal
434             while ($rule =~ s/^\(\?-([a-z]*)\)//) {
435             foreach my $modchar (split '', $mods) {
436 42         550 $mods =~ s/$modchar//g;
437 20         127 }
438             }
439              
440             my $lossy = 0;
441 42         194  
442 0         0 # now: simplify aspects of the regexp. Bear in mind that we can
443 0         0 # simplify as long as we cause the regexp to become more general;
444             # more hits is OK, since false positives will be discarded afterwards
445             # anyway. Simplification that causes the regexp to *not* hit
446             # stuff that the "real" rule would hit, however, is a bad thing.
447 42         74  
448             if ($main->{bases_must_be_casei}) {
449             $rule = lc $rule;
450              
451             $lossy = 1;
452             $mods =~ s/i// and $lossy = 0;
453              
454             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
455 42 100       183 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++;
456 37         163  
457             # always case-i: /A(?-i:ct)/ => /Act/
458 37         79 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++;
459 37 100       360  
460             # remove (?i)
461             $rule =~ s/\(\?i\)//gs;
462 37 100       247 }
463             else {
464             die "case-i" if $rule =~ /\(\?i\)/;
465 37 50       123 die "case-i" if $mods =~ /i/;
466              
467             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
468 37         99 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i";
469              
470             # we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
471 5 50       26 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
472 5 100       32 }
473              
474             # remove /m and /s modifiers
475 4 50       21 $mods =~ s/m// and $lossy++;
476             $mods =~ s/s// and $lossy++;
477              
478 4         10 # remove (^|\b)'s
479             # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
480             $rule =~ s/\(\^\|\\b\)//gs and $lossy++;
481             $rule =~ s/\(\$\|\\b\)//gs and $lossy++;
482 41 50       279 $rule =~ s/\(\\b\|\^\)//gs and $lossy++;
483 41 100       282 $rule =~ s/\(\\b\|\$\)//gs and $lossy++;
484              
485             # remove (?!credit)
486             $rule =~ s/\(\?\![^\)]+\)//gs and $lossy++;
487 41 50       317  
488 41 50       154 # remove \b's
489 41 50       163 $rule =~ s/(?<!\\)\\b//gs and $lossy++;
490 41 50       131  
491             # remove the "?=" trick
492             # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
493 41 100       170 $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
494              
495             $mods .= "L" if $lossy;
496 41 100       187 ($rule, $mods);
497             }
498              
499             my $self = shift;
500 41         91 my $rawrule = shift;
501             my $rule = shift;
502 41 100       228 my $mods = shift;
503 41         393  
504             my $main = $self->{main};
505             my $orig = $rule;
506              
507 41     41 0 79 my $lossy = 0;
508 41         67 $mods =~ s/L// and $lossy++;
509 41         213  
510 41         192 # if there are anchors, give up; we can't get much
511             # faster than these anyway
512 41         100 die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;
513 41         245  
514             # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
515 41         80 # just remove end-of-string anchors; they're slow so could gain
516 41 100       430 # from our speedup
517             $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;
518              
519             # simplify (?:..) to (..)
520 41 50       254 $main->{bases_allow_noncapture_groups} or
521             $rule =~ s/\(\?:/\(/g;
522              
523             # simplify some grouping arrangements so they're easier for us to parse
524             # (foo)? => (foo|)
525 41 50       208 $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
526             # r? => (r|)
527             $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
528              
529 41 50       271 my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
530             $tmpfh or die "failed to create a temporary file";
531             untaint_var(\$tmpf);
532              
533 41         203 print $tmpfh "use bytes; m{" . $rule . "}" . $mods
534             or die "error writing to $tmpf: $!";
535 41         185 close $tmpfh or die "error closing $tmpf: $!";
536              
537 41         537 my $perl = $self->get_perl();
538 41 50       200 local *IN;
539 41         180 open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")
540             or die "cannot run $perl: ".exit_status_str($?,$!);
541 41 50       751  
542             my($inbuf,$nread,$fullstr); $fullstr = '';
543 41 50       2166 while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
544             defined $nread or die "error reading from pipe: $!";
545 41         443  
546 41         207 unlink $tmpf or die "cannot unlink $tmpf: $!";
547 41 50       193994 close IN or die "error closing pipe: $!";
548             defined $fullstr or warn "empty result from a pipe";
549              
550 41         707 # now parse the -Mre=debug output.
  41         412  
551 41         385321 # perl 5.10 format
  41         1537  
552 41 50       236 $fullstr =~ s/^.*\nFinal program:\n//gs;
553             # perl 5.6/5.8 format
554 41 50       6859 $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
555 41 50       2001 $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
556 41 50       281 # common to all
557             $fullstr =~ s/\nOffsets:.*$//gs;
558              
559             # clean up every other line that doesn't start with a space
560 41         2302 $fullstr =~ s/^\S.*$//gm;
561              
562 41         275 if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
563 41         347 die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
564             }
565 41         182 my $opsstr = $1;
566              
567             # what's left looks like this:
568 41         1037 # 1: EXACTF <v>(3)
569             # 3: ANYOF[1ILil](14)
570 41 50       1128 # 14: EXACTF <a>(16)
571 0         0 # 16: CURLY {2,7}(29)
572             # 18: ANYOF[A-Za-z](0)
573 41         1213 # 29: SPACE(30)
574             # 30: EXACTF <http://>(33)
575             # 33: END(0)
576             #
577             DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
578              
579             my @ops;
580             foreach my $op (split(/\n/s, $opsstr)) {
581             next unless $op;
582              
583             if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
584             # perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
585 41         259 # perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
586             push @ops, [ $1, $2, $3 ];
587 41         91 }
588 41         2886 elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
589 439 50       1808 # 5: TRIE-EXACT[im](44)
590             # <message contained attachments that have been blocked by guin>...
591 439 100       4472 my $spcs = $1;
    100          
    50          
    0          
592             # we could use the entire length here, but it's easier to trim to
593             # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
594 321         7532 # suite results will match, since the sa-update --list extraction will
595             # be the same for all versions. (The "..." trailer is important btw)
596             my $str = substr ($2, 0, 55);
597             push @ops, [ $spcs, '_moretrie', "<$str...>" ];
598             }
599 4         38 elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
600             # 5: TRIE-EXACT[am](21)
601             # <am> (21)
602             # <might> (12)
603             push @ops, [ $1, '_moretrie', $2 ];
604 4         59 }
605 4         92 elsif ($op =~ /^ at .+ line \d+$/) {
606             next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
607             }
608             else {
609             warn "cannot parse '$op': $opsstr";
610             next;
611 114         1417 }
612             }
613              
614 0         0 # unroll the branches; returns a list of versions.
615             # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
616             my @unrolled;
617 0         0 if ($main->{bases_split_out_alternations}) {
618 0         0 @unrolled = $self->unroll_branches(0, \@ops);
619             } else {
620             @unrolled = ( \@ops );
621             }
622              
623             # now find the longest DFA-friendly string in each unrolled version
624 41         370 my @longests;
625 41 100       336 foreach my $opsarray (@unrolled) {
626 35         742 my $longestexact = '';
627             my $buf = '';
628 6         75  
629             # use a closure to keep the code succinct
630             my $add_candidate = sub {
631             if (length $buf > length $longestexact) { $longestexact = $buf; }
632 41         137 $buf = '';
633 41         132 };
634 136         561  
635 136         400 my $prevop;
636             foreach my $op (@{$opsarray}) {
637             my ($spcs, $item, $args) = @{$op};
638              
639 430 100   430   1396 next if ($item eq 'NOTHING');
  159         391  
640 430         839  
641 136         1331 # EXACT == case-sensitive
642             # EXACTF == case-i
643 136         258 # we can do both, since we canonicalize to lc.
644 136         178 if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
  136         342  
645 635         1070 {
  635         2404  
646             my $str = $1;
647 635 100       1777 $buf .= $str;
648             if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
649             # a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
650             $add_candidate->();
651             }
652 626 100 100     10931 if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
    100 66        
    50 100        
    100 66        
      66        
      100        
      66        
      33        
      33        
      0        
      66        
      100        
      66        
      66        
      33        
      33        
653             # perl 5.8.x truncates with a "..." here! cut and stop
654 142         853 $add_candidate->();
655 142         453 }
656 142 100       580 }
657             # _moretrie == a TRIE-EXACT entry
658 1         30 elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
659             {
660 142 50 33     657 $buf .= $1;
661             if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
662 0         0 # perl 5.8.x truncates with a "..." here! cut and stop
663             $add_candidate->();
664             }
665             }
666             # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
667             elsif ($item =~ /^EXACT/ &&
668 190         529 $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
669 190 100 66     754 $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
670             $args =~ /<(.*)>/)
671 4         28 {
672             $buf .= $1;
673             if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
674             # perl 5.8.x truncates with a "..." here! cut and stop
675             $add_candidate->();
676             }
677             }
678             # CURLYX, for perl >= 5.9.5
679             elsif ($item =~ /^_moretrie/ &&
680 0         0 $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
681 0 0 0     0 $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
682             $args =~ /<(.*)>/)
683 0         0 {
684             $buf .= $1;
685             if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
686             # perl 5.8.x truncates with a "..." here! cut and stop
687             $add_candidate->();
688             }
689             }
690             else {
691             # not an /^EXACT/; clear the buffer
692 5         34 $add_candidate->();
693 5 50 33     44 if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
694             {
695 0         0 $lossy = 1;
696             DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
697             }
698             }
699             $prevop = $op;
700 289         1014 }
701 289 100       1307 $add_candidate->();
702              
703 173         606 if (!$longestexact) {
704 173         272 die "no long-enough string found in $rawrule\n";
705             # all unrolled versions must have a long string, otherwise
706             # we cannot reliably match all variants of the rule
707 626         1809 } else {
708             push @longests, ($main->{bases_must_be_casei}) ?
709 136         315 lc $longestexact : $longestexact;
710             }
711 136 100       328 }
712 1         89  
713             DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
714             return ($lossy, @longests);
715             }
716 135 100       1553  
717             ###########################################################################
718              
719             my ($self, $depth, $opslist) = @_;
720              
721 40         106 die "too deep" if ($depth++ > 5);
722 40         2812  
723             my @ops = (@{$opslist}); # copy
724             my @pre_branch_ops;
725             my $branch_spcs;
726             my $trie_spcs;
727             my $open_spcs;
728 187     187 0 591  
729             # our input looks something like this 2-level structure:
730 187 50       533 # 1: BOUND(2)
731             # 2: EXACT <Dear >(5)
732 187         283 # 5: BRANCH(9)
  187         673  
733 187         609 # 6: EXACT <IT>(8)
734             # 8: NALNUM(24)
735 187         0 # 9: BRANCH(23)
736 187         0 # 10: EXACT <Int>(12)
737             # 12: BRANCH(14)
738             # 13: NOTHING(21)
739             # 14: BRANCH(17)
740             # 15: EXACT <a>(21)
741             # 17: BRANCH(20)
742             # 18: EXACT <er>(21)
743             # 20: TAIL(21)
744             # 21: EXACT <net>(24)
745             # 23: TAIL(24)
746             # 24: EXACT < shop>(27)
747             # 27: END(0)
748             #
749             # or:
750             #
751             # 1: OPEN1(3)
752             # 3: BRANCH(6)
753             # 4: EXACT <v>(9)
754             # 6: BRANCH(9)
755             # 7: EXACT <\\/>(9)
756             # 9: CLOSE1(11)
757             # 11: CURLY {2,5}(14)
758             # 13: REG_ANY(0)
759             # 14: EXACT < g r a >(17)
760             # 17: ANYOF[a-z](28)
761             # 28: END(0)
762             #
763             # or:
764             #
765             # 1: EXACT <i >(3)
766             # 3: OPEN1(5)
767             # 5: TRIE-EXACT[am](21)
768             # <am> (21)
769             # <might> (12)
770             # 12: OPEN2(14)
771             # 14: TRIE-EXACT[ ](19)
772             # < be>
773             # <>
774             # 19: CLOSE2(21)
775             # 21: CLOSE1(23)
776             # 23: EXACT < c>(25)
777              
778             DEBUG_RE_PARSING and warn "starting parse";
779              
780             # this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
781             # it into the latter. bit of a kludge to do this before the loop, but hey.
782             # note that it doesn't fix the CLOSE1/END ordering to be correct
783             if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
784             my @newops = ([ "", "OPEN1", "" ]);
785             foreach my $op (@ops) {
786             push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
787 187         273 }
788             push @newops, [ "", "CLOSE1", "" ];
789             @ops = @newops;
790             }
791              
792 187 50 66     1317 # iterate until we start a branch set. using
793 0         0 # /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
794 0         0 # just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
795 0         0 # or a BRANCH, *then* unroll the most recent OPEN set.
796             while (1) {
797 0         0 my $op = shift @ops;
798 0         0 last unless defined $op;
799              
800             my ($spcs, $item, $args) = @{$op};
801             DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
802              
803             if ($item =~ /^OPEN/) {
804             $open_spcs = $spcs;
805 187         374 next; # next will be a BRANCH or TRIE
806 968         1431  
807 968 100       1901 } elsif ($item =~ /^TRIE/) {
808             $trie_spcs = $spcs;
809 838         1163 last;
  838         2854  
810 838         1230  
811             } elsif ($item =~ /^BRANCH/) {
812 838 100 100     5744 $branch_spcs = $spcs;
    100          
    100          
    100          
    100          
813 54         346 last;
814 54         260  
815             } elsif ($item =~ /^EXACT/ && defined $open_spcs) {
816             # perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
817 49         181 push @pre_branch_ops, [ $open_spcs, $item, $args ];
818 49         160 next;
819              
820             } elsif (defined $open_spcs) {
821 8         74 # OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
822 8         111 # ignore this OPEN block entirely and don't try to unroll it
823             undef $open_spcs;
824              
825             } else {
826 7         115 push @pre_branch_ops, $op;
827 7         92 }
828             }
829              
830             # no branches found? we're done unrolling on this one!
831             if (scalar @ops == 0) {
832 2         25 return [ @pre_branch_ops ];
833             }
834              
835 718         2193 # otherwise we're at the start of a new branch set
836             # /(foo|bar(baz|argh)boo)gab/
837             my @alts;
838             my @in_this_branch;
839              
840 187 100       433 DEBUG_RE_PARSING and warn "entering branch: ".
841 130         950 "open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
842             "branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
843             "trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
844              
845             # indentation level to remove from "normal" ops (using a s///)
846 57         129 my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
847             my $trie_sub_spcs = "";
848             while (1) {
849 57         83 my $op = shift @ops;
850             last unless defined $op;
851             my ($spcs, $item, $args) = @{$op};
852             DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
853              
854             if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
855 57 100       491 push @alts, [ @pre_branch_ops, @in_this_branch ];
856 57         268 @in_this_branch = ();
857 57         124 $open_sub_spcs = $branch_spcs." ";
858 387         614 $trie_sub_spcs = "";
859 387 50       796 next;
860 387         565 }
  387         1440  
861 387         645 elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
862             push @alts, [ @pre_branch_ops, @in_this_branch ];
863 387 100 100     6135 undef $branch_spcs;
    50 66        
    100 66        
    100 33        
    100 100        
      100        
      100        
      66        
864 9         71 $open_sub_spcs = "";
865 9         33 $trie_sub_spcs = "";
866 9         31 last;
867 9         29 }
868 9         45 elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
869             if (scalar @in_this_branch > 0) {
870             push @alts, [ @pre_branch_ops, @in_this_branch ];
871 0         0 }
872 0         0 # use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
873 0         0 @in_this_branch = ( [ $open_spcs, $item, $args ] );
874 0         0 $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
875 0         0 $trie_sub_spcs = " ";
876             next;
877             }
878 135 100       390 elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
879 86         349 push @alts, [ @pre_branch_ops, @in_this_branch ];
880             undef $branch_spcs;
881             undef $open_spcs;
882 135         874 undef $trie_spcs;
883 135 50       345 $open_sub_spcs = "";
884 135         313 $trie_sub_spcs = "";
885 135         365 last;
886             }
887             elsif ($item eq 'END') { # of string
888 52         272 push @alts, [ @pre_branch_ops, @in_this_branch ];
889 52         133 undef $branch_spcs;
890 52         143 undef $open_spcs;
891 52         105 undef $trie_spcs;
892 52         152 $open_sub_spcs = "";
893 52         116 $trie_sub_spcs = "";
894 52         184 last;
895             }
896             else {
897 5         42 if ($open_sub_spcs) {
898 5         34 # deindent the space-level to match the opening brace
899 5         13 $spcs =~ s/^$open_sub_spcs//;
900 5         13 # tries also add one more indent level in
901 5         13 $spcs =~ s/^$trie_sub_spcs//;
902 5         22 }
903 5         28 push @in_this_branch, [ $spcs, $item, $args ];
904             # note that we ignore ops at a deeper $spcs level entirely (until later!)
905             }
906 186 50       471 }
907              
908 186         1418 if (defined $branch_spcs) {
909             die "fell off end of string with a branch open: '$branch_spcs'";
910 186         972 }
911              
912 186         1361 # we're now after the branch set: /gab/
913             # @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
914             foreach my $alt (@alts) {
915             push @{$alt}, @ops; # add all remaining ops to each one
916             # note that this could include more (?:...); we don't care, since
917 57 50       190 # those can be handled by recursing
918 0         0 }
919              
920             # ok, parsed the entire ops list
921             # @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]
922              
923 57         250 if (DEBUG_RE_PARSING) {
924 152         399 print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
  152         510  
925             }
926              
927             # now recurse, to unroll the remaining branches (if any exist)
928             my @rets;
929             foreach my $alt (@alts) {
930             push @rets, $self->unroll_branches($depth, $alt);
931             }
932 57         95  
933             if (DEBUG_RE_PARSING) {
934             print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
935             }
936              
937 57         123 return @rets;
938 57         112 }
939 152         597  
940             ###########################################################################
941              
942 57         112 my ($self) = @_;
943              
944             $self->test_split_alt("foo", "/foo/");
945             $self->test_split_alt("(foo)", "/foo/");
946 57         468 $self->test_split_alt("foo(bar)baz", "/foobarbaz/");
947             $self->test_split_alt("x(foo|)", "/xfoo/ /x/");
948             $self->test_split_alt("fo(o|)", "/foo/ /fo/");
949             $self->test_split_alt("(foo|bar)", "/foo/ /bar/");
950             $self->test_split_alt("foo|bar", "/foo/ /bar/");
951             $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
952 0     0 0 0 $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
953             $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
954 0         0 $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
955 0         0 }
956 0         0  
957 0         0 my ($self, $in, $out) = @_;
958 0         0  
959 0         0 my @got = $self->split_alt($in);
960 0         0 $out =~ s/^\///;
961 0         0 $out =~ s/\/$//;
962 0         0 my @want = split(/\/ \//, $out);
963 0         0  
964 0         0 my $failed = 0;
965             if (scalar @want != scalar @got) {
966             warn "FAIL: results count don't match";
967             $failed++;
968 0     0 0 0 }
969             else {
970 0         0 my %got = map { $_ => 1 } @got;
971 0         0 foreach my $w (@want) {
972 0         0 if (!$got{$w}) {
973 0         0 warn "FAIL: '$w' not found";
974             $failed++;
975 0         0 }
976 0 0       0 }
977 0         0 }
978 0         0  
979             if ($failed) {
980             print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
981 0         0 print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
  0         0  
982 0         0 return 0;
983 0 0       0 } else {
984 0         0 print "ok\n" or die "error writing: $!";
985 0         0 return 1;
986             }
987             }
988              
989             ###########################################################################
990 0 0       0  
991 0 0       0 my ($self) = @_;
992 0 0       0 my $perl;
993 0         0  
994             # allow user override of the perl interpreter to use when
995 0 0       0 # extracting base strings.
996 0         0 # TODO: expose this via sa-compile command-line option
997             my $fromconf = $self->{main}->{conf}->{re_parser_perl};
998              
999             if ($fromconf) {
1000             $perl = $fromconf;
1001             } elsif ($^X =~ m|^/|) {
1002             $perl = $^X;
1003 41     41 0 178 } else {
1004 41         85 use Config;
1005             $perl = $Config{perlpath};
1006             $perl =~ s|/[^/]*$|/$^X|;
1007             }
1008             untaint_var(\$perl);
1009 41         139 return $perl;
1010             }
1011 41 50       523  
    50          
1012 0         0 ###########################################################################
1013              
1014 41         220 my ($self, $cachefile) = @_;
1015             local *IN;
1016 1     1   9 if (open(IN, "<".$cachefile)) {
  1         3  
  1         962  
1017 0         0 my($inbuf,$nread,$str); $str = '';
1018 0         0 while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
1019             defined $nread or die "error reading from $cachefile: $!";
1020 41         355 close IN or die "error closing $cachefile: $!";
1021 41         283  
1022             untaint_var(\$str);
1023             my $VAR1; # Data::Dumper
1024             if (eval $str) {
1025             return $VAR1; # Data::Dumper's naming
1026             }
1027 0     0 0 0 }
1028 0         0 return { };
1029 0 0       0 }
1030 0         0  
  0         0  
1031 0         0 my ($self, $cachefile, $cached) = @_;
  0         0  
1032 0 0       0  
1033 0 0       0 my $dump = Data::Dumper->new ([ $cached ]);
1034             $dump->Deepcopy(1);
1035 0         0 $dump->Purity(1);
1036 0         0 $dump->Indent(1);
1037 0 0       0  
1038 0         0 my $cachedir = $self->{main}->{bases_cache_dir};
1039             if (mkdir($cachedir)) {
1040             # successfully created
1041 0         0 } elsif ($! == EEXIST) {
1042             dbg("zoom: ok, cache directory already existed");
1043             } else {
1044             warn "zoom: could not create cache directory: $cachedir ($!)\n";
1045 0     0 0 0 return;
1046             }
1047 0         0 open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1048 0         0 print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1049 0         0 close CACHE or die "error closing $cachefile: $!";
1050 0         0 }
1051              
1052 0         0 =over 4
1053 0 0       0  
    0          
1054             =item my ($cleanregexp) = fixup_re($regexp);
1055              
1056 0         0 Converts encoded characters in a regular expression pattern into their
1057             equivalent characters
1058 0         0  
1059 0         0 =back
1060              
1061 0 0       0 =cut
1062 0 0       0  
1063 0 0       0 my $re = shift;
1064            
1065             if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
1066            
1067             my $output = "";
1068             my $TOK = qr([\"\\]);
1069              
1070             my $STATE;
1071             local ($1,$2);
1072             while ($re =~ /\G(.*?)($TOK)/gcs) {
1073             my $pre = $1;
1074             my $tok = $2;
1075              
1076             if (length($pre)) {
1077             $output .= "\"$pre\"";
1078 135     135 1 478 }
1079              
1080 135 0       360 if ($tok eq '"') {
  0 50       0  
1081             $output .= '"\\""';
1082 135         411 }
1083 135         1324 elsif ($tok eq '\\') {
1084             $re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!";
1085 135         234 my $esc = $1;
1086 135         707 if ($esc eq '"') {
1087 135         1377 $output .= '"\\""';
1088 0         0 } elsif ($esc eq '\\') {
1089 0         0 $output .= '"**BACKSLASH**"'; # avoid hairy escape-parsing
1090             } elsif ($esc =~ /^x\{(\S+)\}\z/) {
1091 0 0       0 $output .= '"'.chr(hex($1)).'"';
1092 0         0 } elsif ($esc =~ /^[0-7]{1,3}\z/) {
1093             $output .= '"'.chr(oct($esc)).'"';
1094             } else {
1095 0 0       0 $output .= "\"$esc\"";
    0          
1096 0         0 }
1097             }
1098             else {
1099 0 0       0 print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1100 0         0 }
1101 0 0       0 }
    0          
    0          
    0          
1102 0         0
1103             if (!defined(pos($re))) {
1104 0         0 # no matches
1105             $output .= "\"$re\"";
1106 0         0 # Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1107             $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
1108 0         0 }
1109             elsif (pos($re) <= length($re)) {
1110 0         0 $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
1111             $output .= fixup_re(substr($re, pos($re)));
1112             }
1113              
1114 0 0       0 $output =~ s/^""/"/; # protect start and end quotes
1115             $output =~ s/(?<!\\)""\z/"/;
1116             $output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1117             $output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;
1118 135 50       361  
    0          
1119             if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
1120 135         677 return $output;
1121             }
1122 135         574  
  0         0  
1123             1;