|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # <@LICENSE>  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Copyright 2006 Apache Software Foundation  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Licensed under the Apache License, Version 2.0 (the "License");  | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # you may not use this file except in compliance with the License.  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # You may obtain a copy of the License at  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #     http://www.apache.org/licenses/LICENSE-2.0  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Unless required by applicable law or agreed to in writing, software  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # distributed under the License is distributed on an "AS IS" BASIS,  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # See the License for the specific language governing permissions and  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # limitations under the License.  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # </@LICENSE>  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is a plugin to extract "base" strings from SpamAssassin 'body' rules,  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 suitable for use in Rule2XSBody rules or other parallel matching algorithms.  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
9
 | 
 use Mail::SpamAssassin::Plugin;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
    | 
| 
31
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
6
 | 
 use Mail::SpamAssassin::Logger;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
66
 | 
    | 
| 
32
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
9
 | 
 use Mail::SpamAssassin::Util qw(untaint_var);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
51
 | 
    | 
| 
33
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
354
 | 
 use Mail::SpamAssassin::Util::Progress;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
    | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
35
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
10
 | 
 use Errno qw(ENOENT EACCES EEXIST);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
120
 | 
    | 
| 
36
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
7
 | 
 use Data::Dumper;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
    | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
7
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
    | 
| 
39
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
 use warnings;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # use bytes;  | 
| 
41
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
 use re 'taint';  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
    | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA = qw(Mail::SpamAssassin::Plugin);  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
6
 | 
 use constant DEBUG_RE_PARSING => 0;     # noisy!  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5410
 | 
    | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # a few settings that control what kind of bases are output.  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # treat all rules as lowercase for purposes of term extraction?  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{bases_must_be_casei} = 1;  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $main->{base_quiet} = 0;      # silences progress output  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # TODO: it would be nice to have a clean API to pass such settings  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # through to plugins instead of hanging them off $main  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ##############################################################################  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # testing purposes only  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 my $fixup_re_test;  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #$fixup_re_test = 1; fixup_re("\\33\$b"); die;  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #$fixup_re_test = 1; fixup_re("[link]"); die;  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
73
 | 
15
 | 
 
 | 
 
 | 
  
15
  
 | 
  
1
  
 | 
115
 | 
   my $class = shift;  | 
| 
74
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
97
 | 
   my $mailsaobject = shift;  | 
| 
75
 | 
15
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
135
 | 
   $class = ref($class) || $class;  | 
| 
76
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
89
 | 
   my $self = $class->SUPER::new($mailsaobject);  | 
| 
77
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
51
 | 
   bless ($self, $class);  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
79
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
118
 | 
   $self->{show_progress} = !$mailsaobject->{base_quiet};  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # $self->test(); exit;  | 
| 
82
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
180
 | 
   return $self;  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub finish_parsing_end {  | 
| 
88
 | 
15
 | 
 
 | 
 
 | 
  
15
  
 | 
  
1
  
 | 
53
 | 
   my ($self, $params) = @_;  | 
| 
89
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
   my $conf = $params->{conf};  | 
| 
90
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
49
 | 
   $self->extract_bases($conf);  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_bases {  | 
| 
94
 | 
15
 | 
 
 | 
 
 | 
  
15
  
 | 
  
0
  
 | 
32
 | 
   my ($self, $conf) = @_;  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
   my $main = $conf->{main};  | 
| 
97
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
44
 | 
   if (!$main->{base_extract}) { return; }  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $self->{show_progress} and  | 
| 
100
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
50
 | 
         info("base extraction starting.  this can take a while...");  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
   $self->extract_set($conf, $conf->{body_tests}, 'body');  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_set {  | 
| 
106
 | 
15
 | 
 
 | 
 
 | 
  
15
  
 | 
  
0
  
 | 
41
 | 
   my ($self, $conf, $test_set, $ruletype) = @_;  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
108
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
   foreach my $pri (keys %{$test_set}) {  | 
| 
 
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
    | 
| 
109
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;  | 
| 
 
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
103
 | 
    | 
| 
110
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
100
 | 
     $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_set_pri {  | 
| 
117
 | 
15
 | 
 
 | 
 
 | 
  
15
  
 | 
  
0
  
 | 
49
 | 
   my ($self, $conf, $rules, $ruletype) = @_;  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
119
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
   my @good_bases;  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @failed;  | 
| 
121
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
   my $yes = 0;  | 
| 
122
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
32
 | 
   my $no = 0;  | 
| 
123
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
   my $count = 0;  | 
| 
124
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
60
 | 
   my $start = time;  | 
| 
125
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
   $self->{main} = $conf->{main};	# for use in extract_hints()  | 
| 
126
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
49
 | 
   $self->{show_progress} and info ("extracting from rules of type $ruletype");  | 
| 
127
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
   my $tflags = $conf->{tflags};  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # attempt to find good "base strings" (simplified regexp subsets) for each  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # regexp.  We try looking at the regexp from both ends, since there  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # may be a good long string of text at the end of the rule.  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # require this many chars in a base string + delimiters for it to be viable  | 
| 
134
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
   my $min_chars = 5;  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
136
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
   my $progress;  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({  | 
| 
138
 | 
15
 | 
  
 50
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
44
 | 
                 total => (scalar keys %{$rules} || 1),  | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 itemtype => 'rules',  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               });  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
142
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
   my $cached = { };  | 
| 
143
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
   my $cachefile;  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
145
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
42
 | 
   if ($self->{main}->{bases_cache_dir}) {  | 
| 
146
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";  | 
| 
147
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     dbg("zoom: reading cache file $cachefile");  | 
| 
148
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $cached = $self->read_cachefile($cachefile);  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 NEXT_RULE:  | 
| 
152
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
   foreach my $name (keys %{$rules}) {  | 
| 
 
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
    | 
| 
153
 | 
42
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
320
 | 
     $self->{show_progress} and $progress and $progress->update(++$count);  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
155
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
181
 | 
     my $rule = $rules->{$name};  | 
| 
156
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
179
 | 
     my $cachekey = join "#", $name, $rule;  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
158
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
139
 | 
     my $cent = $cached->{rule_bases}->{$cachekey};  | 
| 
159
 | 
42
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
126
 | 
     if (defined $cent) {  | 
| 
160
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       if (defined $cent->{g}) {  | 
| 
161
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         dbg("zoom: YES (cached) $rule $name");  | 
| 
162
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         foreach my $ent (@{$cent->{g}}) {  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # note: we have to copy these, since otherwise later  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # modifications corrupt the cached data  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           push @good_bases, {  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}  | 
| 
167
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
           };  | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
169
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $yes++;  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       else {  | 
| 
172
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         dbg("zoom: NO (cached) $rule $name");  | 
| 
173
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         push @failed, { orig => $rule };    # no need to cache this  | 
| 
174
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $no++;  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
176
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       next NEXT_RULE;  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # ignore ReplaceTags rules  | 
| 
180
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
121
 | 
     my $is_a_replacetags_rule = $conf->{rules_to_replace}->{$name};  | 
| 
181
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
80
 | 
     my ($minlen, $lossy, @bases);  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
183
 | 
42
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
146
 | 
     if (!$is_a_replacetags_rule) {  | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       eval {  # catch die()s  | 
| 
185
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
446
 | 
         my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);  | 
| 
186
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
205
 | 
         ($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));  | 
| 
188
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
554
 | 
         1;  | 
| 
189
 | 
42
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
87
 | 
       } or do {  | 
| 
190
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
         my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
191
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
         dbg("zoom: giving up on regexp: $eval_stat");  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       };  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
194
 | 
42
 | 
  
 50
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
903
 | 
       if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
195
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "\nzoom: rule $name will loop on SpamAssassin older than 3.3.2 ".  | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
              "running under Perl 5.12 or older, Bug 6558\n";  | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
198
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # if any of the extracted hints in a set are too short, the entire  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # set is invalid; this is because each set of N hints represents just  | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # 1 regexp.  | 
| 
202
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
339
 | 
       foreach my $str (@bases) {  | 
| 
203
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
656
 | 
         my $len = length fixup_re($str); # bug 6143: count decoded characters  | 
| 
204
 | 
135
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
766
 | 
         if ($len < $min_chars) { $minlen = undef; @bases = (); last; }  | 
| 
 
 | 
3
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
44
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
    | 
| 
 
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
    | 
| 
205
 | 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
178
 | 
         elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
209
 | 
42
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
1036
 | 
     if ($is_a_replacetags_rule || !$minlen || !@bases) {  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
210
 | 
5
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
138
 | 
       dbg("zoom: ignoring rule %s, %s", $name,  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $is_a_replacetags_rule ? 'is a replace rule'  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           : !@bases ? 'no bases' : 'no minlen');  | 
| 
213
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
80
 | 
       push @failed, { orig => $rule };  | 
| 
214
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
       $cached->{rule_bases}->{$cachekey} = { };  | 
| 
215
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
43
 | 
       $no++;  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # figure out if we have e.g. ["foo", "foob", "foobar"]; in this  | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # case, we only need to track ["foo"].  | 
| 
222
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
120
 | 
       my %subsumed;  | 
| 
223
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
100
 | 
       foreach my $base1 (@bases) {  | 
| 
224
 | 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
302
 | 
         foreach my $base2 (@bases) {  | 
| 
225
 | 
2524
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
19328
 | 
           if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {  | 
| 
226
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
83
 | 
             $subsumed{$base1} = 1; # base2 is inside base1; discard the longer  | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           }  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
231
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
105
 | 
       my @forcache;  | 
| 
232
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
       foreach my $base (@bases) {  | 
| 
233
 | 
132
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
413
 | 
         next if $subsumed{$base};  | 
| 
234
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1326
 | 
         push @good_bases, {  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             base => $base, orig => $rule, name => "$name,[l=$lossy]"  | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           };  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # *separate* copies for cache -- we modify the @good_bases entry  | 
| 
238
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
869
 | 
         push @forcache, {  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             base => $base, orig => $rule, name => "$name,[l=$lossy]"  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           };  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
243
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1046
 | 
       $cached->{rule_bases}->{$cachekey} = { g => \@forcache };  | 
| 
244
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
438
 | 
       $yes++;  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
248
 | 
15
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
331
 | 
   $self->{show_progress} and $progress and $progress->final();  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
250
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
862
 | 
   dbg("zoom: $ruletype: found ".(scalar @good_bases).  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       " usable base strings in $yes rules, skipped $no rules");  | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.  | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # ("food" =~ "foo" / "food") will return "food".  So therefore if a pattern  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # subsumes other patterns, we need to return hits for all of them.  We also  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # need to take care of the case where multiple regexps wind up sharing the  | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # same base.     | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Another gotcha, an exception to the subsumption rule; if one pattern isn't  | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # returned as two hits, correctly.  So we only have to be smart about the  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # full-subsumption case; overlapping is taken care of for us, by re2c.  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # TODO: there's a bug here.  Since the code in extract_hints() has been  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # modified to support more complex regexps, we can no longer simply assume  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # that if pattern A is not contained in pattern B, that means that pattern B  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # doesn't subsume it.  Consider, for example, A="foo*bar" and  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # that without running the A RE match itself somehow against B.  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # same issue remains with:  | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "foo?bar" / "fobar"  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "fo(?:o|oo|)bar" / "fobar"  | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "fo(?:o|oo)?bar" / "fobar"  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "fo(?:o*|baz)bar" / "fobar"  | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "(?:fo(?:o*|baz)bar|blargh)" / "fobar"  | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # it's worse with this:  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   "fo(?:o|oo|)bar" / "foo*bar"  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # basically, this is impossible to compute without reimplementing most of  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # re2c, and it appears the re2c developers don't plan to offer this:  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
286
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
305
 | 
   $conf->{base_orig}->{$ruletype} = { };  | 
| 
287
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
174
 | 
   $conf->{base_string}->{$ruletype} = { };  | 
| 
288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
289
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
   $count = 0;  | 
| 
290
 | 
15
 | 
  
 50
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
56
 | 
   $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 total => (scalar @good_bases || 1),  | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 itemtype => 'bases',  | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
               });  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # this bit is annoyingly O(N^2).  Rewrite the data -- the @good_bases  | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # array -- into a more efficient format, using arrays and with a little  | 
| 
297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # bit of precomputation, to go (quite a bit) faster  | 
| 
298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
299
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
   my @rewritten;  | 
| 
300
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
42
 | 
   foreach my $set1 (@good_bases) {  | 
| 
301
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
376
 | 
     my $base = $set1->{base};  | 
| 
302
 | 
128
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
611
 | 
     next if (!$base || !$set1->{name});  | 
| 
303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     push @rewritten, [  | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $base,                # 0  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $set1->{name},        # 1  | 
| 
306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       $set1->{orig},        # 2  | 
| 
307
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2275
 | 
       length $base,         # 3  | 
| 
308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       qr/\Q$base\E/,        # 4  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       0                     # 5, has_multiple flag  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ];  | 
| 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
312
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
226
 | 
   @good_bases = @rewritten;  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
314
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
82
 | 
   foreach my $set1 (@good_bases) {  | 
| 
315
 | 
128
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
501
 | 
     $self->{show_progress} and $progress and $progress->update(++$count);  | 
| 
316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
317
 | 
128
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
271
 | 
     my $base1 = $set1->[0]; next unless $base1;  | 
| 
 
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
273
 | 
    | 
| 
318
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
226
 | 
     my $name1 = $set1->[1];  | 
| 
319
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
210
 | 
     my $orig1 = $set1->[2];  | 
| 
320
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
470
 | 
     $conf->{base_orig}->{$ruletype}->{$name1} = $orig1;  | 
| 
321
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
316
 | 
     my $len1 = $set1->[3];  | 
| 
322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
323
 | 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
233
 | 
     foreach my $set2 (@good_bases) {  | 
| 
324
 | 
3247
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
6224
 | 
       next if ($set1 == $set2);  | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
326
 | 
3143
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4737
 | 
       my $base2 = $set2->[0]; next unless $base2;  | 
| 
 
 | 
3143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5372
 | 
    | 
| 
327
 | 
2660
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3895
 | 
       my $name2 = $set2->[1];  | 
| 
328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # clobber exact dups; this can happen if a regexp outputs the   | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # same base string multiple times  | 
| 
331
 | 
2660
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
5090
 | 
       if ($base1 eq $base2 &&  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $name1 eq $name2 &&  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $orig1 eq $set2->[2])  | 
| 
334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       {  | 
| 
335
 | 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
82
 | 
         $set2->[0] = '';       # clobber  | 
| 
336
 | 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
         next;  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # skip if it's too short to contain the other base string  | 
| 
340
 | 
2636
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4776
 | 
       next if ($len1 < $set2->[3]);  | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # skip if either already contains the other rule's name  | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # optimize: this can only happen if the base has more than  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # one rule already attached, ie [5]  | 
| 
345
 | 
1510
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
3813
 | 
       next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);  | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # don't use $name1 here, since another base in the set2 loop  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # may have added $name2 since we set that  | 
| 
349
 | 
1423
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
3781
 | 
       next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);  | 
| 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # and finally check to see if it *does* contain the other base string  | 
| 
352
 | 
1369
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
4749
 | 
       next if ($base1 !~ $set2->[4]);  | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # base2 is just a subset of base1  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");  | 
| 
356
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137
 | 
       $set1->[1] .= " ".$name2;  | 
| 
357
 | 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
103
 | 
       $set1->[5] = 1;  | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS  | 
| 
362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # both contain "killed" for example, pointing at different rules, which  | 
| 
363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # the above search hasn't found.  Collapse them here with a hash  | 
| 
364
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
   my %bases;  | 
| 
365
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
   foreach my $set (@good_bases) {  | 
| 
366
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
330
 | 
     my $base = $set->[0];  | 
| 
367
 | 
128
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
330
 | 
     next unless $base;  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
369
 | 
104
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
270
 | 
     if (defined $bases{$base}) {  | 
| 
370
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
71
 | 
       $bases{$base} .= " ".$set->[1];  | 
| 
371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
372
 | 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
783
 | 
       $bases{$base} = $set->[1];  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
375
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
81
 | 
   undef @good_bases;  | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
377
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
100
 | 
   foreach my $base (keys %bases) {  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # uniq the list, since there are probably dup rules listed  | 
| 
379
 | 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
200
 | 
     my %u;  | 
| 
380
 | 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
338
 | 
     for my $i (split ' ', $bases{$base}) {  | 
| 
381
 | 
137
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
322
 | 
       next if exists $u{$i}; undef $u{$i};   | 
| 
 
 | 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
315
 | 
    | 
| 
382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
383
 | 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
661
 | 
     $conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;  | 
| 
384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
385
 | 
15
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
144
 | 
   $self->{show_progress} and $progress and $progress->final();  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
387
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
90
 | 
   if ($cachefile) {  | 
| 
388
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $self->write_cachefile ($cachefile, $cached);  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
391
 | 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
   my $elapsed = time - $start;  | 
| 
392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $self->{show_progress} and info ("$ruletype: ".  | 
| 
393
 | 
15
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1399
 | 
             (scalar keys %{$conf->{base_string}->{$ruletype}}).  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             " base strings extracted in $elapsed seconds\n");  | 
| 
395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
398
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # TODO:  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #     => should extract 'scription' somehow  | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #     => should understand alternations; tricky  | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub simplify_and_qr_regexp {  | 
| 
406
 | 
42
 | 
 
 | 
 
 | 
  
42
  
 | 
  
0
  
 | 
137
 | 
   my $self = shift;  | 
| 
407
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
311
 | 
   my $rule = shift;  | 
| 
408
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
409
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
106
 | 
   my $main = $self->{main};  | 
| 
410
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
940
 | 
   $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);  | 
| 
411
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove the regexp modifiers, keep for later  | 
| 
413
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
160
 | 
   my $mods = '';  | 
| 
414
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
453
 | 
   while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }  | 
| 
 
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
129
 | 
    | 
| 
415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # modifier removal  | 
| 
417
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
135
 | 
   while ($rule =~ s/^\(\?-([a-z]*)\)//) {  | 
| 
418
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     foreach my $modchar (split '', $mods) {  | 
| 
419
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $mods =~ s/$modchar//g;  | 
| 
420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
423
 | 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
   my $lossy = 0;  | 
| 
424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
425
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # now: simplify aspects of the regexp.  Bear in mind that we can  | 
| 
426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # simplify as long as we cause the regexp to become more general;  | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # more hits is OK, since false positives will be discarded afterwards  | 
| 
428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # anyway.  Simplification that causes the regexp to *not* hit  | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # stuff that the "real" rule would hit, however, is a bad thing.  | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
431
 | 
42
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
262
 | 
   if ($main->{bases_must_be_casei}) {  | 
| 
432
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
148
 | 
     $rule = lc $rule;  | 
| 
433
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
434
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
66
 | 
     $lossy = 1;  | 
| 
435
 | 
37
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
290
 | 
     $mods =~ s/i// and $lossy = 0;  | 
| 
436
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
437
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/  | 
| 
438
 | 
37
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
384
 | 
     $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++;  | 
| 
439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
440
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # always case-i: /A(?-i:ct)/ => /Act/  | 
| 
441
 | 
37
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
168
 | 
     $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++;  | 
| 
442
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
443
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # remove (?i)  | 
| 
444
 | 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
86
 | 
     $rule =~ s/\(\?i\)//gs;  | 
| 
445
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
446
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   else {  | 
| 
447
 | 
5
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
65
 | 
     die "case-i" if $rule =~ /\(\?i\)/;  | 
| 
448
 | 
5
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
43
 | 
     die "case-i" if $mods =~ /i/;  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
450
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/  | 
| 
451
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
13
 | 
     $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i";  | 
| 
452
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
453
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/  | 
| 
454
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;  | 
| 
455
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
456
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
457
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove /m and /s modifiers  | 
| 
458
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
122
 | 
   $mods =~ s/m// and $lossy++;  | 
| 
459
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
135
 | 
   $mods =~ s/s// and $lossy++;  | 
| 
460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove (^|\b)'s  | 
| 
462
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is  | 
| 
463
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
260
 | 
   $rule =~ s/\(\^\|\\b\)//gs and $lossy++;  | 
| 
464
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
142
 | 
   $rule =~ s/\(\$\|\\b\)//gs and $lossy++;  | 
| 
465
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
116
 | 
   $rule =~ s/\(\\b\|\^\)//gs and $lossy++;  | 
| 
466
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
152
 | 
   $rule =~ s/\(\\b\|\$\)//gs and $lossy++;  | 
| 
467
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove (?!credit)  | 
| 
469
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
114
 | 
   $rule =~ s/\(\?\![^\)]+\)//gs and $lossy++;  | 
| 
470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
471
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove \b's  | 
| 
472
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
187
 | 
   $rule =~ s/(?<!\\)\\b//gs and $lossy++;  | 
| 
473
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
474
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # remove the "?=" trick  | 
| 
475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)  | 
| 
476
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
101
 | 
   $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;  | 
| 
477
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
478
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
144
 | 
   $mods .= "L" if $lossy;  | 
| 
479
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
297
 | 
   ($rule, $mods);  | 
| 
480
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
481
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_hints {  | 
| 
483
 | 
41
 | 
 
 | 
 
 | 
  
41
  
 | 
  
0
  
 | 
89
 | 
   my $self = shift;  | 
| 
484
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
275
 | 
   my $rawrule = shift;  | 
| 
485
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
150
 | 
   my $rule = shift;  | 
| 
486
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
170
 | 
   my $mods = shift;  | 
| 
487
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
488
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
105
 | 
   my $main = $self->{main};  | 
| 
489
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
185
 | 
   my $orig = $rule;  | 
| 
490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
491
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
109
 | 
   my $lossy = 0;  | 
| 
492
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
257
 | 
   $mods =~ s/L// and $lossy++;  | 
| 
493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # if there are anchors, give up; we can't get much   | 
| 
495
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # faster than these anyway  | 
| 
496
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
416
 | 
   die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;  | 
| 
497
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
498
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;  | 
| 
499
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # just remove end-of-string anchors; they're slow so could gain  | 
| 
500
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # from our speedup  | 
| 
501
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
412
 | 
   $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;  | 
| 
502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
503
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # simplify (?:..) to (..)  | 
| 
504
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $main->{bases_allow_noncapture_groups} or  | 
| 
505
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
280
 | 
             $rule =~ s/\(\?:/\(/g;  | 
| 
506
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
507
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # simplify some grouping arrangements so they're easier for us to parse  | 
| 
508
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # (foo)? => (foo|)  | 
| 
509
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
228
 | 
   $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;  | 
| 
510
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # r? => (r|)  | 
| 
511
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
185
 | 
   $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;  | 
| 
512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
513
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
508
 | 
   my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();  | 
| 
514
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
142
 | 
   $tmpfh  or die "failed to create a temporary file";  | 
| 
515
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
173
 | 
   untaint_var(\$tmpf);  | 
| 
516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
517
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
586
 | 
   print $tmpfh "use bytes; m{" . $rule . "}" . $mods  | 
| 
518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or die "error writing to $tmpf: $!";  | 
| 
519
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1631
 | 
   close $tmpfh  or die "error closing $tmpf: $!";  | 
| 
520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
521
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
450
 | 
   my $perl = $self->get_perl();  | 
| 
522
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
196
 | 
   local *IN;  | 
| 
523
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
155031
 | 
   open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")  | 
| 
524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or die "cannot run $perl: ".exit_status_str($?,$!);  | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
526
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
704
 | 
   my($inbuf,$nread,$fullstr); $fullstr = '';  | 
| 
 
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
395
 | 
    | 
| 
527
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
448656
 | 
   while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }  | 
| 
 
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1146
 | 
    | 
| 
528
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
221
 | 
   defined $nread  or die "error reading from pipe: $!";  | 
| 
529
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
530
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3679
 | 
   unlink $tmpf  or die "cannot unlink $tmpf: $!";  | 
| 
531
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1966
 | 
   close IN      or die "error closing pipe: $!";  | 
| 
532
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
201
 | 
   defined $fullstr  or warn "empty result from a pipe";  | 
| 
533
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
534
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # now parse the -Mre=debug output.  | 
| 
535
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # perl 5.10 format  | 
| 
536
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2130
 | 
   $fullstr =~ s/^.*\nFinal program:\n//gs;  | 
| 
537
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # perl 5.6/5.8 format  | 
| 
538
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
301
 | 
   $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;  | 
| 
539
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
215
 | 
   $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;  | 
| 
540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # common to all  | 
| 
541
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
130
 | 
   $fullstr =~ s/\nOffsets:.*$//gs;  | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
543
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # clean up every other line that doesn't start with a space  | 
| 
544
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1057
 | 
   $fullstr =~ s/^\S.*$//gm;  | 
| 
545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
546
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
936
 | 
   if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {  | 
| 
547
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";  | 
| 
548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
549
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1121
 | 
   my $opsstr = $1;  | 
| 
550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # what's left looks like this:  | 
| 
552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #    1: EXACTF <v>(3)  | 
| 
553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #    3: ANYOF[1ILil](14)  | 
| 
554
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   14: EXACTF <a>(16)  | 
| 
555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   16: CURLY {2,7}(29)  | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   18:   ANYOF[A-Za-z](0)  | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   29: SPACE(30)  | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   30: EXACTF <http://>(33)  | 
| 
559
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   33: END(0)  | 
| 
560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  | 
| 
561
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
133
 | 
   DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";  | 
| 
562
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
563
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
302
 | 
   my @ops;  | 
| 
564
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1470
 | 
   foreach my $op (split(/\n/s, $opsstr)) {  | 
| 
565
 | 
439
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1306
 | 
     next unless $op;  | 
| 
566
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
567
 | 
439
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
3767
 | 
     if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
568
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # perl 5.8:              <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)  | 
| 
569
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)  | 
| 
570
 | 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4271
 | 
       push @ops, [ $1, $2, $3 ];  | 
| 
571
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
572
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ($op =~ /^      (\s*)<(.*)>\.\.\.\s*$/) {  | 
| 
573
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       #    5:   TRIE-EXACT[im](44)  | 
| 
574
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       #         <message contained attachments that have been blocked by guin>...  | 
| 
575
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
35
 | 
       my $spcs = $1;  | 
| 
576
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # we could use the entire length here, but it's easier to trim to  | 
| 
577
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test  | 
| 
578
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # suite results will match, since the sa-update --list extraction will  | 
| 
579
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # be the same for all versions.  (The "..." trailer is important btw)  | 
| 
580
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
       my $str = substr ($2, 0, 55);  | 
| 
581
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
67
 | 
       push @ops, [ $spcs, '_moretrie', "<$str...>" ];  | 
| 
582
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
583
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ($op =~ /^      (\s*)(<.*>)\s*(?:\(\d+\))?$/) {  | 
| 
584
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       #    5:   TRIE-EXACT[am](21)  | 
| 
585
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       #         <am> (21)  | 
| 
586
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       #         <might> (12)  | 
| 
587
 | 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
859
 | 
       push @ops, [ $1, '_moretrie', $2 ];  | 
| 
588
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
589
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ($op =~ /^ at .+ line \d+$/) {  | 
| 
590
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':   | 
| 
591
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
592
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
593
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       warn "cannot parse '$op': $opsstr";  | 
| 
594
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       next;  | 
| 
595
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
596
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
597
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
598
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # unroll the branches; returns a list of versions.  | 
| 
599
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]  | 
| 
600
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
196
 | 
   my @unrolled;  | 
| 
601
 | 
41
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
269
 | 
   if ($main->{bases_split_out_alternations}) {  | 
| 
602
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
668
 | 
     @unrolled = $self->unroll_branches(0, \@ops);  | 
| 
603
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
604
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
58
 | 
     @unrolled = ( \@ops );  | 
| 
605
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
606
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
607
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # now find the longest DFA-friendly string in each unrolled version  | 
| 
608
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
178
 | 
   my @longests;  | 
| 
609
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
136
 | 
   foreach my $opsarray (@unrolled) {  | 
| 
610
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
353
 | 
     my $longestexact = '';  | 
| 
611
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
328
 | 
     my $buf = '';  | 
| 
612
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
613
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # use a closure to keep the code succinct  | 
| 
614
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $add_candidate = sub {  | 
| 
615
 | 
430
 | 
  
100
  
 | 
 
 | 
  
430
  
 | 
 
 | 
1017
 | 
       if (length $buf > length $longestexact) { $longestexact = $buf; }  | 
| 
 
 | 
159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
335
 | 
    | 
| 
616
 | 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
746
 | 
       $buf = '';  | 
| 
617
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1408
 | 
     };  | 
| 
618
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
619
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
274
 | 
     my $prevop;  | 
| 
620
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
234
 | 
     foreach my $op (@{$opsarray}) {  | 
| 
 
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
357
 | 
    | 
| 
621
 | 
635
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1152
 | 
       my ($spcs, $item, $args) = @{$op};  | 
| 
 
 | 
635
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1962
 | 
    | 
| 
622
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
623
 | 
635
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1834
 | 
       next if ($item eq 'NOTHING');  | 
| 
624
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
625
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # EXACT == case-sensitive  | 
| 
626
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # EXACTF == case-i  | 
| 
627
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # we can do both, since we canonicalize to lc.  | 
| 
628
 | 
626
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
7674
 | 
       if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)  | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
 50
  
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
629
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       {  | 
| 
630
 | 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
653
 | 
         my $str = $1;  | 
| 
631
 | 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
335
 | 
         $buf .= $str;  | 
| 
632
 | 
142
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
509
 | 
         if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {  | 
| 
633
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # a high Unicode codepoint, interpreted by perl 5.8.x.  cut and stop  | 
| 
634
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
97
 | 
           $add_candidate->();  | 
| 
635
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
636
 | 
142
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
514
 | 
         if (length $str >= 55 && $buf =~ s/\.\.\.$//) {  | 
| 
637
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # perl 5.8.x truncates with a "..." here!  cut and stop  | 
| 
638
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
           $add_candidate->();  | 
| 
639
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
640
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
641
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # _moretrie == a TRIE-EXACT entry  | 
| 
642
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)  | 
| 
643
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       {  | 
| 
644
 | 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
484
 | 
         $buf .= $1;  | 
| 
645
 | 
190
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
531
 | 
         if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {  | 
| 
646
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # perl 5.8.x truncates with a "..." here!  cut and stop  | 
| 
647
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
           $add_candidate->();  | 
| 
648
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
649
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
650
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand  | 
| 
651
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       elsif ($item =~ /^EXACT/ &&  | 
| 
652
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&  | 
| 
653
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&  | 
| 
654
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $args =~ /<(.*)>/)  | 
| 
655
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       {  | 
| 
656
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $buf .= $1;  | 
| 
657
 | 
  
0
  
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
0
 | 
         if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {  | 
| 
658
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # perl 5.8.x truncates with a "..." here!  cut and stop  | 
| 
659
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
           $add_candidate->();  | 
| 
660
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
661
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
662
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # CURLYX, for perl >= 5.9.5  | 
| 
663
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       elsif ($item =~ /^_moretrie/ &&  | 
| 
664
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&  | 
| 
665
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&  | 
| 
666
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           $args =~ /<(.*)>/)  | 
| 
667
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       {  | 
| 
668
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
26
 | 
         $buf .= $1;  | 
| 
669
 | 
5
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
34
 | 
         if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {  | 
| 
670
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
           # perl 5.8.x truncates with a "..." here!  cut and stop  | 
| 
671
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
           $add_candidate->();  | 
| 
672
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
673
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
674
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       else {  | 
| 
675
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # not an /^EXACT/; clear the buffer  | 
| 
676
 | 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
821
 | 
         $add_candidate->();  | 
| 
677
 | 
289
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1458
 | 
         if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)  | 
| 
678
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
679
 | 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
312
 | 
           $lossy = 1;  | 
| 
680
 | 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
288
 | 
           DEBUG_RE_PARSING and warn "item $item makes regexp lossy";  | 
| 
681
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
682
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
683
 | 
626
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1318
 | 
       $prevop = $op;  | 
| 
684
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
685
 | 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
479
 | 
     $add_candidate->();  | 
| 
686
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
687
 | 
136
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
299
 | 
     if (!$longestexact) {  | 
| 
688
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
102
 | 
       die "no long-enough string found in $rawrule";  | 
| 
689
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # all unrolled versions must have a long string, otherwise  | 
| 
690
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # we cannot reliably match all variants of the rule  | 
| 
691
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
692
 | 
135
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1265
 | 
       push @longests, ($main->{bases_must_be_casei}) ?  | 
| 
693
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             lc $longestexact : $longestexact;  | 
| 
694
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
695
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
696
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
697
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
108
 | 
   DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";  | 
| 
698
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2537
 | 
   return ($lossy, @longests);  | 
| 
699
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
700
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
701
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
702
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
703
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub unroll_branches {  | 
| 
704
 | 
187
 | 
 
 | 
 
 | 
  
187
  
 | 
  
0
  
 | 
605
 | 
   my ($self, $depth, $opslist) = @_;  | 
| 
705
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
706
 | 
187
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
530
 | 
   die "too deep" if ($depth++ > 5);  | 
| 
707
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
708
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
302
 | 
   my @ops = (@{$opslist});      # copy  | 
| 
 
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
558
 | 
    | 
| 
709
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
744
 | 
   my @pre_branch_ops;  | 
| 
710
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my $branch_spcs;  | 
| 
711
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my $trie_spcs;  | 
| 
712
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my $open_spcs;  | 
| 
713
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
714
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # our input looks something like this 2-level structure:  | 
| 
715
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  1: BOUND(2)  | 
| 
716
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  2: EXACT <Dear >(5)  | 
| 
717
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  5: BRANCH(9)  | 
| 
718
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  6:   EXACT <IT>(8)  | 
| 
719
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  8:   NALNUM(24)  | 
| 
720
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  9: BRANCH(23)  | 
| 
721
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 10:   EXACT <Int>(12)  | 
| 
722
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 12:   BRANCH(14)  | 
| 
723
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 13:     NOTHING(21)  | 
| 
724
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 14:   BRANCH(17)  | 
| 
725
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 15:     EXACT <a>(21)  | 
| 
726
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 17:   BRANCH(20)  | 
| 
727
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 18:     EXACT <er>(21)  | 
| 
728
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 20:   TAIL(21)  | 
| 
729
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 21:   EXACT <net>(24)  | 
| 
730
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 23: TAIL(24)  | 
| 
731
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 24: EXACT < shop>(27)  | 
| 
732
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 27: END(0)  | 
| 
733
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
734
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or:  | 
| 
735
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
736
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  1: OPEN1(3)  | 
| 
737
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  3:   BRANCH(6)  | 
| 
738
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  4:     EXACT <v>(9)  | 
| 
739
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  6:   BRANCH(9)  | 
| 
740
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  7:     EXACT <\\/>(9)  | 
| 
741
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  9: CLOSE1(11)  | 
| 
742
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 11: CURLY {2,5}(14)  | 
| 
743
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 13:   REG_ANY(0)  | 
| 
744
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 14: EXACT < g r a >(17)  | 
| 
745
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 17: ANYOF[a-z](28)  | 
| 
746
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 28: END(0)  | 
| 
747
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
748
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or:  | 
| 
749
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
750
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  1: EXACT <i >(3)  | 
| 
751
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  3: OPEN1(5)  | 
| 
752
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  5:   TRIE-EXACT[am](21)  | 
| 
753
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #       <am> (21)  | 
| 
754
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #       <might> (12)  | 
| 
755
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 12:     OPEN2(14)  | 
| 
756
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 14:       TRIE-EXACT[ ](19)  | 
| 
757
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #           < be>  | 
| 
758
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #           <>  | 
| 
759
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 19:     CLOSE2(21)  | 
| 
760
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 21: CLOSE1(23)  | 
| 
761
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 23: EXACT < c>(25)  | 
| 
762
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
763
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
323
 | 
   DEBUG_RE_PARSING and warn "starting parse";  | 
| 
764
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
765
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform  | 
| 
766
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # it into the latter.  bit of a kludge to do this before the loop, but hey.  | 
| 
767
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # note that it doesn't fix the CLOSE1/END ordering to be correct  | 
| 
768
 | 
187
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
1317
 | 
   if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {  | 
| 
769
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my @newops = ([ "", "OPEN1", "" ]);  | 
| 
770
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     foreach my $op (@ops) {  | 
| 
771
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       push @newops, [ "  ".$op->[0], $op->[1], $op->[2] ];  | 
| 
772
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
773
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     push @newops, [ "", "CLOSE1", "" ];  | 
| 
774
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     @ops = @newops;  | 
| 
775
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
776
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
777
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # iterate until we start a branch set. using  | 
| 
778
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."  | 
| 
779
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # just hitting an OPEN is not enough; wait until we see a TRIE-EXACT  | 
| 
780
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # or a BRANCH, *then* unroll the most recent OPEN set.  | 
| 
781
 | 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
390
 | 
   while (1) {  | 
| 
782
 | 
968
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1652
 | 
     my $op = shift @ops;  | 
| 
783
 | 
968
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2061
 | 
     last unless defined $op;  | 
| 
784
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
785
 | 
838
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1227
 | 
     my ($spcs, $item, $args) = @{$op};  | 
| 
 
 | 
838
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2801
 | 
    | 
| 
786
 | 
838
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1360
 | 
     DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";  | 
| 
787
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
788
 | 
838
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
4602
 | 
     if ($item =~ /^OPEN/) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
789
 | 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
158
 | 
       $open_spcs = $spcs;  | 
| 
790
 | 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
127
 | 
       next;         # next will be a BRANCH or TRIE  | 
| 
791
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
792
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ($item =~ /^TRIE/) {  | 
| 
793
 | 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
121
 | 
       $trie_spcs = $spcs;  | 
| 
794
 | 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
120
 | 
       last;  | 
| 
795
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
796
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ($item =~ /^BRANCH/) {  | 
| 
797
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
90
 | 
       $branch_spcs = $spcs;  | 
| 
798
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
       last;  | 
| 
799
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
800
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ($item =~ /^EXACT/ && defined $open_spcs) {  | 
| 
801
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT  | 
| 
802
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
75
 | 
       push @pre_branch_ops, [ $open_spcs, $item, $args ];  | 
| 
803
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
       next;  | 
| 
804
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
805
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (defined $open_spcs) {  | 
| 
806
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:  | 
| 
807
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # ignore this OPEN block entirely and don't try to unroll it  | 
| 
808
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
       undef $open_spcs;  | 
| 
809
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
810
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
811
 | 
718
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1515
 | 
       push @pre_branch_ops, $op;  | 
| 
812
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
813
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
814
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
815
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # no branches found?  we're done unrolling on this one!  | 
| 
816
 | 
187
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
537
 | 
   if (scalar @ops == 0) {  | 
| 
817
 | 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
733
 | 
     return [ @pre_branch_ops ];  | 
| 
818
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
819
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
820
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # otherwise we're at the start of a new branch set  | 
| 
821
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # /(foo|bar(baz|argh)boo)gab/  | 
| 
822
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
151
 | 
   my @alts;  | 
| 
823
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @in_this_branch;  | 
| 
824
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
825
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
141
 | 
   DEBUG_RE_PARSING and warn "entering branch: ".  | 
| 
826
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         "open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".  | 
| 
827
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         "branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".  | 
| 
828
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         "trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";  | 
| 
829
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
830
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # indentation level to remove from "normal" ops (using a s///)  | 
| 
831
 | 
57
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
319
 | 
   my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")."  ";  | 
| 
832
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
181
 | 
   my $trie_sub_spcs = "";  | 
| 
833
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
129
 | 
   while (1) {  | 
| 
834
 | 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
683
 | 
     my $op = shift @ops;  | 
| 
835
 | 
387
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
846
 | 
     last unless defined $op;  | 
| 
836
 | 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
641
 | 
     my ($spcs, $item, $args) = @{$op};  | 
| 
 
 | 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1155
 | 
    | 
| 
837
 | 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
648
 | 
     DEBUG_RE_PARSING and warn "in:  [$spcs] $item $args";  | 
| 
838
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
839
 | 
387
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
4072
 | 
     if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) {  # alt  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
840
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
       push @alts, [ @pre_branch_ops, @in_this_branch ];  | 
| 
841
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
       @in_this_branch = ();  | 
| 
842
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
       $open_sub_spcs = $branch_spcs."  ";  | 
| 
843
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
       $trie_sub_spcs = "";  | 
| 
844
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
35
 | 
       next;  | 
| 
845
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
846
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end  | 
| 
847
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       push @alts, [ @pre_branch_ops, @in_this_branch ];  | 
| 
848
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       undef $branch_spcs;  | 
| 
849
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $open_sub_spcs = "";  | 
| 
850
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $trie_sub_spcs = "";  | 
| 
851
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       last;  | 
| 
852
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
853
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {  | 
| 
854
 | 
135
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
387
 | 
       if (scalar @in_this_branch > 0) {  | 
| 
855
 | 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
355
 | 
         push @alts, [ @pre_branch_ops, @in_this_branch ];  | 
| 
856
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
857
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)  | 
| 
858
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
679
 | 
       @in_this_branch = ( [ $open_spcs, $item, $args ] );  | 
| 
859
 | 
135
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
331
 | 
       $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")."  ";  | 
| 
860
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
311
 | 
       $trie_sub_spcs = "  ";  | 
| 
861
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
261
 | 
       next;  | 
| 
862
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
863
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) {   # end  | 
| 
864
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
215
 | 
       push @alts, [ @pre_branch_ops, @in_this_branch ];  | 
| 
865
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
165
 | 
       undef $branch_spcs;  | 
| 
866
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
121
 | 
       undef $open_spcs;  | 
| 
867
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
97
 | 
       undef $trie_spcs;  | 
| 
868
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
90
 | 
       $open_sub_spcs = "";  | 
| 
869
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
111
 | 
       $trie_sub_spcs = "";  | 
| 
870
 | 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
116
 | 
       last;  | 
| 
871
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
872
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ($item eq 'END') {  # of string  | 
| 
873
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
       push @alts, [ @pre_branch_ops, @in_this_branch ];  | 
| 
874
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
       undef $branch_spcs;  | 
| 
875
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
       undef $open_spcs;  | 
| 
876
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
       undef $trie_spcs;  | 
| 
877
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19
 | 
       $open_sub_spcs = "";  | 
| 
878
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
       $trie_sub_spcs = "";  | 
| 
879
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
       last;  | 
| 
880
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
881
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
882
 | 
186
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
467
 | 
       if ($open_sub_spcs) {  | 
| 
883
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # deindent the space-level to match the opening brace  | 
| 
884
 | 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1151
 | 
         $spcs =~ s/^$open_sub_spcs//;  | 
| 
885
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # tries also add one more indent level in  | 
| 
886
 | 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
906
 | 
         $spcs =~ s/^$trie_sub_spcs//;  | 
| 
887
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
888
 | 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
796
 | 
       push @in_this_branch, [ $spcs, $item, $args ];  | 
| 
889
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # note that we ignore ops at a deeper $spcs level entirely (until later!)  | 
| 
890
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
891
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
892
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
893
 | 
57
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
203
 | 
   if (defined $branch_spcs) {  | 
| 
894
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     die "fell off end of string with a branch open: '$branch_spcs'";  | 
| 
895
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
896
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
897
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # we're now after the branch set: /gab/  | 
| 
898
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]  | 
| 
899
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
257
 | 
   foreach my $alt (@alts) {  | 
| 
900
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
279
 | 
     push @{$alt}, @ops;     # add all remaining ops to each one  | 
| 
 
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
435
 | 
    | 
| 
901
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # note that this could include more (?:...); we don't care, since  | 
| 
902
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # those can be handled by recursing  | 
| 
903
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
904
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
905
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # ok, parsed the entire ops list  | 
| 
906
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]  | 
| 
907
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
908
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
114
 | 
   if (DEBUG_RE_PARSING) {  | 
| 
909
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }  | 
| 
910
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
911
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
912
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # now recurse, to unroll the remaining branches (if any exist)  | 
| 
913
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
134
 | 
   my @rets;  | 
| 
914
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
155
 | 
   foreach my $alt (@alts) {  | 
| 
915
 | 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
745
 | 
     push @rets, $self->unroll_branches($depth, $alt);  | 
| 
916
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
917
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
918
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
125
 | 
   if (DEBUG_RE_PARSING) {  | 
| 
919
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }  | 
| 
920
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
921
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
922
 | 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
413
 | 
   return @rets;  | 
| 
923
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
924
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
925
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
926
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
927
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub test {  | 
| 
928
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
0
 | 
   my ($self) = @_;  | 
| 
929
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
930
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("foo", "/foo/");  | 
| 
931
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("(foo)", "/foo/");  | 
| 
932
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("foo(bar)baz", "/foobarbaz/");  | 
| 
933
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("x(foo|)", "/xfoo/ /x/");  | 
| 
934
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("fo(o|)", "/foo/ /fo/");  | 
| 
935
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("(foo|bar)", "/foo/ /bar/");  | 
| 
936
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("foo|bar", "/foo/ /bar/");  | 
| 
937
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");  | 
| 
938
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");  | 
| 
939
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");  | 
| 
940
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");  | 
| 
941
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
942
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
943
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub test_split_alt {  | 
| 
944
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
0
 | 
   my ($self, $in, $out) = @_;  | 
| 
945
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
946
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my @got = $self->split_alt($in);  | 
| 
947
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $out =~ s/^\///;  | 
| 
948
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $out =~ s/\/$//;  | 
| 
949
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my @want = split(/\/ \//, $out);  | 
| 
950
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
951
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my $failed = 0;  | 
| 
952
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   if (scalar @want != scalar @got) {  | 
| 
953
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     warn "FAIL: results count don't match";  | 
| 
954
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $failed++;  | 
| 
955
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
956
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   else {  | 
| 
957
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my %got = map { $_ => 1 } @got;  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
958
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     foreach my $w (@want) {  | 
| 
959
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       if (!$got{$w}) {  | 
| 
960
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         warn "FAIL: '$w' not found";  | 
| 
961
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $failed++;  | 
| 
962
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
963
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
964
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
965
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
966
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   if ($failed) {  | 
| 
967
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print "want: /".join('/ /', @want)."/\n"  or die "error writing: $!";  | 
| 
968
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print "got:  /".join('/ /', @got)."/\n"   or die "error writing: $!";  | 
| 
969
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return 0;  | 
| 
970
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
971
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     print "ok\n"  or die "error writing: $!";  | 
| 
972
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return 1;  | 
| 
973
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
974
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
975
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
976
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
977
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
978
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_perl {  | 
| 
979
 | 
41
 | 
 
 | 
 
 | 
  
41
  
 | 
  
0
  
 | 
132
 | 
   my ($self) = @_;  | 
| 
980
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85
 | 
   my $perl;  | 
| 
981
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
982
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # allow user override of the perl interpreter to use when  | 
| 
983
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # extracting base strings.  | 
| 
984
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # TODO: expose this via sa-compile command-line option  | 
| 
985
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
173
 | 
   my $fromconf = $self->{main}->{conf}->{re_parser_perl};  | 
| 
986
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
987
 | 
41
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
493
 | 
   if ($fromconf) {  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
988
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $perl = $fromconf;  | 
| 
989
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif ($^X =~ m|^/|) {  | 
| 
990
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
116
 | 
     $perl = $^X;  | 
| 
991
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
992
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
12
 | 
     use Config;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1126
 | 
    | 
| 
993
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $perl = $Config{perlpath};  | 
| 
994
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $perl =~ s|/[^/]*$|/$^X|;  | 
| 
995
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
996
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
266
 | 
   untaint_var(\$perl);  | 
| 
997
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
116
 | 
   return $perl;  | 
| 
998
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
999
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 ###########################################################################  | 
| 
1001
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1002
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub read_cachefile {  | 
| 
1003
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
0
 | 
   my ($self, $cachefile) = @_;  | 
| 
1004
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   local *IN;  | 
| 
1005
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   if (open(IN, "<".$cachefile)) {  | 
| 
1006
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my($inbuf,$nread,$str); $str = '';  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1007
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1008
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     defined $nread  or die "error reading from $cachefile: $!";  | 
| 
1009
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     close IN  or die "error closing $cachefile: $!";  | 
| 
1010
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1011
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     untaint_var(\$str);  | 
| 
1012
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $VAR1;              # Data::Dumper  | 
| 
1013
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     if (eval $str) {  | 
| 
1014
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       return $VAR1;        # Data::Dumper's naming  | 
| 
1015
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1016
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
1017
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   return { };  | 
| 
1018
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
1019
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1020
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub write_cachefile {  | 
| 
1021
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
0
 | 
   my ($self, $cachefile, $cached) = @_;  | 
| 
1022
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1023
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   my $dump = Data::Dumper->new ([ $cached ]);  | 
| 
1024
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $dump->Deepcopy(1);  | 
| 
1025
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $dump->Purity(1);  | 
| 
1026
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   $dump->Indent(1);  | 
| 
1027
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   if (mkdir($self->{main}->{bases_cache_dir})) {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1028
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # successfully created  | 
| 
1029
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif ($! == EEXIST) {  | 
| 
1030
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     dbg("zoom: ok, cache directory already existed");  | 
| 
1031
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
1032
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     warn "cannot create a directory: $!";  | 
| 
1033
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
1034
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   open(CACHE, ">$cachefile")  or warn "cannot write to $cachefile";  | 
| 
1035
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   print CACHE ($dump->Dump, ";1;")  or die "error writing: $!";  | 
| 
1036
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
   close CACHE  or die "error closing $cachefile: $!";  | 
| 
1037
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
1038
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1039
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1040
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1041
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item my ($cleanregexp) = fixup_re($regexp);  | 
| 
1042
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1043
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Converts encoded characters in a regular expression pattern into their  | 
| 
1044
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 equivalent characters  | 
| 
1045
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1046
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1047
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1048
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
1049
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1050
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub fixup_re {  | 
| 
1051
 | 
135
 | 
 
 | 
 
 | 
  
135
  
 | 
  
1
  
 | 
372
 | 
   my $re = shift;  | 
| 
1052
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
1053
 | 
135
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
298
 | 
   if ($fixup_re_test) { print "INPUT: /$re/\n"  or die "error writing: $!" }  | 
| 
 
 | 
  
0
  
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1054
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
1055
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
562
 | 
   my $output = "";  | 
| 
1056
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1373
 | 
   my $TOK = qr([\"\\]);  | 
| 
1057
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1058
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
396
 | 
   my $STATE;  | 
| 
1059
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1029
 | 
   local ($1,$2);  | 
| 
1060
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1398
 | 
   while ($re =~ /\G(.*?)($TOK)/gcs) {  | 
| 
1061
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $pre = $1;  | 
| 
1062
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my $tok = $2;  | 
| 
1063
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1064
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     if (length($pre)) {  | 
| 
1065
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $output .= "\"$pre\"";  | 
| 
1066
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1067
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1068
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     if ($tok eq '"') {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1069
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $output .= '"\\""';  | 
| 
1070
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1071
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     elsif ($tok eq '\\') {  | 
| 
1072
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       $re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!";  | 
| 
1073
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       my $esc = $1;  | 
| 
1074
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       if ($esc eq '"') {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1075
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $output .= '"\\""';  | 
| 
1076
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } elsif ($esc eq '\\') {  | 
| 
1077
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $output .= '"**BACKSLASH**"';   # avoid hairy escape-parsing  | 
| 
1078
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } elsif ($esc =~ /^x\{(\S+)\}\z/) {  | 
| 
1079
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $output .= '"'.chr(hex($1)).'"';  | 
| 
1080
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } elsif ($esc =~ /^[0-7]{1,3}\z/) {  | 
| 
1081
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $output .= '"'.chr(oct($esc)).'"';  | 
| 
1082
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       } else {  | 
| 
1083
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $output .= "\"$esc\"";  | 
| 
1084
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
1085
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1086
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     else {  | 
| 
1087
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
       print "PRE: $pre\nTOK: $tok\n"  or die "error writing: $!";  | 
| 
1088
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1089
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
1090
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     | 
| 
1091
 | 
135
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
398
 | 
   if (!defined(pos($re))) {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1092
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # no matches  | 
| 
1093
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
673
 | 
     $output .= "\"$re\"";  | 
| 
1094
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)  | 
| 
1095
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
529
 | 
     $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1096
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
1097
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   elsif (pos($re) <= length($re)) {  | 
| 
1098
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1099
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     $output .= fixup_re(substr($re, pos($re)));  | 
| 
1100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
1101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1102
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
298
 | 
   $output =~ s/^""/"/;  # protect start and end quotes  | 
| 
1103
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
348
 | 
   $output =~ s/(?<!\\)""\z/"/;  | 
| 
1104
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
266
 | 
   $output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"  | 
| 
1105
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
243
 | 
   $output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;  | 
| 
1106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1107
 | 
135
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
314
 | 
   if ($fixup_re_test) { print "OUTPUT: $output\n"  or die "error writing: $!" }  | 
| 
 
 | 
0
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
1108
 | 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
604
 | 
   return $output;  | 
| 
1109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
1110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |