File Coverage

blib/lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm
Criterion Covered Total %
statement 55 85 64.7
branch 10 32 31.2
condition 1 15 6.6
subroutine 8 8 100.0
pod 2 3 66.6
total 76 143 53.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
25              
26             =head1 DESCRIPTION
27              
28             This plugin implements the threshold-based auto-learning discriminator
29             for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism
30             whereby high-scoring mails (or low-scoring mails, for non-spam) are fed
31             into its learning systems without user intervention, during scanning.
32              
33             Note that certain tests are ignored when determining whether a message
34             should be trained upon:
35              
36             =over 4
37              
38             =item * rules with tflags set to 'learn' (the Bayesian rules)
39              
40             =item * rules with tflags set to 'userconf' (user configuration)
41              
42             =item * rules with tflags set to 'noautolearn'
43              
44             =back
45              
46             Also note that auto-learning occurs using scores from either scoreset 0
47             or 1, depending on what scoreset is used during message check. It is
48             likely that the message check and auto-learn scores will be different.
49              
50             =cut
51              
52              
53             use Mail::SpamAssassin::Plugin;
54 22     22   159 use Mail::SpamAssassin::Logger;
  22         57  
  22         757  
55 22     22   126 use strict;
  22         56  
  22         1400  
56 22     22   172 use warnings;
  22         66  
  22         585  
57 22     22   118 # use bytes;
  22         65  
  22         882  
58             use re 'taint';
59 22     22   132  
  22         54  
  22         16741  
60             our @ISA = qw(Mail::SpamAssassin::Plugin);
61              
62             my $class = shift;
63             my $mailsaobject = shift;
64 63     63 1 195  
65 63         139 $class = ref($class) || $class;
66             my $self = $class->SUPER::new($mailsaobject);
67 63   33     389 bless ($self, $class);
68 63         327  
69 63         168 $self->set_config($mailsaobject->{conf});
70              
71 63         281 return $self;
72             }
73 63         532  
74             my($self, $conf) = @_;
75             my @cmds;
76              
77 63     63 0 182 =head1 USER OPTIONS
78 63         117  
79             The following configuration settings are used to control auto-learning:
80              
81             =over 4
82              
83             =item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1)
84              
85             The score threshold below which a mail has to score, to be fed into
86             SpamAssassin's learning systems automatically as a non-spam message.
87              
88             =cut
89              
90             push (@cmds, {
91             setting => 'bayes_auto_learn_threshold_nonspam',
92             default => 0.1,
93 63         330 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
94             });
95              
96             =item bayes_auto_learn_threshold_spam n.nn (default: 12.0)
97              
98             The score threshold above which a mail has to score, to be fed into
99             SpamAssassin's learning systems automatically as a spam message.
100              
101             Note: SpamAssassin requires at least 3 points from the header, and 3
102             points from the body to auto-learn as spam. Therefore, the minimum
103             working value for this option is 6.
104              
105             If the test option autolearn_force is set, the minimum value will
106             remain at 6 points but there is no requirement that the points come
107             from body and header rules. This option is useful for autolearning
108             with rules that are considered to be extremely safe indicators of
109             the spaminess of a message.
110              
111             =cut
112              
113             push (@cmds, {
114             setting => 'bayes_auto_learn_threshold_spam',
115             default => 12.0,
116 63         274 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
117             });
118              
119             =item bayes_auto_learn_on_error (0 | 1) (default: 0)
120              
121             With C<bayes_auto_learn_on_error> off, autolearning will be performed
122             even if the bayes classifier already agrees with the new classification (i.e.
123             yielded BAYES_00 for what we are now trying to teach it as ham, or yielded
124             BAYES_99 for spam). This is the traditional setting; the default was chosen
125             to retain backward compatibility.
126              
127             With C<bayes_auto_learn_on_error> turned on, autolearning will be performed
128             only when a bayes classifier had a different opinion from what the autolearner
129             is now trying to teach it (i.e. it made an error in judgement). This strategy
130             may or may not produce better future classifications, but usually works
131             very well, while also preventing unnecessary overlearning and slowing down
132             database growth.
133              
134             =cut
135              
136             push (@cmds, {
137             setting => 'bayes_auto_learn_on_error',
138             default => 0,
139 63         243 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
140             });
141              
142             $conf->{parser}->register_commands(\@cmds);
143             }
144              
145 63         286 my ($self, $params) = @_;
146              
147             my $scan = $params->{permsgstatus};
148             my $conf = $scan->{conf};
149 12     12 1 37  
150             # Figure out min/max for autolearning.
151 12         28 # Default to specified auto_learn_threshold settings
152 12         26 my $min = $conf->{bayes_auto_learn_threshold_nonspam};
153             my $max = $conf->{bayes_auto_learn_threshold_spam};
154              
155             # Find out what score we should consider this message to have ...
156 12         43 my $score = $scan->get_autolearn_points();
157 12         26 my $body_only_points = $scan->get_body_only_points();
158             my $head_only_points = $scan->get_head_only_points();
159             my $learned_points = $scan->get_learned_points();
160 12         62  
161 12         51 # find out if any of the tests added an autolearn_force status
162 12         57 my $force_autolearn = $scan->get_autolearn_force_status();
163 12         58 my $force_autolearn_names = $scan->get_autolearn_force_names();
164              
165             dbg("learn: auto-learn? ham=$min, spam=$max, ".
166 12         47 "body-points=".$body_only_points.", ".
167 12         44 "head-points=".$head_only_points.", ".
168             "learned-points=".$learned_points);
169 12         164  
170             my $isspam;
171             if ($score < $min) {
172             $isspam = 0;
173             } elsif ($score >= $max) {
174 12         21 $isspam = 1;
175 12 100       63 } else {
    50          
176 3         11 dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
177             return;
178 0         0 }
179              
180 9         38 my $learner_said_ham_points = -1.0;
181 9         32 my $learner_said_spam_points = 1.0;
182              
183             if ($isspam) {
184 3         7 my $required_body_points = 3;
185 3         6 my $required_head_points = 3;
186              
187 3 50       11 #Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule
188 0         0 if ($force_autolearn) {
189 0         0 $required_body_points = -99;
190             $required_head_points = -99;
191             dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing separate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
192 0 0       0 dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names");
193 0         0 } else {
194 0         0 dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
195 0         0 }
196 0         0  
197             if ($body_only_points < $required_body_points) {
198 0         0 dbg("learn: auto-learn? no: scored as spam but too few body points (".
199             $body_only_points." < ".$required_body_points.")");
200             return;
201 0 0       0 }
202 0         0 if ($head_only_points < $required_head_points) {
203             dbg("learn: auto-learn? no: scored as spam but too few head points (".
204 0         0 $head_only_points." < ".$required_head_points.")");
205             return;
206 0 0       0 }
207 0         0 if ($learned_points < $learner_said_ham_points) {
208             dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
209 0         0 $learned_points." < ".$learner_said_ham_points.")");
210             return;
211 0 0       0 }
212 0         0  
213             if (!$scan->is_spam()) {
214 0         0 dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
215             return;
216             }
217 0 0       0  
218 0         0 } else {
219 0         0 if ($learned_points > $learner_said_spam_points) {
220             dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
221             $learned_points." > ".$learner_said_spam_points.")");
222             return;
223 3 50       12 }
224 0         0  
225             if ($scan->is_spam()) {
226 0         0 dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
227             return;
228             }
229 3 100       10 }
230 1         5  
231 1         4 if ($conf->{bayes_auto_learn_on_error}) {
232             # learn-on-error strategy chosen:
233             # only allow learning if the autolearning classifier was unsure or
234             # had a different opinion from what we are trying to make it learn
235 2 50       8 #
236             my $tests = $scan->get_tag('TESTS');
237             if (defined $tests && $tests ne 'none') {
238             my %t = map { ($_,1) } split(/,/, $tests);
239             if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) {
240 0         0 dbg("learn: auto-learn? no: learn-on-error, %s, already classified ".
241 0 0 0     0 "as such", $isspam ? 'spam' : 'ham');
242 0         0 return;
  0         0  
243 0 0 0     0 }
      0        
      0        
244 0 0       0 }
245             }
246 0         0  
247             dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no"));
248            
249             #Return an array reference because call_plugins only carries one return value
250             return [$isspam, $force_autolearn, $force_autolearn_names];
251 2 50       20 }
    50          
252              
253             1;
254 2         9  
255             =back
256              
257             =cut