File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Criterion Covered Total %
statement 551 750 73.4
branch 157 340 46.1
condition 45 119 37.8
subroutine 58 71 81.6
pod 11 21 52.3
total 822 1301 63.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21              
22             =head1 DESCRIPTION
23              
24             This is a Bayesian-style probabilistic classifier, using an algorithm based on
25             the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26              
27             http://www.paulgraham.com/spam.html
28              
29             It also incorporates some other aspects taken from Graham Robinson's webpage
30             on the subject at:
31              
32             http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33              
34             And the chi-square probability combiner as described here:
35              
36             http://www.linuxjournal.com/print.php?sid=6467
37              
38             The results are incorporated into SpamAssassin as the BAYES_* rules.
39              
40             =head1 METHODS
41              
42             =cut
43              
44             package Mail::SpamAssassin::Plugin::Bayes;
45              
46 22     22   147 use strict;
  22         45  
  22         715  
47 22     22   131 use warnings;
  22         46  
  22         710  
48             # use bytes;
49 22     22   125 use re 'taint';
  22         50  
  22         1732  
50              
51             BEGIN {
52 22         120 eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
  22         914  
  22         587  
53 22 50   22   100 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  0         0  
  0         0  
54             }
55              
56 22     22   152 use Mail::SpamAssassin;
  22         48  
  22         665  
57 22     22   130 use Mail::SpamAssassin::Plugin;
  22         47  
  22         577  
58 22     22   134 use Mail::SpamAssassin::PerMsgStatus;
  22         49  
  22         678  
59 22     22   132 use Mail::SpamAssassin::Logger;
  22         65  
  22         1411  
60 22     22   143 use Mail::SpamAssassin::Util qw(untaint_var);
  22         47  
  22         1016  
61              
62             # pick ONLY ONE of these combining implementations.
63 22     22   7377 use Mail::SpamAssassin::Bayes::CombineChi;
  22         56  
  22         13343  
64             # use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
65              
66             our @ISA = qw(Mail::SpamAssassin::Plugin);
67              
68             # Which headers should we scan for tokens? Don't use all of them, as it's easy
69             # to pick up spurious clues from some. What we now do is use all of them
70             # *less* these well-known headers; that way we can pick up spammers' tracking
71             # headers (which are obviously not well-known in advance!).
72              
73             # Received is handled specially
74             our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
75             |Delivered-To |Delivery-Date
76             |(?:X-)?Envelope-To
77             |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
78              
79             |Subject # not worth a tiny gain vs. to db size increase
80              
81             # Date: can provide invalid cues if your spam corpus is
82             # older/newer than ham
83             |Date
84              
85             # List headers: ignore. a spamfiltering mailing list will
86             # become a nonspam sign.
87             |X-List|(?:X-)?Mailing-List
88             |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
89             |Unsubscribe|Host|Id|Manager|Admin|Comment
90             |Name|Url)
91             |X-Unsub(?:scribe)?
92             |X-Mailman-Version |X-Been[Tt]here |X-Loop
93             |Mail-Followup-To
94             |X-eGroups-(?:Return|From)
95             |X-MDMailing-List
96             |X-XEmacs-List
97              
98             # gatewayed through mailing list (thanks to Allen Smith)
99             |(?:X-)?Resent-(?:From|To|Date)
100             |(?:X-)?Original-(?:From|To|Date)
101              
102             # Spamfilter/virus-scanner headers: too easy to chain from
103             # these
104             |X-MailScanner(?:-SpamCheck)?
105             |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
106             |X-Antispam |X-RBL-Warning |X-Mailscanner
107             |X-MDaemon-Deliver-To |X-Virus-Scanned
108             |X-Mass-Check-Id
109             |X-Pyzor |X-DCC-\S{2,25}-Metrics
110             |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
111             |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
112             |X-SpamCop-[^:]+
113             |X-SMTPD |(?:X-)?Spam-Apparently-To
114             |SPAM |X-Perlmx-Spam
115             |X-Bogosity
116              
117             # some noisy Outlook headers that add no good clues:
118             |Content-Class |Thread-(?:Index|Topic)
119             |X-Original[Aa]rrival[Tt]ime
120              
121             # Annotations from IMAP, POP, and MH:
122             |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded
123             |Lines |Content-Length
124             |X-UIDL? |X-IMAPbase
125              
126             # Annotations from Bugzilla
127             |X-Bugzilla-[^:]+
128              
129             # Annotations from VM: (thanks to Allen Smith)
130             |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
131             |Summary-Format|VHeader|v\d-Data|Message-Order)
132              
133             # Annotations from Gnus:
134             | X-Gnus-Mail-Source
135             | Xref
136              
137             )}x;
138              
139             # Note only the presence of these headers, in order to reduce the
140             # hapaxen they generate.
141             our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
142             |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
143             |D(?:KIM|omainKey)-Signature
144             )}ix;
145              
146             # tweaks tested as of Nov 18 2002 by jm posted to -devel at
147             # http://sourceforge.net/p/spamassassin/mailman/message/12977556/
148             # for results. The winners are now the default settings.
149 22     22   186 use constant IGNORE_TITLE_CASE => 1;
  22         50  
  22         1159  
150 22     22   122 use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
  22         43  
  22         898  
151 22     22   111 use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
  22         45  
  22         853  
152 22     22   111 use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         38  
  22         846  
153              
154             # tweaks by jm on May 12 2003, see -devel email at
155             # http://sourceforge.net/p/spamassassin/mailman/message/14844556/
156 22     22   115 use constant PRE_CHEW_ADDR_HEADERS => 1;
  22         38  
  22         817  
157 22     22   118 use constant CHEW_BODY_URIS => 1;
  22         37  
  22         846  
158 22     22   112 use constant CHEW_BODY_MAILADDRS => 1;
  22         38  
  22         796  
159 22     22   107 use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         49  
  22         776  
160 22     22   106 use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         37  
  22         788  
161 22     22   119 use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
  22         89  
  22         777  
162 22     22   110 use constant IGNORE_MSGID_TOKENS => 0;
  22         39  
  22         777  
163              
164             # tweaks of 12 March 2004, see bug 2129.
165 22     22   117 use constant DECOMPOSE_BODY_TOKENS => 1;
  22         47  
  22         823  
166 22     22   110 use constant MAP_HEADERS_MID => 1;
  22         36  
  22         860  
167 22     22   113 use constant MAP_HEADERS_FROMTOCC => 1;
  22         39  
  22         834  
168 22     22   118 use constant MAP_HEADERS_USERAGENT => 1;
  22         39  
  22         877  
169              
170             # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
171 22     22   114 use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
  22         41  
  22         913  
172 22     22   133 use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
  22         41  
  22         2514  
173              
174             # We store header-mined tokens in the db with a "HHeaderName:val" format.
175             # some headers may contain lots of gibberish tokens, so allow a little basic
176             # compression by mapping the header name at least here. these are the headers
177             # which appear with the most frequency in my db. note: this doesn't have to
178             # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
179             # tokens from multiple different headers may impact accuracy, so might as well
180             # avoid this if possible. These are the top ones from my corpus, BTW (jm).
181             our %HEADER_NAME_COMPRESSION = (
182             'Message-Id' => '*m',
183             'Message-ID' => '*M',
184             'Received' => '*r',
185             'User-Agent' => '*u',
186             'References' => '*f',
187             'In-Reply-To' => '*i',
188             'From' => '*F',
189             'Reply-To' => '*R',
190             'Return-Path' => '*p',
191             'Return-path' => '*rp',
192             'X-Mailer' => '*x',
193             'X-Authentication-Warning' => '*a',
194             'Organization' => '*o',
195             'Organisation' => '*o',
196             'Content-Type' => '*ct',
197             'Content-Disposition' => '*cd',
198             'Content-Transfer-Encoding' => '*ce',
199             'x-spam-relays-trusted' => '*RT',
200             'x-spam-relays-untrusted' => '*RU',
201             );
202              
203             # How many seconds should the opportunistic_expire lock be valid?
204             our $OPPORTUNISTIC_LOCK_VALID = 300;
205              
206             # Should we use the Robinson f(w) equation from
207             # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
208             # It gives better results, in that scores are more likely to distribute
209             # into the <0.5 range for nonspam and >0.5 for spam.
210 22     22   168 use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
  22         54  
  22         1146  
211              
212             # How many of the most significant tokens should we use for the p(w)
213             # calculation?
214 22     22   127 use constant N_SIGNIFICANT_TOKENS => 150;
  22         51  
  22         1021  
215              
216             # How many significant tokens are required for a classifier score to
217             # be considered usable?
218 22     22   127 use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
  22         53  
  22         1190  
219              
220             # How long a token should we hold onto? (note: German speakers typically
221             # will require a longer token than English ones.)
222 22     22   131 use constant MAX_TOKEN_LENGTH => 15;
  22         57  
  22         166045  
223              
224             ###########################################################################
225              
226             sub new {
227 63     63 1 229 my $class = shift;
228 63         225 my ($main) = @_;
229              
230 63   33     456 $class = ref($class) || $class;
231 63         419 my $self = $class->SUPER::new($main);
232 63         193 bless ($self, $class);
233              
234 63         234 $self->{main} = $main;
235 63         218 $self->{conf} = $main->{conf};
236 63         206 $self->{use_ignores} = 1;
237              
238 63         335 $self->register_eval_rule("check_bayes");
239 63         528 $self;
240             }
241              
242             sub finish {
243 40     40 1 99 my $self = shift;
244 40 100       206 if ($self->{store}) {
245 39         284 $self->{store}->untie_db();
246             }
247 40         118 %{$self} = ();
  40         268  
248             }
249              
250             ###########################################################################
251              
252             # Plugin hook.
253             # Return this implementation object, for callers that need to know
254             # it. TODO: callers shouldn't *need* to know it!
255             # used only in test suite to get access to {store}, internal APIs.
256             #
257 1160     1160 0 1695 sub learner_get_implementation { return shift; }
258              
259             ###########################################################################
260              
261             # Plugin hook.
262             # Called in the parent process shortly before forking off child processes.
263             sub prefork_init {
264 0     0 0 0 my ($self) = @_;
265              
266 0 0 0     0 if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
267 0         0 $self->{store}->prefork_init;
268             }
269             }
270              
271             ###########################################################################
272              
273             # Plugin hook.
274             # Called in a child process shortly after being spawned.
275             sub spamd_child_init {
276 0     0 1 0 my ($self) = @_;
277              
278 0 0 0     0 if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
279 0         0 $self->{store}->spamd_child_init;
280             }
281             }
282              
283             ###########################################################################
284              
285             # Plugin hook.
286             sub check_bayes {
287 0     0 0 0 my ($self, $pms, $fulltext, $min, $max) = @_;
288              
289 0 0       0 return 0 if (!$self->{conf}->{use_learner});
290 0 0 0     0 return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
291              
292 0 0       0 if (!exists ($pms->{bayes_score})) {
293 0         0 my $timer = $self->{main}->time_method("check_bayes");
294 0         0 $pms->{bayes_score} = $self->scan($pms, $pms->{msg});
295             }
296              
297 0 0 0     0 if (defined $pms->{bayes_score} &&
      0        
      0        
      0        
298             ($min == 0 || $pms->{bayes_score} > $min) &&
299             ($max eq "undef" || $pms->{bayes_score} <= $max))
300             {
301 0 0       0 if ($self->{conf}->{detailed_bayes_score}) {
302             $pms->test_log(sprintf ("score: %3.4f, hits: %s",
303             $pms->{bayes_score},
304 0         0 $pms->{bayes_hits}));
305             }
306             else {
307 0         0 $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
308             }
309 0         0 return 1;
310             }
311              
312 0         0 return 0;
313             }
314              
315             ###########################################################################
316              
317             # Plugin hook.
318             sub learner_close {
319 6     6 1 25 my ($self, $params) = @_;
320 6         11 my $quiet = $params->{quiet};
321              
322             # do a sanity check here. Weird things happen if we remain tied
323             # after compiling; for example, spamd will never see that the
324             # number of messages has reached the bayes-scanning threshold.
325 6 50       28 if ($self->{store}->db_readable()) {
326 0 0       0 warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
327 0         0 $self->{store}->untie_db();
328             }
329             }
330              
331             ###########################################################################
332              
333             # read configuration items to control bayes behaviour. Called by
334             # BayesStore::read_db_configs().
335             sub read_db_configs {
336 42     42 0 87 my ($self) = @_;
337              
338             # use of hapaxes. Set on bayes object, since it controls prob
339             # computation.
340 42         119 $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
341             }
342             ###########################################################################
343              
344             sub ignore_message {
345 0     0 0 0 my ($self,$PMS) = @_;
346              
347 0 0       0 return 0 unless $self->{use_ignores};
348              
349 0         0 my $ig_from = $self->{main}->call_plugins ("check_wb_list",
350             { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
351 0         0 my $ig_to = $self->{main}->call_plugins ("check_wb_list",
352             { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
353              
354 0   0     0 my $ignore = $ig_from || $ig_to;
355              
356 0 0       0 dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
357              
358 0         0 return $ignore;
359             }
360              
361             ###########################################################################
362              
363             # Plugin hook.
364             sub learn_message {
365 10     10 1 55 my ($self, $params) = @_;
366 10         24 my $isspam = $params->{isspam};
367 10         23 my $msg = $params->{msg};
368 10         22 my $id = $params->{id};
369              
370 10 50       40 if (!$self->{conf}->{use_bayes}) { return; }
  0         0  
371              
372 10         49 my $msgdata = $self->get_body_from_msg ($msg);
373 10         22 my $ret;
374              
375             eval {
376 10         70 local $SIG{'__DIE__'}; # do not run user die() traps in here
377 10         39 my $timer = $self->{main}->time_method("b_learn");
378              
379 10         18 my $ok;
380 10 100       39 if ($self->{main}->{learn_to_journal}) {
381             # If we're going to learn to journal, we'll try going r/o first...
382             # If that fails for some reason, let's try going r/w. This happens
383             # if the DB doesn't exist yet.
384 2   33     11 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
385             } else {
386 8         38 $ok = $self->{store}->tie_db_writable();
387             }
388              
389 10 100       38 if ($ok) {
390 8         60 $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
391              
392 8 50       51 if (!$self->{main}->{learn_caller_will_untie}) {
393 8         54 $self->{store}->untie_db();
394             }
395             }
396 10         77 1;
397 10 50       21 } or do { # if we died, untie the dbs.
398 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
399 0         0 $self->{store}->untie_db();
400 0         0 die "bayes: (in learn) $eval_stat\n";
401             };
402              
403 10         61 return $ret;
404             }
405              
406             # this function is trapped by the wrapper above
407             sub _learn_trapped {
408 8     8   37 my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
409 8         28 my @msgid = ( $msgid );
410              
411 8 50       31 if (!defined $msgid) {
412 8         35 @msgid = $self->get_msgid($msg);
413             }
414              
415 8         33 foreach my $msgid_t ( @msgid ) {
416 12         59 my $seen = $self->{store}->seen_get ($msgid_t);
417              
418 12 100       55 if (defined ($seen)) {
419 4 100 66     52 if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
    50 33        
      66        
420 2         18 dbg("bayes: $msgid_t already learnt correctly, not learning twice");
421 2         7 return 0;
422             } elsif ($seen !~ /^[hs]$/) {
423 0         0 warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
424             } else {
425             # bug 3704: If the message was already learned, don't try learning it again.
426             # this prevents, for instance, manually learning as spam, then autolearning
427             # as ham, or visa versa.
428 2 50       13 if ($self->{main}->{learn_no_relearn}) {
429 0         0 dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
430 0         0 return 0;
431             }
432              
433 2         13 dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
434              
435             # kluge so that forget() won't untie the db on us ...
436 2         5 my $orig = $self->{main}->{learn_caller_will_untie};
437 2         5 $self->{main}->{learn_caller_will_untie} = 1;
438              
439 2         12 my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
440              
441             # reset the value post-forget() ...
442 2         10 $self->{main}->{learn_caller_will_untie} = $orig;
443            
444             # forget() gave us a fatal error, so propagate that up
445 2 50       10 if ($fatal) {
446 0         0 dbg("bayes: forget() returned a fatal error, so learn() will too");
447 0         0 return;
448             }
449             }
450              
451             # we're only going to have seen this once, so stop if it's been
452             # seen already
453 2         9 last;
454             }
455             }
456              
457             # Now that we're sure we haven't seen this message before ...
458 6         18 $msgid = $msgid[0];
459              
460 6         44 my $msgatime = $msg->receive_date();
461              
462             # If the message atime comes back as being more than 1 day in the
463             # future, something's messed up and we should revert to current time as
464             # a safety measure.
465             #
466 6 50       23 $msgatime = time if ( $msgatime - time > 86400 );
467              
468 6         42 my $tokens = $self->tokenize($msg, $msgdata);
469              
470 6         19 { my $timer = $self->{main}->time_method('b_count_change');
  6         39  
471 6 100       25 if ($isspam) {
472 4         48 $self->{store}->nspam_nham_change(1, 0);
473 4         34 $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
474             } else {
475 2         16 $self->{store}->nspam_nham_change(0, 1);
476 2         14 $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
477             }
478             }
479              
480 6 100       147 $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
481 6         50 $self->{store}->cleanup();
482              
483 6         74 $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
484             isspam => $isspam,
485             msgid => $msgid,
486             msgatime => $msgatime,
487             });
488              
489 6         69 dbg("bayes: learned '$msgid', atime: $msgatime");
490              
491 6         282 1;
492             }
493              
494             ###########################################################################
495              
496             # Plugin hook.
497             sub forget_message {
498 4     4 1 15 my ($self, $params) = @_;
499 4         11 my $msg = $params->{msg};
500 4         8 my $id = $params->{id};
501              
502 4 50       18 if (!$self->{conf}->{use_bayes}) { return; }
  0         0  
503              
504 4         17 my $msgdata = $self->get_body_from_msg ($msg);
505 4         9 my $ret;
506              
507             # we still tie for writing here, since we write to the seen db
508             # synchronously
509             eval {
510 4         28 local $SIG{'__DIE__'}; # do not run user die() traps in here
511 4         17 my $timer = $self->{main}->time_method("b_learn");
512              
513 4         11 my $ok;
514 4 50       19 if ($self->{main}->{learn_to_journal}) {
515             # If we're going to learn to journal, we'll try going r/o first...
516             # If that fails for some reason, let's try going r/w. This happens
517             # if the DB doesn't exist yet.
518 0   0     0 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
519             } else {
520 4         17 $ok = $self->{store}->tie_db_writable();
521             }
522              
523 4 50       22 if ($ok) {
524 4         24 $ret = $self->_forget_trapped ($msg, $msgdata, $id);
525              
526 4 100       22 if (!$self->{main}->{learn_caller_will_untie}) {
527 2         12 $self->{store}->untie_db();
528             }
529             }
530 4         38 1;
531 4 50       9 } or do { # if we died, untie the dbs.
532 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
533 0         0 $self->{store}->untie_db();
534 0         0 die "bayes: (in forget) $eval_stat\n";
535             };
536              
537 4         29 return $ret;
538             }
539              
540             # this function is trapped by the wrapper above
541             sub _forget_trapped {
542 4     4   15 my ($self, $msg, $msgdata, $msgid) = @_;
543 4         10 my @msgid = ( $msgid );
544 4         7 my $isspam;
545              
546 4 50       14 if (!defined $msgid) {
547 4         15 @msgid = $self->get_msgid($msg);
548             }
549              
550 4         32 while( $msgid = shift @msgid ) {
551 4         22 my $seen = $self->{store}->seen_get ($msgid);
552              
553 4 50       20 if (defined ($seen)) {
554 4 100       90 if ($seen eq 's') {
    50          
555 2         7 $isspam = 1;
556             } elsif ($seen eq 'h') {
557 2         4 $isspam = 0;
558             } else {
559 0         0 dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
560 0         0 return 0;
561             }
562              
563             # messages should only be learned once, so stop if we find a msgid
564             # which was seen before
565 4         12 last;
566             }
567             else {
568 0         0 dbg("bayes: forget: msgid $msgid not learnt, ignored");
569             }
570             }
571              
572             # This message wasn't learnt before, so return
573 4 50       17 if (!defined $isspam) {
    100          
574 0         0 dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
575 0         0 return 0;
576             }
577             elsif ($isspam) {
578 2         13 $self->{store}->nspam_nham_change (-1, 0);
579             }
580             else {
581 2         12 $self->{store}->nspam_nham_change (0, -1);
582             }
583              
584 4         42 my $tokens = $self->tokenize($msg, $msgdata);
585              
586 4 100       15 if ($isspam) {
587 2         15 $self->{store}->multi_tok_count_change (-1, 0, $tokens);
588             } else {
589 2         16 $self->{store}->multi_tok_count_change (0, -1, $tokens);
590             }
591              
592 4         107 $self->{store}->seen_delete ($msgid);
593 4         36 $self->{store}->cleanup();
594              
595 4         48 $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
596             isspam => $isspam,
597             msgid => $msgid,
598             });
599              
600 4         223 1;
601             }
602              
603             ###########################################################################
604              
605             # Plugin hook.
606             sub learner_sync {
607 2     2 1 13 my ($self, $params) = @_;
608 2 50       16 if (!$self->{conf}->{use_bayes}) { return 0; }
  0         0  
609 2         12 dbg("bayes: bayes journal sync starting");
610 2         29 $self->{store}->sync($params);
611 2         16 dbg("bayes: bayes journal sync completed");
612             }
613              
614             ###########################################################################
615              
616             # Plugin hook.
617             sub learner_expire_old_training {
618 0     0 1 0 my ($self, $params) = @_;
619 0 0       0 if (!$self->{conf}->{use_bayes}) { return 0; }
  0         0  
620 0         0 dbg("bayes: expiry starting");
621 0         0 my $timer = $self->{main}->time_method("expire_bayes");
622 0         0 $self->{store}->expire_old_tokens($params);
623 0         0 dbg("bayes: expiry completed");
624             }
625              
626             ###########################################################################
627              
628             # Plugin hook.
629             # Check to make sure we can tie() the DB, and we have enough entries to do a scan
630             # if we're told the caller will untie(), go ahead and leave the db tied.
631             sub learner_is_scan_available {
632 146     146 1 461 my ($self, $params) = @_;
633              
634 146 50       580 return 0 unless $self->{conf}->{use_bayes};
635 146 100       812 return 0 unless $self->{store}->tie_db_readonly();
636              
637             # We need the DB to stay tied, so if the journal sync occurs, don't untie!
638 6         22 my $caller_untie = $self->{main}->{learn_caller_will_untie};
639 6         25 $self->{main}->{learn_caller_will_untie} = 1;
640              
641             # Do a journal sync if necessary. Do this before the nspam_nham_get()
642             # call since the sync may cause an update in the number of messages
643             # learnt.
644 6         43 $self->_opportunistic_calls(1);
645              
646             # Reset the variable appropriately
647 6         21 $self->{main}->{learn_caller_will_untie} = $caller_untie;
648              
649 6         63 my ($ns, $nn) = $self->{store}->nspam_nham_get();
650              
651 6 50       41 if ($ns < $self->{conf}->{bayes_min_spam_num}) {
652 0         0 dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
653 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
654 0         0 $self->{store}->untie_db();
655             }
656 0         0 return 0;
657             }
658 6 50       27 if ($nn < $self->{conf}->{bayes_min_ham_num}) {
659 0         0 dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
660 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
661 0         0 $self->{store}->untie_db();
662             }
663 0         0 return 0;
664             }
665              
666 6         29 return 1;
667             }
668              
669             ###########################################################################
670              
671             sub scan {
672 4     4 0 17 my ($self, $permsgstatus, $msg) = @_;
673 4         8 my $score;
674              
675 4 50       27 return unless $self->{conf}->{use_learner};
676              
677             # When we're doing a scan, we'll guarantee that we'll do the untie,
678             # so override the global setting until we're done.
679 4         18 my $caller_untie = $self->{main}->{learn_caller_will_untie};
680 4         12 $self->{main}->{learn_caller_will_untie} = 1;
681              
682 4 50       39 goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
683              
684 4 50       22 goto skip unless $self->learner_is_scan_available();
685              
686 4         17 my ($ns, $nn) = $self->{store}->nspam_nham_get();
687              
688             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
689             ## $self->{raw_counts} = " ns=$ns nn=$nn ";
690             ## }
691              
692 4         30 dbg("bayes: corpus size: nspam = $ns, nham = $nn");
693              
694 4         7 my $msgtokens;
695 4         8 { my $timer = $self->{main}->time_method('b_tokenize');
  4         25  
696 4         21 my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
697 4         37 $msgtokens = $self->tokenize($msg, $msgdata);
698             }
699              
700 4         11 my $tokensdata;
701 4         10 { my $timer = $self->{main}->time_method('b_tok_get_all');
  4         48  
702 4         16 $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
  4         213  
703             }
704              
705 4         73 my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
706              
707 4         26 my $probabilities_ref =
708             $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
709              
710 4         18 my %pw;
711 4         10 foreach my $tokendata (@{$tokensdata}) {
  4         19  
712 1104         1325 my $prob = shift(@$probabilities_ref);
713 1104 100       1667 next unless defined $prob;
714 502         546 my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
  502         800  
715 502         1514 $pw{$token} = {
716             prob => $prob,
717             spam_count => $tok_spam,
718             ham_count => $tok_ham,
719             atime => $atime
720             };
721             }
722              
723 4         95 my @pw_keys = keys %pw;
724              
725             # If none of the tokens were found in the DB, we're going to skip
726             # this message...
727 4 50       28 if (!@pw_keys) {
728 0         0 dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
729 0         0 goto skip;
730             }
731              
732 4         8 my $tcount_total = keys %{$msgtokens};
  4         13  
733 4         10 my $tcount_learned = scalar @pw_keys;
734              
735             # Figure out the message receive time (used as atime below)
736             # If the message atime comes back as being in the future, something's
737             # messed up and we should revert to current time as a safety measure.
738             #
739 4         39 my $msgatime = $msg->receive_date();
740 4         12 my $now = time;
741 4 50       22 $msgatime = $now if ( $msgatime > $now );
742              
743 4         9 my @touch_tokens;
744 4         23 my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
745 4         29 my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
746              
747 4         367 my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
748 4         57 my $log_each_token = (would_log('dbg', 'bayes') > 1);
749              
750             # now take the most significant tokens and calculate probs using
751             # Robinson's formula.
752              
753 4         27 @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
  2933         3462  
754              
755 4 100       20 if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
  2         16  
756              
757 4         16 my @sorted;
758 4         14 foreach my $tok (@pw_keys) {
759 414 100       657 next if $tok_strength{$tok} <
760             $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
761              
762 272         331 my $pw_tok = $pw{$tok};
763 272         364 my $pw_prob = $pw_tok->{prob};
764              
765             # What's more expensive, scanning headers for HAMMYTOKENS and
766             # SPAMMYTOKENS tags that aren't there or collecting data that
767             # won't be used? Just collecting the data is certainly simpler.
768             #
769 272   50     564 my $raw_token = $msgtokens->{$tok} || "(unknown)";
770 272         308 my $s = $pw_tok->{spam_count};
771 272         303 my $n = $pw_tok->{ham_count};
772 272         309 my $a = $pw_tok->{atime};
773              
774 272 100       299 push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
  272         710  
775             [$raw_token, $pw_prob, $s, $n, $a] );
776              
777 272         395 push(@sorted, $pw_prob);
778              
779             # update the atime on this token, it proved useful
780 272         341 push(@touch_tokens, $tok);
781              
782 272 50       500 if ($log_each_token) {
783 0         0 dbg("bayes: token '$raw_token' => $pw_prob");
784             }
785             }
786              
787 4 50 50     44 if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
788             $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
789             {
790 0         0 dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
791 0         0 goto skip;
792             }
793              
794 4         38 $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
795 4         11 undef $timer_compute_prob; # end a timing section
796              
797             # Couldn't come up with a probability?
798 4 50       15 goto skip unless defined $score;
799              
800 4         67 dbg("bayes: score = $score");
801              
802             # no need to call tok_touch_all unless there were significant
803             # tokens and a score was returned
804             # we don't really care about the return value here
805              
806 4         8 { my $timer = $self->{main}->time_method('b_tok_touch_all');
  4         26  
807 4         82 $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
808             }
809              
810 4         19 my $timer_finish = $self->{main}->time_method('b_finish');
811              
812 4         13 $permsgstatus->{bayes_nspam} = $ns;
813 4         12 $permsgstatus->{bayes_nham} = $nn;
814              
815             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
816             ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
817             ## }
818              
819 4         63 $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
820             probsref => \%pw,
821             score => $score,
822             msgatime => $msgatime,
823             significant_tokens => \@touch_tokens,
824             });
825              
826             skip:
827 4 50       22 if (!defined $score) {
828 0         0 dbg("bayes: not scoring message, returning undef");
829             }
830              
831 4         17 undef $timer_compute_prob; # end a timing section if still running
832 4 50       14 if (!defined $timer_finish) {
833 4         17 $timer_finish = $self->{main}->time_method('b_finish');
834             }
835              
836             # Take any opportunistic actions we can take
837 4 50       17 if ($self->{main}->{opportunistic_expire_check_only}) {
838             # we're supposed to report on expiry only -- so do the
839             # _opportunistic_calls() run for the journal only.
840 0         0 $self->_opportunistic_calls(1);
841 0         0 $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
842             }
843             else {
844 4         25 $self->_opportunistic_calls();
845             }
846              
847             # Do any cleanup we need to do
848 4         27 $self->{store}->cleanup();
849              
850             # Reset the value accordingly
851 4         15 $self->{main}->{learn_caller_will_untie} = $caller_untie;
852              
853             # If our caller won't untie the db, we need to do it.
854 4 50       23 if (!$caller_untie) {
855 4         30 $self->{store}->untie_db();
856             }
857              
858             $permsgstatus->set_tag ('BAYESTCHAMMY',
859 4 50       27 ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
  4         39  
860             $permsgstatus->set_tag ('BAYESTCSPAMMY',
861 4 50       18 ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
  4         19  
862 4         14 $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
863 4         23 $permsgstatus->set_tag ('BAYESTC', $tcount_total);
864              
865             $permsgstatus->set_tag ('HAMMYTOKENS', sub {
866 0     0   0 my $pms = shift;
867             $self->bayes_report_make_list
868 0         0 ($pms, $pms->{bayes_token_info_hammy}, shift);
869 4         53 });
870              
871             $permsgstatus->set_tag ('SPAMMYTOKENS', sub {
872 0     0   0 my $pms = shift;
873             $self->bayes_report_make_list
874 0         0 ($pms, $pms->{bayes_token_info_spammy}, shift);
875 4         25 });
876              
877             $permsgstatus->set_tag ('TOKENSUMMARY', sub {
878 0     0   0 my $pms = shift;
879 0 0       0 if ( defined $pms->{tag_data}{BAYESTC} )
880             {
881             my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
882             - $pms->{tag_data}{BAYESTCSPAMMY}
883 0         0 - $pms->{tag_data}{BAYESTCHAMMY};
884             my $tcount_new = $pms->{tag_data}{BAYESTC}
885 0         0 - $pms->{tag_data}{BAYESTCLEARNED};
886 0         0 "Tokens: new, $tcount_new; "
887             ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
888             ."neutral, $tcount_neutral; "
889             ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
890             } else {
891 0         0 "Bayes not run.";
892             }
893 4         36 });
894              
895              
896 4         645 return $score;
897             }
898              
899             ###########################################################################
900              
901             # Plugin hook.
902             sub learner_dump_database {
903 0     0 1 0 my ($self, $params) = @_;
904 0         0 my $magic = $params->{magic};
905 0         0 my $toks = $params->{toks};
906 0         0 my $regex = $params->{regex};
907              
908             # allow dump to occur even if use_bayes disables everything else ...
909             #return 0 unless $self->{conf}->{use_bayes};
910 0 0       0 return 0 unless $self->{store}->tie_db_readonly();
911            
912 0         0 my @vars = $self->{store}->get_storage_variables();
913              
914 0         0 my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
915              
916 0         0 my $template = '%3.3f %10u %10u %10u %s'."\n";
917              
918 0 0       0 if ( $magic ) {
919 0 0       0 printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
920             or die "Error writing: $!";
921 0 0       0 printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
922             or die "Error writing: $!";
923 0 0       0 printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
924             or die "Error writing: $!";
925 0 0       0 printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
926             or die "Error writing: $!";
927 0 0       0 printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
928             or die "Error writing: $!";
929 0 0       0 if ( $bv >= 2 ) {
930 0 0       0 printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
931             or die "Error writing: $!";
932             }
933 0 0       0 if ( $bv < 2 ) {
934 0 0       0 printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
935             or die "Error writing: $!";
936             }
937 0 0       0 if ( $bv >= 2 ) {
938 0 0       0 printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
939             or die "Error writing: $!";
940             }
941 0 0       0 printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
942             or die "Error writing: $!";
943 0 0       0 if ( $bv >= 2 ) {
944 0 0       0 printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
945             or die "Error writing: $!";
946              
947 0 0       0 printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
948             or die "Error writing: $!";
949             }
950             }
951              
952 0 0       0 if ( $toks ) {
953             # let the store sort out the db_toks
954 0         0 $self->{store}->dump_db_toks($template, $regex, @vars);
955             }
956              
957 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
958 0         0 $self->{store}->untie_db();
959             }
960 0         0 return 1;
961             }
962              
963             ###########################################################################
964             # TODO: these are NOT public, but the test suite needs to call them.
965              
966             sub get_msgid {
967 14     14 0 45 my ($self, $msg) = @_;
968              
969 14         22 my @msgid;
970              
971 14         47 my $msgid = $msg->get_header("Message-Id");
972 14 50 33     187 if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
      33        
973             # remove \r and < and > prefix/suffixes
974 14         36 chomp $msgid;
975 14         63 $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
  14         71  
976 14         49 push(@msgid, $msgid);
977             }
978              
979             # Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
980              
981             # Use sha1_hex(Date: and top N bytes of body)
982             # where N is MIN(1024 bytes, 1/2 of body length)
983             #
984 14         42 my $date = $msg->get_header("Date");
985 14 50 33     70 $date = "None" if (!defined $date || $date eq ''); # No Date?
986              
987             #Removed per bug 5185
988             #my @rcvd = $msg->get_header("Received");
989             #my $rcvd = $rcvd[$#rcvd];
990             #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
991              
992             # Make a copy since pristine_body is a reference ...
993 14         64 my $body = join('', $msg->get_pristine_body());
994              
995 14 50       47 if (length($body) > 64) { # Small Body?
996 14 50       32 my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
997 14         42 substr($body, $keep) = '';
998             }
999              
1000             #Stripping all CR and LF so that testing midstream from MTA and post delivery don't
1001             #generate different id's simply because of LF<->CR<->CRLF changes.
1002 14         257 $body =~ s/[\r\n]//g;
1003              
1004 14         217 unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
1005              
1006 14 50       69 return wantarray ? @msgid : $msgid[0];
1007             }
1008              
1009             sub get_body_from_msg {
1010 20     20 0 73 my ($self, $msg) = @_;
1011              
1012 20 50       74 if (!ref $msg) {
1013             # I have no idea why this seems to happen. TODO
1014 0         0 warn "bayes: msg not a ref: '$msg'";
1015 0         0 return { };
1016             }
1017              
1018             my $permsgstatus =
1019 20         174 Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1020 20         153 $msg->extract_message_metadata ($permsgstatus);
1021 20         116 my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1022 20         110 $permsgstatus->finish();
1023              
1024 20 50       76 if (!defined $msgdata) {
1025             # why?!
1026 0         0 warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1027 0         0 return { };
1028             }
1029              
1030 20         80 return $msgdata;
1031             }
1032              
1033             sub _get_msgdata_from_permsgstatus {
1034 24     24   63 my ($self, $pms) = @_;
1035              
1036 24         63 my $t_src = $self->{conf}->{bayes_token_sources};
1037 24         59 my $msgdata = { };
1038             $msgdata->{bayes_token_body} =
1039 24 50       184 $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1040             $msgdata->{bayes_token_inviz} =
1041 24 50       158 $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1042             $msgdata->{bayes_mimepart_digests} =
1043 24 50       69 $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1044 24         75 @{$msgdata->{bayes_token_uris}} =
1045 24 50       171 $pms->get_uri_list() if $t_src->{uri};
1046 24         62 return $msgdata;
1047             }
1048              
1049             ###########################################################################
1050              
1051             # The calling functions expect a uniq'ed array of tokens ...
1052             sub tokenize {
1053 16     16 0 56 my ($self, $msg, $msgdata) = @_;
1054              
1055 16         49 my $t_src = $self->{conf}->{bayes_token_sources};
1056 16         32 my @tokens;
1057              
1058             # visible tokens from the body
1059 16 50       60 if ($msgdata->{bayes_token_body}) {
1060             my(@t) = map($self->_tokenize_line ($_, '', 1),
1061 16         34 @{$msgdata->{bayes_token_body}} );
  16         92  
1062 16         111 dbg("bayes: tokenized body: %d tokens", scalar @t);
1063 16         1099 push(@tokens, @t);
1064             }
1065             # the URI list
1066 16 50       64 if ($msgdata->{bayes_token_uris}) {
1067             my(@t) = map($self->_tokenize_line ($_, '', 2),
1068 16         32 @{$msgdata->{bayes_token_uris}} );
  16         79  
1069 16         78 dbg("bayes: tokenized uri: %d tokens", scalar @t);
1070 16         87 push(@tokens, @t);
1071             }
1072             # add invisible tokens
1073 16 50       59 if ($msgdata->{bayes_token_inviz}) {
1074 16         36 my $tokprefix;
1075 16         24 if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
  16         48  
1076 16         33 if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1077 16 50       39 if (defined $tokprefix) {
1078             my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1),
1079 16         31 @{$msgdata->{bayes_token_inviz}} );
  16         48  
1080 16         50 dbg("bayes: tokenized invisible: %d tokens", scalar @t);
1081 16         43 push(@tokens, @t);
1082             }
1083             }
1084              
1085             # add digests and Content-Type of all MIME parts
1086 16 50       55 if ($msgdata->{bayes_mimepart_digests}) {
1087 0         0 my %shorthand = ( # some frequent MIME part contents for human readability
1088             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1089             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1090             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1091             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1092             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1093             '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1094             '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1095             );
1096             my(@t) = map('MIME:' . ($shorthand{$_} || $_),
1097 0   0     0 @{ $msgdata->{bayes_mimepart_digests} });
  0         0  
1098 0         0 dbg("bayes: tokenized mime parts: %d tokens", scalar @t);
1099 0         0 dbg("bayes: mime-part token %s", $_) for @t;
1100 0         0 push(@tokens, @t);
1101             }
1102              
1103             # Tokenize the headers
1104 16 50       59 if ($t_src->{header}) {
1105 16         28 my(@t);
1106 16         81 my %hdrs = $self->_tokenize_headers ($msg);
1107 16         98 while( my($prefix, $value) = each %hdrs ) {
1108 200         522 push(@t, $self->_tokenize_line ($value, "H$prefix:", 0));
1109             }
1110 16         74 dbg("bayes: tokenized header: %d tokens", scalar @t);
1111 16         341 push(@tokens, @t);
1112             }
1113              
1114             # Go ahead and uniq the array, skip null tokens (can happen sometimes)
1115             # generate an SHA1 hash and take the lower 40 bits as our token
1116 16         44 my %tokens;
1117 16         67 foreach my $token (@tokens) {
1118             # dbg("bayes: token: %s", $token);
1119 6746 50       28857 $tokens{substr(sha1($token), -5)} = $token if $token ne '';
1120             }
1121              
1122             # return the keys == tokens ...
1123 16         399 return \%tokens;
1124             }
1125              
1126             sub _tokenize_line {
1127 538     538   738 my $self = $_[0];
1128 538         663 my $tokprefix = $_[2];
1129 538         592 my $region = $_[3];
1130 538         810 local ($_) = $_[1];
1131              
1132 538         663 my @rettokens;
1133              
1134             # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1135             # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1136             # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1137              
1138             ### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1139              
1140             ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1141 538         1838 s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
1142             [\xC0-\xDF][\x80-\xBF] |
1143             [\xE0-\xEF][\x80-\xBF]{2} |
1144             [\xF0-\xF4][\x80-\xBF]{3} |
1145 4844 100       13359 [\xA1-\xFF] ) | . }
1146             { defined $1 ? $1 : ' ' }xsge;
1147             # should we also turn NBSP ( \xC2\xA0 ) into space?
1148              
1149             # DO split on "..." or "--" or "---"; common formatting error resulting in
1150             # hapaxes. Keep the separator itself as a token, though, as long ones can
1151 538         1222 # be good spamsigns.
1152 538         792 s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1153             s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1154 538         595  
1155 538 100 100     1378 if (IGNORE_TITLE_CASE) {
1156             if ($region == 1 || $region == 2) {
1157             # lower-case Title Case at start of a full-stop-delimited line (as would
1158 338         2058 # be seen in a Western language).
  242         1825  
1159             s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1160             }
1161             }
1162 538         1449  
1163             my $magic_re = $self->{store}->get_magic_re();
1164              
1165             # Note that split() in scope of 'use bytes' results in words with utf8 flag
1166             # cleared, even if the source string has perl characters semantics !!!
1167             # Is this really still desirable?
1168 538         2483  
1169 8598         12986 foreach my $token (split) {
1170 8598         11129 $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1171             $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1172              
1173             # Skip false magic tokens
1174             # TVD: we need to do a defined() check since SQL doesn't have magic
1175             # tokens, so the SQL BayesStore returns undef. I really want a way
1176             # of optimizing that out, but I haven't come up with anything yet.
1177 8598 50 33     28654 #
1178             next if ( defined $magic_re && $token =~ /$magic_re/ );
1179              
1180 8598         10334 # *do* keep 3-byte tokens; there's some solid signs in there
1181             my $len = length($token);
1182              
1183             # but extend the stop-list. These are squarely in the gray
1184             # area, and it just slows us down to record them.
1185             # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1186 8598 100 100     24715 #
1187             next if $len < 3 ||
1188             ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
1189              
1190 5236 100 100     10043 # are we in the body? If so, apply some body-specific breakouts
1191 3540 100       7234 if ($region == 1 || $region == 2) {
    100          
1192 44         175 if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1193             push (@rettokens, $self->_tokenize_mail_addrs ($token));
1194             }
1195 110         253 elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1196 110         152 push (@rettokens, "UD:".$token); # the full token
  110         468  
1197 132         473 my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1198             push (@rettokens, "UD:".$1); # UD = URL domain
1199             }
1200             }
1201             }
1202              
1203             # note: do not trim down overlong tokens if they contain '*'. This is
1204             # used as part of split tokens such as "HTo:D*net" indicating that
1205             # the domain ".net" appeared in the To header.
1206 5236 100 100     9040 #
1207             if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1208 256 50       492  
1209             if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1210             # Bug 7135
1211 0         0 # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1212             my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
1213 0 0       0 [\x80-\xBF]{2} )/xsg;
1214 0         0 if (@t) {
1215 0         0 push (@rettokens, map($tokprefix.'u8:'.$_, @t));
1216             next;
1217             }
1218             }
1219 256         267  
1220             if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
1221             # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1222             # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1223             # to me! (jm)
1224             while ($token =~ s/^(..?)//) {
1225             push (@rettokens, $tokprefix.'8:'.$1);
1226             }
1227             next;
1228             }
1229 256 100 100     924  
      100        
      100        
      50        
      66        
1230             if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1231             || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1232             || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1233             {
1234             # if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1235             # Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1236             # length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1237             # http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1238             # "sk:" stands for "skip".
1239             # Bug 7141: retain seven UTF-8 chars (or other bytes),
1240 232         1571 # if followed by at least two bytes
1241             $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
1242             [\xC0-\xDF][\x80-\xBF] |
1243             [\xE0-\xEF][\x80-\xBF]{2} |
1244             [\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
1245             .{2,} \z }{sk:$1}xs;
1246             ## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1247             }
1248             }
1249              
1250 5236 100 100     9603 # decompose tokens? do this after shortening long tokens
1251 3540         3969 if ($region == 1 || $region == 2) {
1252 3540 100       6049 if (DECOMPOSE_BODY_TOKENS) {
1253 364         485 if ($token =~ /[^\w:\*]/) {
1254 364         1065 my $decompd = $token; # "Foo!"
1255 364         768 $decompd =~ s/[^\w:\*]//gs;
1256             push (@rettokens, $tokprefix.$decompd); # "Foo"
1257             }
1258 3540 100       5798  
1259 712         913 if ($token =~ /[A-Z]/) {
  712         914  
1260 712         1229 my $decompd = $token; $decompd = lc $decompd;
1261             push (@rettokens, $tokprefix.$decompd); # "foo!"
1262 712 100       1371  
1263 76         165 if ($token =~ /[^\w:\*]/) {
1264 76         164 $decompd =~ s/[^\w:\*]//gs;
1265             push (@rettokens, $tokprefix.$decompd); # "foo"
1266             }
1267             }
1268             }
1269             }
1270 5236         10090  
1271             push (@rettokens, $tokprefix.$token);
1272             }
1273 538         3801  
1274             return @rettokens;
1275             }
1276              
1277 16     16   46 sub _tokenize_headers {
1278             my ($self, $msg) = @_;
1279 16         33  
1280             my %parsed;
1281              
1282 16         34 my %user_ignore;
  16         87  
1283             $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1284              
1285 16         51 # get headers in array context
1286             my @hdrs;
1287 16         104 my @rcvdlines;
1288             for ($msg->get_all_headers()) {
1289 202 100       406 # first, keep a copy of Received headers, so we can strip down to last 2
1290 52         87 if (/^Received:/i) {
1291 52         78 push(@rcvdlines, $_);
1292             next;
1293             }
1294 150 100       1820 # and now skip lines for headers we don't want (including all Received)
1295 102         132 next if /^${IGNORED_HDRS}:/i;
1296 102         207 next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1297             push(@hdrs, $_);
1298 16         142 }
1299             push(@hdrs, $msg->get_all_metadata());
1300              
1301             # and re-add the last 2 received lines: usually a good source of
1302 16 100       63 # spamware tokens and HELO names.
  14         38  
1303 16 100       49 if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
  14         36  
1304             if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1305 16         42  
1306 194 50       524 for (@hdrs) {
1307 194         521 next unless /\S/;
1308             my ($hdr, $val) = split(/:/, $_, 2);
1309              
1310             # remove user-specified headers here, after Received, in case they
1311 194 50       406 # want to ignore that too
1312             next if exists $user_ignore{lc $hdr};
1313              
1314 194   50     303 # Prep the header value
1315 194         267 $val ||= '';
1316             chomp($val);
1317              
1318 194 100       1255 # special tokenization for some headers:
    100          
    100          
    100          
    100          
    50          
1319 16         90 if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
1320             $val = $self->_pre_chew_message_id ($val);
1321             }
1322             elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
1323             (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
1324 60         157 {
1325             $val = $self->_pre_chew_addr_header ($val);
1326             }
1327 28         96 elsif ($hdr eq 'Received') {
1328             $val = $self->_pre_chew_received ($val);
1329             }
1330 4         23 elsif ($hdr eq 'Content-Type') {
1331             $val = $self->_pre_chew_content_type ($val);
1332             }
1333 2         10 elsif ($hdr eq 'MIME-Version') {
1334             $val =~ s/1\.0//; # totally innocuous
1335             }
1336 0         0 elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1337             $val = "1"; # just mark the presence, they create lots of hapaxen
1338             }
1339 194         285  
1340 194 100       435 if (MAP_HEADERS_MID) {
1341 16         50 if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
1342             $parsed{"*MI"} = $val;
1343             }
1344 194         216 }
1345 194 100       430 if (MAP_HEADERS_FROMTOCC) {
1346 32         73 if ($hdr =~ /^(?:From|To|Cc)$/i) {
1347             $parsed{"*Ad"} = $val;
1348             }
1349 194         220 }
1350 194 50       440 if (MAP_HEADERS_USERAGENT) {
1351 0         0 if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
1352             $parsed{"*UA"} = $val;
1353             }
1354             }
1355              
1356 194 100       408 # replace hdr name with "compressed" version if possible
1357 126         232 if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1358             $hdr = $HEADER_NAME_COMPRESSION{$hdr};
1359             }
1360 194 100       336  
1361 26         72 if (exists $parsed{$hdr}) {
1362             $parsed{$hdr} .= " ".$val;
1363 168         369 } else {
1364             $parsed{$hdr} = $val;
1365 194 50       395 }
1366 0         0 if (would_log('dbg', 'bayes') > 1) {
1367             dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1368             }
1369             }
1370 16         204  
1371             return %parsed;
1372             }
1373              
1374 4     4   14 sub _pre_chew_content_type {
1375             my ($self, $val) = @_;
1376              
1377 4 50       30 # hopefully this will retain good bits without too many hapaxen
1378 0         0 if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1379 0 0       0 my $boundary = $1;
1380 0         0 $boundary = '' if !defined $boundary; # avoid a warning
1381             $boundary =~ s/[a-fA-F0-9]/H/gs;
1382 0         0 # break up blocks of separator chars so they become their own tokens
1383 0         0 $boundary =~ s/([-_\.=]+)/ $1 /gs;
1384             $val .= $boundary;
1385             }
1386              
1387 4         36 # stop-list words for Content-Type header: these wind up totally gray
1388             $val =~ s/\b(?:text|charset)\b//;
1389 4         13  
1390             $val;
1391             }
1392              
1393 16     16   47 sub _pre_chew_message_id {
1394             my ($self, $val) = @_;
1395             # we can (a) get rid of a lot of hapaxen and (b) increase the token
1396             # specificity by pre-parsing some common formats.
1397              
1398 16         52 # Outlook Express format:
1399             $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1400             ([0-9a-f]{4})[0-9a-f]{4}\$
1401             ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1402              
1403 16         30 # Exim:
1404             $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1405              
1406 16         39 # Sendmail:
1407             $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1408             [A-F0-9]{10,12}\@//gx;
1409              
1410             # try to split Message-ID segments on probable ID boundaries. Note that
1411             # Outlook message-ids seem to contain a server identifier ID in the last
1412             # 8 bytes before the @. Make sure this becomes its own token, it's a
1413 16         90 # great spam-sign for a learning system! Be sure to split on ".".
1414 16         44 $val =~ s/[^_A-Za-z0-9]/ /g;
1415             $val;
1416             }
1417              
1418 28     28   59 sub _pre_chew_received {
1419             my ($self, $val) = @_;
1420              
1421             # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1422             # and valid-format RFC-822/2822 dates
1423 28         106  
1424 28         88 $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1425 28         185 $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1426 28         70 $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1427             $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1428 28         211  
1429             $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
1430             [0-3\s]?[0-9]\s
1431             (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
1432             (?:19|20)?[0-9]{2}\s
1433             [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1434             (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
1435             //gx;
1436              
1437             # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1438             # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1439             # (on both sides)
1440             # also make a dup with the full IP, as fodder for
1441 28         214 # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
1442 30 50 33     218 $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
      33        
1443 0         0 if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
1444             $1.$2.$3.$4.
1445             " ip*".$1.$2.$3.$4." ";
1446 30         287 } else {
1447             $1.$2.$3.
1448             " ip*".$1.$2.$3.$4." ";
1449             }
1450             }gex;
1451              
1452             # trim these: they turn out as the most common tokens, but with a
1453 28         294 # prob of about .5. waste of space!
1454             $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
1455 28         70  
1456             $val;
1457             }
1458              
1459 60     60   114 sub _pre_chew_addr_header {
1460 60         98 my ($self, $val) = @_;
1461             local ($_);
1462 60         191  
1463 60         86 my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1464 60         111 my @toks;
1465 48         110 foreach (@addrs) {
1466             push (@toks, $self->_tokenize_mail_addrs ($_));
1467 60         212 }
1468             return join (' ', @toks);
1469             }
1470              
1471 92     92   179 sub _tokenize_mail_addrs {
1472             my ($self, $addr) = @_;
1473 92 50       399  
1474 92         131 ($addr =~ /(.+)\@(.+)$/) or return ();
1475 92         334 my @toks;
1476 92         169 push(@toks, "U*".$1, "D*".$2);
  92         379  
  84         303  
1477 92         305 $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1478             return @toks;
1479             }
1480              
1481              
1482             ###########################################################################
1483              
1484             # compute the probability that a token is spammish for each token
1485 4     4   22 sub _compute_prob_for_all_tokens {
1486 4         9 my ($self, $tokensdata, $ns, $nn) = @_;
1487             my @probabilities;
1488 4 50 33     47  
1489             return if !$ns || !$nn;
1490 4         27  
1491 4         11 my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1492             if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1493             $threshold = 10;
1494 4 50       27 }
1495 0         0 if (!$self->{use_hapaxes}) {
1496             $threshold = 2;
1497             }
1498 4         10  
  4         15  
1499 1104         1326 foreach my $tokendata (@{$tokensdata}) {
1500 1104         1184 my $s = $tokendata->[1]; # spam count
1501 1104         1193 my $n = $tokendata->[2]; # ham count
1502             my $prob;
1503 22     22   225  
  22         55  
  22         25589  
1504 1104 100       1589 no warnings 'uninitialized'; # treat undef as zero in addition
1505             if ($s + $n >= $threshold) {
1506             # ignoring low-freq tokens, also covers the (!$s && !$n) case
1507              
1508             # my $ratios = $s / $ns;
1509             # my $ration = $n / $nn;
1510             # $prob = $ratios / ($ration + $ratios);
1511 502         685 #
1512             $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1513 502         526  
1514             if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1515             # use Robinson's f(x) equation for low-n tokens, instead of just
1516 502         590 # ignoring them
1517 502         707 my $robn = $s + $n;
1518             $prob =
1519             ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1520             /
1521             ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1522             }
1523             }
1524              
1525             # 'log_raw_counts' is used to log the raw data for the Bayes equations
1526             # during a mass-check, allowing the S and X constants to be optimized
1527             # quickly without requiring re-tokenization of the messages for each
1528             # attempt. There's really no need for this code to be uncommented in
1529             # normal use, however. It has never been publicly documented, so
1530             # commenting it out is fine. ;)
1531             #
1532             ## if ($self->{log_raw_counts}) {
1533             ## $self->{raw_counts} .= " s=$s,n=$n ";
1534             ## }
1535 1104         1622  
1536             push(@probabilities, $prob);
1537 4         19 }
1538             return \@probabilities;
1539             }
1540              
1541             # compute the probability that a token is spammish
1542 0     0   0 sub _compute_prob_for_token {
1543             my ($self, $token, $ns, $nn, $s, $n) = @_;
1544              
1545             # we allow the caller to give us the token information, just
1546 0 0 0     0 # to save a potentially expensive lookup
1547 0         0 if (!defined($s) || !defined($n)) {
1548             ($s, $n, undef) = $self->{store}->tok_get($token);
1549 0 0 0     0 }
1550             return if !$s && !$n;
1551 0         0  
1552             my $probabilities_ref =
1553             $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1554 0         0  
1555             return $probabilities_ref->[0];
1556             }
1557              
1558             ###########################################################################
1559             # If a token is neither hammy nor spammy, return 0.
1560             # For a spammy token, return the minimum number of additional ham messages
1561             # it would have had to appear in to no longer be spammy. Hammy tokens
1562             # are handled similarly. That's what the function does (at the time
1563             # of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1564             # more useful if it returned the number of /additional/ ham messages
1565             # a spammy token would have to appear in to no longer be spammy but I
1566             # fear that might require the solution to a cubic equation, and I
1567             # just don't have the time for that now.
1568              
1569 0     0   0 sub _compute_declassification_distance {
1570             my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1571 0 0 0     0  
1572             return 0 if $ns == 0 && $nn == 0;
1573 0         0  
1574 0 0       0 if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
  0 0       0  
1575             if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1576 0 0 0     0  
1577 0 0       0 return 0 if $Ns == 0 || $Nn == 0;
1578             return 0 if abs( $prob - 0.5 ) <
1579             $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1580 0 0       0  
1581 0         0 my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1582             my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1583 0         0  
1584             return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1585             unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1586 0         0  
1587 0         0 my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1588 0         0 my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1589 0         0 my $a = $Nb * ( 1 - $p );
1590 0         0 my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1591 0         0 my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1592 0 0       0 my $discrim = $b * $b - 4 * $a * $c;
1593 0         0 my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1594             my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1595              
1596 0 0       0 # This shouldn't be necessary. Should not be < 1
1597             return $dd_exact < 1 ? 1 : int($dd_exact);
1598             }
1599              
1600             ###########################################################################
1601              
1602 10     10   30 sub _opportunistic_calls {
1603             my($self, $journal_only) = @_;
1604              
1605 10 50       133 # If we're not already tied, abort.
1606 0         0 if (!$self->{store}->db_readable()) {
1607 0         0 dbg("bayes: opportunistic call attempt failed, DB not readable");
1608             return;
1609             }
1610              
1611 10         60 # Is an expire or sync running?
1612 10 50 33     50 my $running_expire = $self->{store}->get_running_expire_tok();
1613 0         0 if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1614 0         0 dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1615             return;
1616             }
1617              
1618 10 50 66     223 # handle expiry and syncing
    50          
1619 0         0 if (!$journal_only && $self->{store}->expiry_due()) {
1620             dbg("bayes: opportunistic call found expiry due");
1621              
1622             # sync will bring the DB R/W as necessary, and the expire will remove
1623 0         0 # the running_expire token, may untie as well.
1624             $self->{main}->{bayes_scanner}->sync(1,1);
1625             }
1626 0         0 elsif ( $self->{store}->sync_due() ) {
1627             dbg("bayes: opportunistic call found journal sync due");
1628              
1629 0         0 # sync will bring the DB R/W as necessary, may untie as well
1630             $self->{main}->{bayes_scanner}->sync(1,0);
1631              
1632 0 0       0 # We can only remove the running_expire token if we're doing R/W
1633 0         0 if ($self->{store}->db_writable()) {
1634             $self->{store}->remove_running_expire_tok();
1635             }
1636             }
1637 10         30  
1638             return;
1639             }
1640              
1641             ###########################################################################
1642              
1643 63     63 1 206 sub learner_new {
1644             my ($self) = @_;
1645 63         142  
1646 63         332 my $store;
1647 63 100       300 my $module = untaint_var($self->{conf}->{bayes_store_module});
1648             $module = 'Mail::SpamAssassin::BayesStore::DBM' if !$module;
1649 63         266  
1650 63         206 dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1651             undef $self->{store}; # DESTROYs previous object, if any
1652             eval '
1653             require '.$module.';
1654             $store = '.$module.'->new($self);
1655 63 50       6736 1;
1656 0 0       0 ' or do {
  0         0  
1657 0         0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1658             die "bayes: learner_new $module new() failed: $eval_stat\n";
1659             };
1660 63         485  
1661 63         174 dbg("bayes: learner_new: got store=%s", $store);
1662             $self->{store} = $store;
1663 63         270  
1664             $self;
1665             }
1666              
1667             ###########################################################################
1668              
1669 0     0 0   sub bayes_report_make_list {
1670 0 0         my ($self, $pms, $info, $param) = @_;
1671             return "Tokens not available." unless defined $info;
1672 0   0        
1673             my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
1674 0            
1675             my %formats = (
1676             short => '$t',
1677             Short => 'Token: \"$t\"',
1678             compact => '$p-$D--$t',
1679             Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1680             medium => '$p-$D-$N--$t',
1681             long => '$p-$d--${h}h-${s}s--${a}d--$t',
1682             Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1683             );
1684 0 0          
1685             my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1686 0 0          
1687             return "Invalid format, must be one of: ".join(",",keys %formats)
1688             unless defined $raw_fmt;
1689 0            
1690 0 0         my $fmt = '"'.$raw_fmt.'"';
1691 0 0         my $amt = $limit < @$info ? $limit : @$info;
1692             return "" unless $amt;
1693 0            
1694 0           my $ns = $pms->{bayes_nspam};
1695 0 0   0     my $nh = $pms->{bayes_nham};
  0            
1696 0           my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1697             my $now = time;
1698              
1699 0           join ', ', map {
1700 0           my($t,$prob,$s,$h,$u) = @$_;
1701 0           my $a = int(($now - $u)/(3600 * 24));
1702 0           my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1703 0           my $p = sprintf "%.3f", $prob;
1704 0 0         my $n = $s + $h;
1705 0           my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1706 0           my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1707 0           eval $fmt; ## no critic
  0            
1708             } @{$info}[0..$amt-1];
1709             }
1710              
1711             1;