File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Criterion Covered Total %
statement 557 758 73.4
branch 161 350 46.0
condition 45 119 37.8
subroutine 58 71 81.6
pod 11 21 52.3
total 832 1319 63.0


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21              
22             =head1 DESCRIPTION
23              
24             This is a Bayesian-style probabilistic classifier, using an algorithm based on
25             the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26              
27             http://www.paulgraham.com/spam.html
28              
29             It also incorporates some other aspects taken from Graham Robinson's webpage
30             on the subject at:
31              
32             http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33              
34             And the chi-square probability combiner as described here:
35              
36             http://www.linuxjournal.com/print.php?sid=6467
37              
38             The results are incorporated into SpamAssassin as the BAYES_* rules.
39              
40             =head1 METHODS
41              
42             =cut
43              
44              
45             use strict;
46 22     22   141 use warnings;
  22         44  
  22         640  
47 22     22   107 # use bytes;
  22         39  
  22         702  
48             use re 'taint';
49 22     22   124  
  22         56  
  22         1865  
50             BEGIN {
51             eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
52 22         114 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  22         1353  
  22         554  
53 22 50   22   75 }
  0         0  
  0         0  
54              
55             use Mail::SpamAssassin;
56 22     22   124 use Mail::SpamAssassin::Plugin;
  22         46  
  22         543  
57 22     22   108 use Mail::SpamAssassin::PerMsgStatus;
  22         60  
  22         659  
58 22     22   126 use Mail::SpamAssassin::Logger;
  22         42  
  22         599  
59 22     22   116 use Mail::SpamAssassin::Util qw(untaint_var);
  22         46  
  22         1203  
60 22     22   195  
  22         38  
  22         996  
61             # pick ONLY ONE of these combining implementations.
62             use Mail::SpamAssassin::Bayes::CombineChi;
63 22     22   6640 # use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
  22         57  
  22         12009  
64              
65             our @ISA = qw(Mail::SpamAssassin::Plugin);
66              
67             # Which headers should we scan for tokens? Don't use all of them, as it's easy
68             # to pick up spurious clues from some. What we now do is use all of them
69             # *less* these well-known headers; that way we can pick up spammers' tracking
70             # headers (which are obviously not well-known in advance!).
71              
72             # Received is handled specially
73             our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
74             |Delivered-To |Delivery-Date
75             |(?:X-)?Envelope-To
76             |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
77              
78             |Subject # not worth a tiny gain vs. to db size increase
79              
80             # Date: can provide invalid cues if your spam corpus is
81             # older/newer than ham
82             |Date
83              
84             # List headers: ignore. a spamfiltering mailing list will
85             # become a nonspam sign.
86             |X-List|(?:X-)?Mailing-List
87             |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
88             |Unsubscribe|Host|Id|Manager|Admin|Comment
89             |Name|Url)
90             |X-Unsub(?:scribe)?
91             |X-Mailman-Version |X-Been[Tt]here |X-Loop
92             |Mail-Followup-To
93             |X-eGroups-(?:Return|From)
94             |X-MDMailing-List
95             |X-XEmacs-List
96             |X-Sympa-To
97              
98             # gatewayed through mailing list (thanks to Allen Smith)
99             |(?:X-)?Resent-(?:From|To|Date)
100             |(?:X-)?Original-(?:From|To|Date)
101              
102             # Spamfilter/virus-scanner headers: too easy to chain from
103             # these
104             |X-MailScanner(?:-SpamCheck)?
105             |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
106             |X-Antispam |X-RBL-Warning |X-Mailscanner
107             |X-MDaemon-Deliver-To |X-Virus-Scanned
108             |X-Mass-Check-Id
109             |X-Pyzor |X-DCC-\S{2,25}-Metrics
110             |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
111             |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
112             |X-SpamCop-[^:]+
113             |X-SMTPD |(?:X-)?Spam-Apparently-To
114             |SPAM |X-Perlmx-Spam
115             |X-Bogosity
116              
117             # some noisy Outlook headers that add no good clues:
118             |Content-Class |Thread-(?:Index|Topic)
119             |X-Original[Aa]rrival[Tt]ime
120              
121             # Annotations from IMAP, POP, and MH:
122             |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded
123             |Lines |Content-Length
124             |X-UIDL? |X-IMAPbase
125              
126             # Annotations from Bugzilla
127             |X-Bugzilla-[^:]+
128              
129             # Annotations from VM: (thanks to Allen Smith)
130             |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
131             |Summary-Format|VHeader|v\d-Data|Message-Order)
132              
133             # Annotations from Gnus:
134             | X-Gnus-Mail-Source
135             | Xref
136              
137             )}x;
138              
139             # Note only the presence of these headers, in order to reduce the
140             # hapaxen they generate.
141             our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
142             |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
143             |D(?:KIM|omainKey)-Signature
144             )}ix;
145              
146             # tweaks tested as of Nov 18 2002 by jm posted to -devel at
147             # http://sourceforge.net/p/spamassassin/mailman/message/12977556/
148             # for results. The winners are now the default settings.
149             use constant IGNORE_TITLE_CASE => 1;
150 22     22   178 use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
  22         51  
  22         1074  
151 22     22   113 use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
  22         41  
  22         827  
152 22     22   135 use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         43  
  22         791  
153 22     22   111  
  22         39  
  22         811  
154             # tweaks by jm on May 12 2003, see -devel email at
155             # http://sourceforge.net/p/spamassassin/mailman/message/14844556/
156             use constant PRE_CHEW_ADDR_HEADERS => 1;
157 22     22   150 use constant CHEW_BODY_URIS => 1;
  22         40  
  22         758  
158 22     22   102 use constant CHEW_BODY_MAILADDRS => 1;
  22         40  
  22         763  
159 22     22   105 use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         37  
  22         755  
160 22     22   105 use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  22         44  
  22         725  
161 22     22   105 use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
  22         41  
  22         709  
162 22     22   104 use constant IGNORE_MSGID_TOKENS => 0;
  22         37  
  22         695  
163 22     22   100  
  22         38  
  22         708  
164             # tweaks of 12 March 2004, see bug 2129.
165             use constant DECOMPOSE_BODY_TOKENS => 1;
166 22     22   100 use constant MAP_HEADERS_MID => 1;
  22         41  
  22         732  
167 22     22   102 use constant MAP_HEADERS_FROMTOCC => 1;
  22         40  
  22         758  
168 22     22   103 use constant MAP_HEADERS_USERAGENT => 1;
  22         40  
  22         705  
169 22     22   99  
  22         41  
  22         794  
170             # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
171             use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
172 22     22   111 use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
  22         40  
  22         851  
173 22     22   126  
  22         40  
  22         2432  
174             # We store header-mined tokens in the db with a "HHeaderName:val" format.
175             # some headers may contain lots of gibberish tokens, so allow a little basic
176             # compression by mapping the header name at least here. these are the headers
177             # which appear with the most frequency in my db. note: this doesn't have to
178             # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
179             # tokens from multiple different headers may impact accuracy, so might as well
180             # avoid this if possible. These are the top ones from my corpus, BTW (jm).
181             our %HEADER_NAME_COMPRESSION = (
182             'Message-Id' => '*m',
183             'Message-ID' => '*M',
184             'Received' => '*r',
185             'User-Agent' => '*u',
186             'References' => '*f',
187             'In-Reply-To' => '*i',
188             'From' => '*F',
189             'Reply-To' => '*R',
190             'Return-Path' => '*p',
191             'Return-path' => '*rp',
192             'X-Mailer' => '*x',
193             'X-Authentication-Warning' => '*a',
194             'Organization' => '*o',
195             'Organisation' => '*o',
196             'Content-Type' => '*ct',
197             'Content-Disposition' => '*cd',
198             'Content-Transfer-Encoding' => '*ce',
199             'x-spam-relays-trusted' => '*RT',
200             'x-spam-relays-untrusted' => '*RU',
201             );
202              
203             # How many seconds should the opportunistic_expire lock be valid?
204             our $OPPORTUNISTIC_LOCK_VALID = 300;
205              
206             # Should we use the Robinson f(w) equation from
207             # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
208             # It gives better results, in that scores are more likely to distribute
209             # into the <0.5 range for nonspam and >0.5 for spam.
210             use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
211 22     22   143  
  22         43  
  22         1034  
212             # How many of the most significant tokens should we use for the p(w)
213             # calculation?
214             use constant N_SIGNIFICANT_TOKENS => 150;
215 22     22   123  
  22         38  
  22         1044  
216             # How many significant tokens are required for a classifier score to
217             # be considered usable?
218             use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
219 22     22   121  
  22         47  
  22         1021  
220             # How long a token should we hold onto? (note: German speakers typically
221             # will require a longer token than English ones.)
222             use constant MAX_TOKEN_LENGTH => 15;
223 22     22   127  
  22         39  
  22         154837  
224             ###########################################################################
225              
226             my $class = shift;
227             my ($main) = @_;
228 63     63 1 218  
229 63         200 $class = ref($class) || $class;
230             my $self = $class->SUPER::new($main);
231 63   33     420 bless ($self, $class);
232 63         348  
233 63         176 $self->{main} = $main;
234             $self->{conf} = $main->{conf};
235 63         218 $self->{use_ignores} = 1;
236 63         158  
237 63         180 $self->register_eval_rule("check_bayes");
238             $self;
239 63         263 }
240 63         519  
241             my $self = shift;
242             if ($self->{store}) {
243             $self->{store}->untie_db();
244 40     40 1 113 }
245 40 100       171 %{$self} = ();
246 39         259 }
247              
248 40         88 ###########################################################################
  40         242  
249              
250             # Plugin hook.
251             # Return this implementation object, for callers that need to know
252             # it. TODO: callers shouldn't *need* to know it!
253             # used only in test suite to get access to {store}, internal APIs.
254             #
255              
256             ###########################################################################
257              
258 1160     1160 0 1838 # Plugin hook.
259             # Called in the parent process shortly before forking off child processes.
260             my ($self) = @_;
261              
262             if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
263             $self->{store}->prefork_init;
264             }
265 0     0 0 0 }
266              
267 0 0 0     0 ###########################################################################
268 0         0  
269             # Plugin hook.
270             # Called in a child process shortly after being spawned.
271             my ($self) = @_;
272              
273             if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
274             $self->{store}->spamd_child_init;
275             }
276             }
277 0     0 1 0  
278             ###########################################################################
279 0 0 0     0  
280 0         0 # Plugin hook.
281             my ($self, $pms, $fulltext, $min, $max) = @_;
282              
283             return 0 if (!$self->{conf}->{use_learner});
284             return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
285              
286             if (!exists ($pms->{bayes_score})) {
287             my $timer = $self->{main}->time_method("check_bayes");
288 0     0 0 0 $pms->{bayes_score} = $self->scan($pms, $pms->{msg});
289             }
290 0 0       0  
291 0 0 0     0 if (defined $pms->{bayes_score} &&
292             ($min == 0 || $pms->{bayes_score} > $min) &&
293 0 0       0 ($max eq "undef" || $pms->{bayes_score} <= $max))
294 0         0 {
295 0         0 if ($self->{conf}->{detailed_bayes_score}) {
296             $pms->test_log(sprintf ("score: %3.4f, hits: %s",
297             $pms->{bayes_score},
298 0 0 0     0 $pms->{bayes_hits}));
      0        
      0        
      0        
299             }
300             else {
301             $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
302 0 0       0 }
303             return 1;
304             }
305 0         0  
306             return 0;
307             }
308 0         0  
309             ###########################################################################
310 0         0  
311             # Plugin hook.
312             my ($self, $params) = @_;
313 0         0 my $quiet = $params->{quiet};
314              
315             # do a sanity check here. Weird things happen if we remain tied
316             # after compiling; for example, spamd will never see that the
317             # number of messages has reached the bayes-scanning threshold.
318             if ($self->{store}->db_readable()) {
319             warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
320 6     6 1 13 $self->{store}->untie_db();
321 6         9 }
322             }
323              
324             ###########################################################################
325              
326 6 50       20 # read configuration items to control bayes behaviour. Called by
327 0 0       0 # BayesStore::read_db_configs().
328 0         0 my ($self) = @_;
329              
330             # use of hapaxes. Set on bayes object, since it controls prob
331             # computation.
332             $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
333             }
334             ###########################################################################
335              
336             my ($self,$PMS) = @_;
337 42     42 0 100  
338             return 0 unless $self->{use_ignores};
339              
340             my $ig_from = $self->{main}->call_plugins ("check_wb_list",
341 42         160 { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
342             my $ig_to = $self->{main}->call_plugins ("check_wb_list",
343             { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
344              
345             my $ignore = $ig_from || $ig_to;
346 0     0 0 0  
347             dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
348 0 0       0  
349             return $ignore;
350 0         0 }
351              
352 0         0 ###########################################################################
353              
354             # Plugin hook.
355 0   0     0 my ($self, $params) = @_;
356             my $isspam = $params->{isspam};
357 0 0       0 my $msg = $params->{msg};
358             my $id = $params->{id};
359 0         0  
360             if (!$self->{conf}->{use_bayes}) { return; }
361              
362             my $msgdata = $self->get_body_from_msg ($msg);
363             my $ret;
364              
365             eval {
366 10     10 1 28 local $SIG{'__DIE__'}; # do not run user die() traps in here
367 10         27 my $timer = $self->{main}->time_method("b_learn");
368 10         18  
369 10         21 my $ok;
370             if ($self->{main}->{learn_to_journal}) {
371 10 50       46 # If we're going to learn to journal, we'll try going r/o first...
  0         0  
372             # If that fails for some reason, let's try going r/w. This happens
373 10         82 # if the DB doesn't exist yet.
374 10         26 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
375             } else {
376             $ok = $self->{store}->tie_db_writable();
377 10         76 }
378 10         57  
379             if ($ok) {
380 10         24 $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
381 10 100       36  
382             if (!$self->{main}->{learn_caller_will_untie}) {
383             $self->{store}->untie_db();
384             }
385 2   33     18 }
386             1;
387 8         63 } or do { # if we died, untie the dbs.
388             my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
389             $self->{store}->untie_db();
390 10 100       36 die "bayes: (in learn) $eval_stat\n";
391 8         50 };
392              
393 8 50       50 return $ret;
394 8         59 }
395              
396             # this function is trapped by the wrapper above
397 10         97 my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
398 10 50       19 my @msgid = ( $msgid );
399 0 0       0  
  0         0  
400 0         0 if (!defined $msgid) {
401 0         0 @msgid = $self->get_msgid($msg);
402             }
403              
404 10         69 foreach my $msgid_t ( @msgid ) {
405             my $seen = $self->{store}->seen_get ($msgid_t);
406              
407             if (defined ($seen)) {
408             if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
409 8     8   30 dbg("bayes: $msgid_t already learnt correctly, not learning twice");
410 8         31 return 0;
411             } elsif ($seen !~ /^[hs]$/) {
412 8 50       25 warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
413 8         48 } else {
414             # bug 3704: If the message was already learned, don't try learning it again.
415             # this prevents, for instance, manually learning as spam, then autolearning
416 8         36 # as ham, or visa versa.
417 12         73 if ($self->{main}->{learn_no_relearn}) {
418             dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
419 12 100       66 return 0;
420 4 100 66     83 }
    50 33        
      66        
421 2         17  
422 2         10 dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
423              
424 0         0 # kluge so that forget() won't untie the db on us ...
425             my $orig = $self->{main}->{learn_caller_will_untie};
426             $self->{main}->{learn_caller_will_untie} = 1;
427              
428             my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
429 2 50       13  
430 0         0 # reset the value post-forget() ...
431 0         0 $self->{main}->{learn_caller_will_untie} = $orig;
432            
433             # forget() gave us a fatal error, so propagate that up
434 2         18 if ($fatal) {
435             dbg("bayes: forget() returned a fatal error, so learn() will too");
436             return;
437 2         13 }
438 2         7 }
439              
440 2         18 # we're only going to have seen this once, so stop if it's been
441             # seen already
442             last;
443 2         12 }
444             }
445              
446 2 50       11 # Now that we're sure we haven't seen this message before ...
447 0         0 $msgid = $msgid[0];
448 0         0  
449             my $msgatime = $msg->receive_date();
450              
451             # If the message atime comes back as being more than 1 day in the
452             # future, something's messed up and we should revert to current time as
453             # a safety measure.
454 2         11 #
455             $msgatime = time if ( $msgatime - time > 86400 );
456              
457             my $tokens = $self->tokenize($msg, $msgdata);
458              
459 6         22 { my $timer = $self->{main}->time_method('b_count_change');
460             if ($isspam) {
461 6         53 $self->{store}->nspam_nham_change(1, 0);
462             $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
463             } else {
464             $self->{store}->nspam_nham_change(0, 1);
465             $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
466             }
467 6 50       35 }
468              
469 6         43 $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
470             $self->{store}->cleanup();
471 6         18  
  6         67  
472 6 100       31 $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
473 4         61 isspam => $isspam,
474 4         44 msgid => $msgid,
475             msgatime => $msgatime,
476 2         19 });
477 2         19  
478             dbg("bayes: learned '$msgid', atime: $msgatime");
479              
480             1;
481 6 100       240 }
482 6         61  
483             ###########################################################################
484 6         120  
485             # Plugin hook.
486             my ($self, $params) = @_;
487             my $msg = $params->{msg};
488             my $id = $params->{id};
489              
490 6         92 if (!$self->{conf}->{use_bayes}) { return; }
491              
492 6         567 my $msgdata = $self->get_body_from_msg ($msg);
493             my $ret;
494              
495             # we still tie for writing here, since we write to the seen db
496             # synchronously
497             eval {
498             local $SIG{'__DIE__'}; # do not run user die() traps in here
499 4     4 1 14 my $timer = $self->{main}->time_method("b_learn");
500 4         10  
501 4         9 my $ok;
502             if ($self->{main}->{learn_to_journal}) {
503 4 50       21 # If we're going to learn to journal, we'll try going r/o first...
  0         0  
504             # If that fails for some reason, let's try going r/w. This happens
505 4         23 # if the DB doesn't exist yet.
506 4         11 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
507             } else {
508             $ok = $self->{store}->tie_db_writable();
509             }
510              
511 4         27 if ($ok) {
512 4         23 $ret = $self->_forget_trapped ($msg, $msgdata, $id);
513              
514 4         8 if (!$self->{main}->{learn_caller_will_untie}) {
515 4 50       24 $self->{store}->untie_db();
516             }
517             }
518             1;
519 0   0     0 } or do { # if we died, untie the dbs.
520             my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
521 4         32 $self->{store}->untie_db();
522             die "bayes: (in forget) $eval_stat\n";
523             };
524 4 50       17  
525 4         25 return $ret;
526             }
527 4 100       26  
528 2         17 # this function is trapped by the wrapper above
529             my ($self, $msg, $msgdata, $msgid) = @_;
530             my @msgid = ( $msgid );
531 4         43 my $isspam;
532 4 50       10  
533 0 0       0 if (!defined $msgid) {
  0         0  
534 0         0 @msgid = $self->get_msgid($msg);
535 0         0 }
536              
537             while( $msgid = shift @msgid ) {
538 4         33 my $seen = $self->{store}->seen_get ($msgid);
539              
540             if (defined ($seen)) {
541             if ($seen eq 's') {
542             $isspam = 1;
543 4     4   16 } elsif ($seen eq 'h') {
544 4         16 $isspam = 0;
545 4         9 } else {
546             dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
547 4 50       16 return 0;
548 4         21 }
549              
550             # messages should only be learned once, so stop if we find a msgid
551 4         24 # which was seen before
552 4         27 last;
553             }
554 4 50       26 else {
555 4 100       25 dbg("bayes: forget: msgid $msgid not learnt, ignored");
    50          
556 2         7 }
557             }
558 2         7  
559             # This message wasn't learnt before, so return
560 0         0 if (!defined $isspam) {
561 0         0 dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
562             return 0;
563             }
564             elsif ($isspam) {
565             $self->{store}->nspam_nham_change (-1, 0);
566 4         12 }
567             else {
568             $self->{store}->nspam_nham_change (0, -1);
569 0         0 }
570              
571             my $tokens = $self->tokenize($msg, $msgdata);
572              
573             if ($isspam) {
574 4 50       24 $self->{store}->multi_tok_count_change (-1, 0, $tokens);
    100          
575 0         0 } else {
576 0         0 $self->{store}->multi_tok_count_change (0, -1, $tokens);
577             }
578              
579 2         13 $self->{store}->seen_delete ($msgid);
580             $self->{store}->cleanup();
581              
582 2         16 $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
583             isspam => $isspam,
584             msgid => $msgid,
585 4         39 });
586              
587 4 100       25 1;
588 2         80 }
589              
590 2         24 ###########################################################################
591              
592             # Plugin hook.
593 4         198 my ($self, $params) = @_;
594 4         36 if (!$self->{conf}->{use_bayes}) { return 0; }
595             dbg("bayes: bayes journal sync starting");
596 4         59 $self->{store}->sync($params);
597             dbg("bayes: bayes journal sync completed");
598             }
599              
600             ###########################################################################
601 4         413  
602             # Plugin hook.
603             my ($self, $params) = @_;
604             if (!$self->{conf}->{use_bayes}) { return 0; }
605             dbg("bayes: expiry starting");
606             my $timer = $self->{main}->time_method("expire_bayes");
607             $self->{store}->expire_old_tokens($params);
608 2     2 1 8 dbg("bayes: expiry completed");
609 2 50       11 }
  0         0  
610 2         15  
611 2         37 ###########################################################################
612 2         10  
613             # Plugin hook.
614             # Check to make sure we can tie() the DB, and we have enough entries to do a scan
615             # if we're told the caller will untie(), go ahead and leave the db tied.
616             my ($self, $params) = @_;
617              
618             return 0 unless $self->{conf}->{use_bayes};
619 0     0 1 0 return 0 unless $self->{store}->tie_db_readonly();
620 0 0       0  
  0         0  
621 0         0 # We need the DB to stay tied, so if the journal sync occurs, don't untie!
622 0         0 my $caller_untie = $self->{main}->{learn_caller_will_untie};
623 0         0 $self->{main}->{learn_caller_will_untie} = 1;
624 0         0  
625             # Do a journal sync if necessary. Do this before the nspam_nham_get()
626             # call since the sync may cause an update in the number of messages
627             # learnt.
628             $self->_opportunistic_calls(1);
629              
630             # Reset the variable appropriately
631             $self->{main}->{learn_caller_will_untie} = $caller_untie;
632              
633 146     146 1 414 my ($ns, $nn) = $self->{store}->nspam_nham_get();
634              
635 146 50       543 if ($ns < $self->{conf}->{bayes_min_spam_num}) {
636 146 100       691 dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
637             if (!$self->{main}->{learn_caller_will_untie}) {
638             $self->{store}->untie_db();
639 6         20 }
640 6         25 return 0;
641             }
642             if ($nn < $self->{conf}->{bayes_min_ham_num}) {
643             dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
644             if (!$self->{main}->{learn_caller_will_untie}) {
645 6         29 $self->{store}->untie_db();
646             }
647             return 0;
648 6         19 }
649              
650 6         55 return 1;
651             }
652 6 50       34  
653 0         0 ###########################################################################
654 0 0       0  
655 0         0 my ($self, $permsgstatus, $msg) = @_;
656             my $score;
657 0         0  
658             return unless $self->{conf}->{use_learner};
659 6 50       22  
660 0         0 # When we're doing a scan, we'll guarantee that we'll do the untie,
661 0 0       0 # so override the global setting until we're done.
662 0         0 my $caller_untie = $self->{main}->{learn_caller_will_untie};
663             $self->{main}->{learn_caller_will_untie} = 1;
664 0         0  
665             goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
666              
667 6         22 goto skip unless $self->learner_is_scan_available();
668              
669             my ($ns, $nn) = $self->{store}->nspam_nham_get();
670              
671             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
672             ## $self->{raw_counts} = " ns=$ns nn=$nn ";
673 4     4 0 21 ## }
674 4         11  
675             dbg("bayes: corpus size: nspam = $ns, nham = $nn");
676 4 50       17  
677             my $msgtokens;
678             { my $timer = $self->{main}->time_method('b_tokenize');
679             my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
680 4         10 $msgtokens = $self->tokenize($msg, $msgdata);
681 4         12 }
682              
683 4 50       31 my $tokensdata;
684             { my $timer = $self->{main}->time_method('b_tok_get_all');
685 4 50       29 $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
686             }
687 4         18  
688             my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
689              
690             my $probabilities_ref =
691             $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
692              
693 4         51 my %pw;
694             foreach my $tokendata (@{$tokensdata}) {
695 4         11 my $prob = shift(@$probabilities_ref);
696 4         10 next unless defined $prob;
  4         18  
697 4         16 my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
698 4         24 $pw{$token} = {
699             prob => $prob,
700             spam_count => $tok_spam,
701 4         13 ham_count => $tok_ham,
702 4         9 atime => $atime
  4         31  
703 4         11 };
  4         197  
704             }
705              
706 4         54 my @pw_keys = keys %pw;
707              
708 4         20 # If none of the tokens were found in the DB, we're going to skip
709             # this message...
710             if (!@pw_keys) {
711 4         14 dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
712 4         13 goto skip;
  4         18  
713 1104         1149 }
714 1104 100       1583  
715 502         477 my $tcount_total = keys %{$msgtokens};
  502         665  
716 502         1353 my $tcount_learned = scalar @pw_keys;
717              
718             # Figure out the message receive time (used as atime below)
719             # If the message atime comes back as being in the future, something's
720             # messed up and we should revert to current time as a safety measure.
721             #
722             my $msgatime = $msg->receive_date();
723             my $now = time;
724 4         83 $msgatime = $now if ( $msgatime > $now );
725              
726             my @touch_tokens;
727             my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
728 4 50       25 my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
729 0         0  
730 0         0 my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
731             my $log_each_token = (would_log('dbg', 'bayes') > 1);
732              
733 4         8 # now take the most significant tokens and calculate probs using
  4         11  
734 4         9 # Robinson's formula.
735              
736             @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
737              
738             if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
739              
740 4         29 my @sorted;
741 4         16 foreach my $tok (@pw_keys) {
742 4 50       16 next if $tok_strength{$tok} <
743             $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
744 4         9  
745 4         23 my $pw_tok = $pw{$tok};
746 4         16 my $pw_prob = $pw_tok->{prob};
747              
748 4         306 # What's more expensive, scanning headers for HAMMYTOKENS and
749 4         47 # SPAMMYTOKENS tags that aren't there or collecting data that
750             # won't be used? Just collecting the data is certainly simpler.
751             #
752             my $raw_token = $msgtokens->{$tok} || "(unknown)";
753             my $s = $pw_tok->{spam_count};
754 4         31 my $n = $pw_tok->{ham_count};
  2935         3018  
755             my $a = $pw_tok->{atime};
756 4 100       17  
  2         12  
757             push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
758 4         9 [$raw_token, $pw_prob, $s, $n, $a] );
759 4         8  
760 414 100       653 push(@sorted, $pw_prob);
761              
762             # update the atime on this token, it proved useful
763 282         290 push(@touch_tokens, $tok);
764 282         298  
765             if ($log_each_token) {
766             dbg("bayes: token '$raw_token' => $pw_prob");
767             }
768             }
769              
770 282   50     599 if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
771 282         318 $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
772 282         288 {
773 282         290 dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
774             goto skip;
775 282 100       275 }
  282         806  
776              
777             $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
778 282         391 undef $timer_compute_prob; # end a timing section
779              
780             # Couldn't come up with a probability?
781 282         313 goto skip unless defined $score;
782              
783 282 50       602 dbg("bayes: score = $score");
784 0         0  
785             # no need to call tok_touch_all unless there were significant
786             # tokens and a score was returned
787             # we don't really care about the return value here
788 4 50 50     34  
789             { my $timer = $self->{main}->time_method('b_tok_touch_all');
790             $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
791 0         0 }
792 0         0  
793             my $timer_finish = $self->{main}->time_method('b_finish');
794              
795 4         33 $permsgstatus->{bayes_nspam} = $ns;
796 4         13 $permsgstatus->{bayes_nham} = $nn;
797              
798             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
799 4 50       10 ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
800             ## }
801 4         46  
802             $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
803             probsref => \%pw,
804             score => $score,
805             msgatime => $msgatime,
806             significant_tokens => \@touch_tokens,
807 4         7 });
  4         20  
808 4         59  
809             skip:
810             if (!defined $score) {
811 4         18 dbg("bayes: not scoring message, returning undef");
812             }
813 4         11  
814 4         28 undef $timer_compute_prob; # end a timing section if still running
815             if (!defined $timer_finish) {
816             $timer_finish = $self->{main}->time_method('b_finish');
817             }
818              
819             # Take any opportunistic actions we can take
820 4         50 if ($self->{main}->{opportunistic_expire_check_only}) {
821             # we're supposed to report on expiry only -- so do the
822             # _opportunistic_calls() run for the journal only.
823             $self->_opportunistic_calls(1);
824             $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
825             }
826             else {
827             $self->_opportunistic_calls();
828 4 50       19 }
829 0         0  
830             # Do any cleanup we need to do
831             $self->{store}->cleanup();
832 4         15  
833 4 50       12 # Reset the value accordingly
834 4         26 $self->{main}->{learn_caller_will_untie} = $caller_untie;
835              
836             # If our caller won't untie the db, we need to do it.
837             if (!$caller_untie) {
838 4 50       16 $self->{store}->untie_db();
839             }
840              
841 0         0 $permsgstatus->set_tag ('BAYESTCHAMMY',
842 0         0 ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
843             $permsgstatus->set_tag ('BAYESTCSPAMMY',
844             ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
845 4         20 $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
846             $permsgstatus->set_tag ('BAYESTC', $tcount_total);
847              
848             $permsgstatus->set_tag ('HAMMYTOKENS', sub {
849 4         28 my $pms = shift;
850             $self->bayes_report_make_list
851             ($pms, $pms->{bayes_token_info_hammy}, shift);
852 4         13 });
853              
854             $permsgstatus->set_tag ('SPAMMYTOKENS', sub {
855 4 50       15 my $pms = shift;
856 4         38 $self->bayes_report_make_list
857             ($pms, $pms->{bayes_token_info_spammy}, shift);
858             });
859              
860 4 50       17 $permsgstatus->set_tag ('TOKENSUMMARY', sub {
  4         26  
861             my $pms = shift;
862 4 50       14 if ( defined $pms->{tag_data}{BAYESTC} )
  4         22  
863 4         12 {
864 4         13 my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
865             - $pms->{tag_data}{BAYESTCSPAMMY}
866             - $pms->{tag_data}{BAYESTCHAMMY};
867 0     0   0 my $tcount_new = $pms->{tag_data}{BAYESTC}
868             - $pms->{tag_data}{BAYESTCLEARNED};
869 0         0 "Tokens: new, $tcount_new; "
870 4         37 ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
871             ."neutral, $tcount_neutral; "
872             ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
873 0     0   0 } else {
874             "Bayes not run.";
875 0         0 }
876 4         21 });
877              
878              
879 0     0   0 return $score;
880 0 0       0 }
881              
882             ###########################################################################
883              
884 0         0 # Plugin hook.
885             my ($self, $params) = @_;
886 0         0 my $magic = $params->{magic};
887 0         0 my $toks = $params->{toks};
888             my $regex = $params->{regex};
889              
890             # allow dump to occur even if use_bayes disables everything else ...
891             #return 0 unless $self->{conf}->{use_bayes};
892 0         0 return 0 unless $self->{store}->tie_db_readonly();
893            
894 4         42 my @vars = $self->{store}->get_storage_variables();
895              
896             my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
897 4         548  
898             my $template = '%3.3f %10u %10u %10u %s'."\n";
899              
900             if ( $magic ) {
901             printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
902             or die "Error writing: $!";
903             printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
904 0     0 1 0 or die "Error writing: $!";
905 0         0 printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
906 0         0 or die "Error writing: $!";
907 0         0 printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
908             or die "Error writing: $!";
909             printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
910             or die "Error writing: $!";
911 0 0       0 if ( $bv >= 2 ) {
912             printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
913 0         0 or die "Error writing: $!";
914             }
915 0         0 if ( $bv < 2 ) {
916             printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
917 0         0 or die "Error writing: $!";
918             }
919 0 0       0 if ( $bv >= 2 ) {
920 0 0       0 printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
921             or die "Error writing: $!";
922 0 0       0 }
923             printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
924 0 0       0 or die "Error writing: $!";
925             if ( $bv >= 2 ) {
926 0 0       0 printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
927             or die "Error writing: $!";
928 0 0       0  
929             printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
930 0 0       0 or die "Error writing: $!";
931 0 0       0 }
932             }
933              
934 0 0       0 if ( $toks ) {
935 0 0       0 # let the store sort out the db_toks
936             $self->{store}->dump_db_toks($template, $regex, @vars);
937             }
938 0 0       0  
939 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
940             $self->{store}->untie_db();
941             }
942 0 0       0 return 1;
943             }
944 0 0       0  
945 0 0       0 ###########################################################################
946             # TODO: these are NOT public, but the test suite needs to call them.
947              
948 0 0       0 my ($self, $msg) = @_;
949              
950             my @msgid;
951              
952             my $msgid = $msg->get_header("Message-Id");
953 0 0       0 if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
954             # remove \r and < and > prefix/suffixes
955 0         0 chomp $msgid;
956             $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
957             push(@msgid, $msgid);
958 0 0       0 }
959 0         0  
960             # Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
961 0         0  
962             # Use sha1_hex(Date: and top N bytes of body)
963             # where N is MIN(1024 bytes, 1/2 of body length)
964             #
965             my $date = $msg->get_header("Date");
966             $date = "None" if (!defined $date || $date eq ''); # No Date?
967              
968 14     14 0 42 #Removed per bug 5185
969             #my @rcvd = $msg->get_header("Received");
970 14         28 #my $rcvd = $rcvd[$#rcvd];
971             #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
972 14         97  
973 14 50 33     315 # Make a copy since pristine_body is a reference ...
      33        
974             my $body = join('', $msg->get_pristine_body());
975 14         57  
976 14         92 if (length($body) > 64) { # Small Body?
  14         81  
977 14         52 my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
978             substr($body, $keep) = '';
979             }
980              
981             #Stripping all CR and LF so that testing midstream from MTA and post delivery don't
982             #generate different id's simply because of LF<->CR<->CRLF changes.
983             $body =~ s/[\r\n]//g;
984              
985 14         57 unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
986 14 50 33     140  
987             return wantarray ? @msgid : $msgid[0];
988             }
989              
990             my ($self, $msg) = @_;
991              
992             if (!ref $msg) {
993             # I have no idea why this seems to happen. TODO
994 14         96 warn "bayes: msg not a ref: '$msg'";
995             return { };
996 14 50       76 }
997 14 50       61  
998 14         44 my $permsgstatus =
999             Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1000             $msg->extract_message_metadata ($permsgstatus);
1001             my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1002             $permsgstatus->finish();
1003 14         249  
1004             if (!defined $msgdata) {
1005 14         295 # why?!
1006             warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1007 14 50       111 return { };
1008             }
1009              
1010             return $msgdata;
1011 20     20 0 65 }
1012              
1013 20 50       86 my ($self, $pms) = @_;
1014              
1015 0         0 my $t_src = $self->{conf}->{bayes_token_sources};
1016 0         0 my $msgdata = { };
1017             $msgdata->{bayes_token_body} =
1018             $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1019             $msgdata->{bayes_token_inviz} =
1020 20         205 $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1021 20         137 $msgdata->{bayes_mimepart_digests} =
1022 20         97 $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1023 20         113 @{$msgdata->{bayes_token_uris}} =
1024             $pms->get_uri_list() if $t_src->{uri};
1025 20 50       71 return $msgdata;
1026             }
1027 0         0  
1028 0         0 ###########################################################################
1029              
1030             # The calling functions expect a uniq'ed array of tokens ...
1031 20         123 my ($self, $msg, $msgdata) = @_;
1032              
1033             my $t_src = $self->{conf}->{bayes_token_sources};
1034              
1035 24     24   63 # visible tokens from the body
1036             my @tokens_body;
1037 24         75 if ($msgdata->{bayes_token_body}) {
1038 24         66 foreach (@{$msgdata->{bayes_token_body}}) {
1039             push(@tokens_body, $self->_tokenize_line ($_, '', 1));
1040 24 50       189 last if scalar @tokens_body >= 50000;
1041             }
1042 24 50       158 dbg("bayes: tokenized body: %d tokens", scalar @tokens_body);
1043             }
1044 24 50       84 # the URI list
1045 24         77 my @tokens_uri;
1046 24 50       186 if ($msgdata->{bayes_token_uris}) {
1047 24         79 foreach (@{$msgdata->{bayes_token_uris}}) {
1048             push(@tokens_uri, $self->_tokenize_line ($_, '', 2));
1049             last if scalar @tokens_uri >= 10000;
1050             }
1051             dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri);
1052             }
1053             # add invisible tokens
1054 16     16 0 58 my @tokens_inviz;
1055             if ($msgdata->{bayes_token_inviz}) {
1056 16         60 my $tokprefix;
1057             if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
1058             if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1059 16         36 if (defined $tokprefix) {
1060 16 50       63 foreach (@{$msgdata->{bayes_token_inviz}}) {
1061 16         30 push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1));
  16         82  
1062 278         668 last if scalar @tokens_inviz >= 50000;
1063 278 50       1119 }
1064             }
1065 16         136 dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz);
1066             }
1067              
1068 16         41 # add digests and Content-Type of all MIME parts
1069 16 50       91 my @tokens_mimepart;
1070 16         39 if ($msgdata->{bayes_mimepart_digests}) {
  16         65  
1071 54         135 my %shorthand = ( # some frequent MIME part contents for human readability
1072 54 50       172 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1073             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1074 16         60 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1075             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1076             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1077 16         29 '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1078 16 50       57 '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1079 16         27 );
1080 16         26 @tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_),
  16         49  
1081 16         33 @{ $msgdata->{bayes_mimepart_digests} });
1082 16 50       60 dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart);
1083 16         31 dbg("bayes: mime-part token %s", $_) for @tokens_mimepart;
  16         56  
1084 0         0 }
1085 0 0       0  
1086             # Tokenize the headers
1087             my @tokens_header;
1088 16         44 if ($t_src->{header}) {
1089             my %hdrs = $self->_tokenize_headers ($msg);
1090             while( my($prefix, $value) = each %hdrs ) {
1091             push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0));
1092 16         33 last if scalar @tokens_header >= 10000;
1093 16 50       58 }
1094 0         0 dbg("bayes: tokenized header: %d tokens", scalar @tokens_header);
1095             }
1096              
1097             # Go ahead and uniq the array, skip null tokens (can happen sometimes)
1098             # generate an SHA1 hash and take the lower 40 bits as our token
1099             my %tokens;
1100             foreach my $token
1101             (@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header)
1102             {
1103             # dbg("bayes: token: %s", $token);
1104 0   0     0 $tokens{substr(sha1($token), -5)} = $token if $token ne '';
  0         0  
1105 0         0 }
1106 0         0  
1107             # return the keys == tokens ...
1108             return \%tokens;
1109             }
1110 16         34  
1111 16 50       70 my $self = $_[0];
1112 16         67 my $tokprefix = $_[2];
1113 16         136 my $region = $_[3];
1114 200         548 local ($_) = $_[1];
1115 200 50       1020  
1116             my @rettokens;
1117 16         101  
1118             # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1119             # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1120             # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1121              
1122 16         56 ### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1123 16         77  
1124             ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1125             s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
1126             [\xC0-\xDF][\x80-\xBF] |
1127 6714 50       35244 [\xE0-\xEF][\x80-\xBF]{2} |
1128             [\xF0-\xF4][\x80-\xBF]{3} |
1129             [\xA1-\xFF] ) | . }
1130             { defined $1 ? $1 : ' ' }xsge;
1131 16         879 # should we also turn NBSP ( \xC2\xA0 ) into space?
1132              
1133             # DO split on "..." or "--" or "---"; common formatting error resulting in
1134             # hapaxes. Keep the separator itself as a token, though, as long ones can
1135 532     532   701 # be good spamsigns.
1136 532         711 s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1137 532         584 s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1138 532         1044  
1139             if (IGNORE_TITLE_CASE) {
1140 532         586 if ($region == 1 || $region == 2) {
1141             # lower-case Title Case at start of a full-stop-delimited line (as would
1142             # be seen in a Western language).
1143             s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1144             }
1145             }
1146              
1147             my $magic_re = $self->{store}->get_magic_re();
1148              
1149 532         1823 # Note that split() in scope of 'use bytes' results in words with utf8 flag
1150             # cleared, even if the source string has perl characters semantics !!!
1151             # Is this really still desirable?
1152              
1153 4844 100       14792 foreach my $token (split) {
1154             $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1155             $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1156              
1157             # Skip false magic tokens
1158             # TVD: we need to do a defined() check since SQL doesn't have magic
1159 532         1474 # tokens, so the SQL BayesStore returns undef. I really want a way
1160 532         756 # of optimizing that out, but I haven't come up with anything yet.
1161             #
1162 532         552 next if ( defined $magic_re && $token =~ /$magic_re/ );
1163 532 100 100     1388  
1164             # *do* keep 3-byte tokens; there's some solid signs in there
1165             my $len = length($token);
1166 332         1868  
  242         2033  
1167             # but extend the stop-list. These are squarely in the gray
1168             # area, and it just slows us down to record them.
1169             # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1170 532         1705 #
1171             next if $len < 3 ||
1172             ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
1173              
1174             # are we in the body? If so, apply some body-specific breakouts
1175             if ($region == 1 || $region == 2) {
1176 532         4139 if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1177 8592         15089 push (@rettokens, $self->_tokenize_mail_addrs ($token));
1178 8592         11820 }
1179             elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1180             push (@rettokens, "UD:".$token); # the full token
1181             my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1182             push (@rettokens, "UD:".$1); # UD = URL domain
1183             }
1184             }
1185 8592 50 33     29609 }
1186              
1187             # note: do not trim down overlong tokens if they contain '*'. This is
1188 8592         14048 # used as part of split tokens such as "HTo:D*net" indicating that
1189             # the domain ".net" appeared in the To header.
1190             #
1191             if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1192              
1193             if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1194 8592 100 100     34595 # Bug 7135
1195             # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1196             my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
1197             [\x80-\xBF]{2} )/xsg;
1198 5230 100 100     11367 if (@t) {
1199 3534 100       7816 push (@rettokens, map($tokprefix.'u8:'.$_, @t));
    100          
1200 38         152 next;
1201             }
1202             }
1203 110         306  
1204 110         164 if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
  110         472  
1205 132         529 # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1206             # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1207             # to me! (jm)
1208             while ($token =~ s/^(..?)//) {
1209             push (@rettokens, $tokprefix.'8:'.$1);
1210             }
1211             next;
1212             }
1213              
1214 5230 100 100     10121 if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1215             || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1216 250 50       580 || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1217             {
1218             # if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1219 0         0 # Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1220             # length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1221 0 0       0 # http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1222 0         0 # "sk:" stands for "skip".
1223 0         0 # Bug 7141: retain seven UTF-8 chars (or other bytes),
1224             # if followed by at least two bytes
1225             $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
1226             [\xC0-\xDF][\x80-\xBF] |
1227 250         272 [\xE0-\xEF][\x80-\xBF]{2} |
1228             [\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
1229             .{2,} \z }{sk:$1}xs;
1230             ## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1231             }
1232             }
1233              
1234             # decompose tokens? do this after shortening long tokens
1235             if ($region == 1 || $region == 2) {
1236             if (DECOMPOSE_BODY_TOKENS) {
1237 250 100 100     949 if ($token =~ /[^\w:\*]/) {
      100        
      100        
      50        
      66        
1238             my $decompd = $token; # "Foo!"
1239             $decompd =~ s/[^\w:\*]//gs;
1240             push (@rettokens, $tokprefix.$decompd); # "Foo"
1241             }
1242              
1243             if ($token =~ /[A-Z]/) {
1244             my $decompd = $token; $decompd = lc $decompd;
1245             push (@rettokens, $tokprefix.$decompd); # "foo!"
1246              
1247             if ($token =~ /[^\w:\*]/) {
1248 232         1478 $decompd =~ s/[^\w:\*]//gs;
1249             push (@rettokens, $tokprefix.$decompd); # "foo"
1250             }
1251             }
1252             }
1253             }
1254              
1255             push (@rettokens, $tokprefix.$token);
1256             }
1257              
1258 5230 100 100     9781 return @rettokens;
1259 3534         3702 }
1260 3534 100       6427  
1261 358         708 my ($self, $msg) = @_;
1262 358         1178  
1263 358         990 my %parsed;
1264              
1265             my %user_ignore;
1266 3534 100       6969 $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1267 712         1219  
  712         1082  
1268 712         1670 # get headers in array context
1269             my @hdrs;
1270 712 100       1962 my @rcvdlines;
1271 76         265 for ($msg->get_all_headers()) {
1272 76         240 # first, keep a copy of Received headers, so we can strip down to last 2
1273             if (/^Received:/i) {
1274             push(@rcvdlines, $_);
1275             next;
1276             }
1277             # and now skip lines for headers we don't want (including all Received)
1278 5230         17162 next if /^${IGNORED_HDRS}:/i;
1279             next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1280             push(@hdrs, $_);
1281 532         5352 }
1282             push(@hdrs, $msg->get_all_metadata());
1283              
1284             # and re-add the last 2 received lines: usually a good source of
1285 16     16   50 # spamware tokens and HELO names.
1286             if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
1287 16         52 if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1288              
1289             for (@hdrs) {
1290 16         29 next unless /\S/;
  16         101  
1291             my ($hdr, $val) = split(/:/, $_, 2);
1292              
1293 16         35 # remove user-specified headers here, after Received, in case they
1294             # want to ignore that too
1295 16         145 next if exists $user_ignore{lc $hdr};
1296              
1297 202 100       484 # Prep the header value
1298 52         132 $val ||= '';
1299 52         86 chomp($val);
1300              
1301             # special tokenization for some headers:
1302 150 100       1949 if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
1303 102         144 $val = $self->_pre_chew_message_id ($val);
1304 102         242 }
1305             elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
1306 16         130 (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
1307             {
1308             $val = $self->_pre_chew_addr_header ($val);
1309             }
1310 16 100       80 elsif ($hdr eq 'Received') {
  14         42  
1311 16 100       67 $val = $self->_pre_chew_received ($val);
  14         50  
1312             }
1313 16         55 elsif ($hdr eq 'Content-Type') {
1314 194 50       593 $val = $self->_pre_chew_content_type ($val);
1315 194         707 }
1316             elsif ($hdr eq 'MIME-Version') {
1317             $val =~ s/1\.0//; # totally innocuous
1318             }
1319 194 50       528 elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1320             $val = "1"; # just mark the presence, they create lots of hapaxen
1321             }
1322 194   50     368  
1323 194         300 if (MAP_HEADERS_MID) {
1324             if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
1325             $parsed{"*MI"} = $val;
1326 194 100       1377 }
    100          
    100          
    100          
    100          
    50          
1327 16         73 }
1328             if (MAP_HEADERS_FROMTOCC) {
1329             if ($hdr =~ /^(?:From|To|Cc)$/i) {
1330             $parsed{"*Ad"} = $val;
1331             }
1332 60         183 }
1333             if (MAP_HEADERS_USERAGENT) {
1334             if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
1335 28         102 $parsed{"*UA"} = $val;
1336             }
1337             }
1338 4         32  
1339             # replace hdr name with "compressed" version if possible
1340             if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1341 2         9 $hdr = $HEADER_NAME_COMPRESSION{$hdr};
1342             }
1343              
1344 0         0 if (exists $parsed{$hdr}) {
1345             $parsed{$hdr} .= " ".$val;
1346             } else {
1347 194         278 $parsed{$hdr} = $val;
1348 194 100       474 }
1349 16         63 if (would_log('dbg', 'bayes') > 1) {
1350             dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1351             }
1352 194         255 }
1353 194 100       479  
1354 32         94 return %parsed;
1355             }
1356              
1357 194         247 my ($self, $val) = @_;
1358 194 50       388  
1359 0         0 # hopefully this will retain good bits without too many hapaxen
1360             if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1361             my $boundary = $1;
1362             $boundary = '' if !defined $boundary; # avoid a warning
1363             $boundary =~ s/[a-fA-F0-9]/H/gs;
1364 194 100       471 # break up blocks of separator chars so they become their own tokens
1365 126         247 $boundary =~ s/([-_\.=]+)/ $1 /gs;
1366             $val .= $boundary;
1367             }
1368 194 100       395  
1369 26         102 # stop-list words for Content-Type header: these wind up totally gray
1370             $val =~ s/\b(?:text|charset)\b//;
1371 168         477  
1372             $val;
1373 194 50       450 }
1374 0         0  
1375             my ($self, $val) = @_;
1376             # we can (a) get rid of a lot of hapaxen and (b) increase the token
1377             # specificity by pre-parsing some common formats.
1378 16         283  
1379             # Outlook Express format:
1380             $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1381             ([0-9a-f]{4})[0-9a-f]{4}\$
1382 4     4   17 ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1383              
1384             # Exim:
1385 4 50       21 $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1386 0         0  
1387 0 0       0 # Sendmail:
1388 0         0 $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1389             [A-F0-9]{10,12}\@//gx;
1390 0         0  
1391 0         0 # try to split Message-ID segments on probable ID boundaries. Note that
1392             # Outlook message-ids seem to contain a server identifier ID in the last
1393             # 8 bytes before the @. Make sure this becomes its own token, it's a
1394             # great spam-sign for a learning system! Be sure to split on ".".
1395 4         24 $val =~ s/[^_A-Za-z0-9]/ /g;
1396             $val;
1397 4         15 }
1398              
1399             my ($self, $val) = @_;
1400              
1401 16     16   62 # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1402             # and valid-format RFC-822/2822 dates
1403              
1404             $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1405             $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1406 16         53 $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1407             $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1408              
1409             $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
1410             [0-3\s]?[0-9]\s
1411 16         38 (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
1412             (?:19|20)?[0-9]{2}\s
1413             [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1414 16         32 (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
1415             //gx;
1416              
1417             # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1418             # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1419             # (on both sides)
1420             # also make a dup with the full IP, as fodder for
1421 16         116 # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
1422 16         54 $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
1423             if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
1424             $1.$2.$3.$4.
1425             " ip*".$1.$2.$3.$4." ";
1426 28     28   168 } else {
1427             $1.$2.$3.
1428             " ip*".$1.$2.$3.$4." ";
1429             }
1430             }gex;
1431 28         112  
1432 28         83 # trim these: they turn out as the most common tokens, but with a
1433 28         174 # prob of about .5. waste of space!
1434 28         65 $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
1435              
1436 28         252 $val;
1437             }
1438              
1439             my ($self, $val) = @_;
1440             local ($_);
1441              
1442             my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1443             my @toks;
1444             foreach (@addrs) {
1445             push (@toks, $self->_tokenize_mail_addrs ($_));
1446             }
1447             return join (' ', @toks);
1448             }
1449 28         210  
1450 30 50 33     244 my ($self, $addr) = @_;
      33        
1451 0         0  
1452             ($addr =~ /(.+)\@(.+)$/) or return ();
1453             my @toks;
1454 30         371 push(@toks, "U*".$1, "D*".$2);
1455             $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1456             return @toks;
1457             }
1458              
1459              
1460             ###########################################################################
1461 28         293  
1462             # compute the probability that a token is spammish for each token
1463 28         88 my ($self, $tokensdata, $ns, $nn) = @_;
1464             my @probabilities;
1465              
1466             return if !$ns || !$nn;
1467 60     60   157  
1468 60         110 my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1469             if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1470 60         207 $threshold = 10;
1471 60         78 }
1472 60         116 if (!$self->{use_hapaxes}) {
1473 48         111 $threshold = 2;
1474             }
1475 60         309  
1476             foreach my $tokendata (@{$tokensdata}) {
1477             my $s = $tokendata->[1]; # spam count
1478             my $n = $tokendata->[2]; # ham count
1479 86     86   262 my $prob;
1480              
1481 86 50       443 no warnings 'uninitialized'; # treat undef as zero in addition
1482 86         146 if ($s + $n >= $threshold) {
1483 86         368 # ignoring low-freq tokens, also covers the (!$s && !$n) case
1484 86         176  
  86         429  
  76         343  
1485 86         430 # my $ratios = $s / $ns;
1486             # my $ration = $n / $nn;
1487             # $prob = $ratios / ($ration + $ratios);
1488             #
1489             $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1490              
1491             if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1492             # use Robinson's f(x) equation for low-n tokens, instead of just
1493 4     4   21 # ignoring them
1494 4         9 my $robn = $s + $n;
1495             $prob =
1496 4 50 33     34 ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1497             /
1498 4         8 ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1499 4         11 }
1500             }
1501              
1502 4 50       21 # 'log_raw_counts' is used to log the raw data for the Bayes equations
1503 0         0 # during a mass-check, allowing the S and X constants to be optimized
1504             # quickly without requiring re-tokenization of the messages for each
1505             # attempt. There's really no need for this code to be uncommented in
1506 4         10 # normal use, however. It has never been publicly documented, so
  4         20  
1507 1104         1089 # commenting it out is fine. ;)
1508 1104         982 #
1509 1104         1025 ## if ($self->{log_raw_counts}) {
1510             ## $self->{raw_counts} .= " s=$s,n=$n ";
1511 22     22   253 ## }
  22         83  
  22         25493  
1512 1104 100       1494  
1513             push(@probabilities, $prob);
1514             }
1515             return \@probabilities;
1516             }
1517              
1518             # compute the probability that a token is spammish
1519 502         561 my ($self, $token, $ns, $nn, $s, $n) = @_;
1520              
1521 502         465 # we allow the caller to give us the token information, just
1522             # to save a potentially expensive lookup
1523             if (!defined($s) || !defined($n)) {
1524 502         498 ($s, $n, undef) = $self->{store}->tok_get($token);
1525 502         649 }
1526             return if !$s && !$n;
1527              
1528             my $probabilities_ref =
1529             $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1530              
1531             return $probabilities_ref->[0];
1532             }
1533              
1534             ###########################################################################
1535             # If a token is neither hammy nor spammy, return 0.
1536             # For a spammy token, return the minimum number of additional ham messages
1537             # it would have had to appear in to no longer be spammy. Hammy tokens
1538             # are handled similarly. That's what the function does (at the time
1539             # of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1540             # more useful if it returned the number of /additional/ ham messages
1541             # a spammy token would have to appear in to no longer be spammy but I
1542             # fear that might require the solution to a cubic equation, and I
1543 1104         1637 # just don't have the time for that now.
1544              
1545 4         17 my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1546              
1547             return 0 if $ns == 0 && $nn == 0;
1548              
1549             if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
1550 0     0   0 if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1551              
1552             return 0 if $Ns == 0 || $Nn == 0;
1553             return 0 if abs( $prob - 0.5 ) <
1554 0 0 0     0 $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1555 0         0  
1556             my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1557 0 0 0     0 my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1558              
1559 0         0 return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1560             unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1561              
1562 0         0 my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1563             my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1564             my $a = $Nb * ( 1 - $p );
1565             my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1566             my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1567             my $discrim = $b * $b - 4 * $a * $c;
1568             my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1569             my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1570              
1571             # This shouldn't be necessary. Should not be < 1
1572             return $dd_exact < 1 ? 1 : int($dd_exact);
1573             }
1574              
1575             ###########################################################################
1576              
1577 0     0   0 my($self, $journal_only) = @_;
1578              
1579 0 0 0     0 # If we're not already tied, abort.
1580             if (!$self->{store}->db_readable()) {
1581 0         0 dbg("bayes: opportunistic call attempt failed, DB not readable");
1582 0 0       0 return;
  0 0       0  
1583             }
1584 0 0 0     0  
1585 0 0       0 # Is an expire or sync running?
1586             my $running_expire = $self->{store}->get_running_expire_tok();
1587             if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1588 0 0       0 dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1589 0         0 return;
1590             }
1591 0         0  
1592             # handle expiry and syncing
1593             if (!$journal_only && $self->{store}->expiry_due()) {
1594 0         0 dbg("bayes: opportunistic call found expiry due");
1595 0         0  
1596 0         0 # sync will bring the DB R/W as necessary, and the expire will remove
1597 0         0 # the running_expire token, may untie as well.
1598 0         0 $self->{main}->{bayes_scanner}->sync(1,1);
1599 0         0 }
1600 0 0       0 elsif ( $self->{store}->sync_due() ) {
1601 0         0 dbg("bayes: opportunistic call found journal sync due");
1602              
1603             # sync will bring the DB R/W as necessary, may untie as well
1604 0 0       0 $self->{main}->{bayes_scanner}->sync(1,0);
1605              
1606             # We can only remove the running_expire token if we're doing R/W
1607             if ($self->{store}->db_writable()) {
1608             $self->{store}->remove_running_expire_tok();
1609             }
1610 10     10   40 }
1611              
1612             return;
1613 10 50       120 }
1614 0         0  
1615 0         0 ###########################################################################
1616              
1617             my ($self) = @_;
1618              
1619 10         63 my $store;
1620 10 50 33     49 my $module = $self->{conf}->{bayes_store_module};
1621 0         0 if (!$module) {
1622 0         0 $module = 'Mail::SpamAssassin::BayesStore::DBM';
1623             } elsif ($module =~ /^([_A-Za-z0-9:]+)$/) {
1624             $module = untaint_var($module);
1625             } else {
1626 10 50 66     137 die "bayes: invalid module: $module\n";
    50          
1627 0         0 }
1628              
1629             dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1630             undef $self->{store}; # DESTROYs previous object, if any
1631 0         0 eval '
1632             require '.$module.';
1633             $store = '.$module.'->new($self);
1634 0         0 1;
1635             ' or do {
1636             my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1637 0         0 die "bayes: learner_new $module new() failed: $eval_stat\n";
1638             };
1639              
1640 0 0       0 dbg("bayes: learner_new: got store=%s", $store);
1641 0         0 $self->{store} = $store;
1642              
1643             $self;
1644             }
1645 10         35  
1646             ###########################################################################
1647              
1648             my ($self, $pms, $info, $param) = @_;
1649             return "Tokens not available." unless defined $info;
1650              
1651 63     63 1 187 my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
1652              
1653 63         160 my %formats = (
1654 63         235 short => '$t',
1655 63 100       376 Short => 'Token: \"$t\"',
    50          
1656 57         147 compact => '$p-$D--$t',
1657             Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1658 6         44 medium => '$p-$D-$N--$t',
1659             long => '$p-$d--${h}h-${s}s--${a}d--$t',
1660 0         0 Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1661             );
1662              
1663 63         267 my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1664 63         168  
1665             return "Invalid format, must be one of: ".join(",",keys %formats)
1666             unless defined $raw_fmt;
1667              
1668             my $fmt = '"'.$raw_fmt.'"';
1669 63 50       7525 my $amt = $limit < @$info ? $limit : @$info;
1670 0 0       0 return "" unless $amt;
  0         0  
1671 0         0  
1672             my $ns = $pms->{bayes_nspam};
1673             my $nh = $pms->{bayes_nham};
1674 63         446 my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1675 63         148 my $now = time;
1676              
1677 63         215 join ', ', map {
1678             my($t,$prob,$s,$h,$u) = @$_;
1679             my $a = int(($now - $u)/(3600 * 24));
1680             my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1681             my $p = sprintf "%.3f", $prob;
1682             my $n = $s + $h;
1683 0     0 0   my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1684 0 0         my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1685             eval $fmt; ## no critic
1686 0   0       } @{$info}[0..$amt-1];
1687             }
1688 0            
1689             1;