File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Criterion	Covered	Total	%
statement	557	758	73.4
branch	161	350	46.0
condition	45	119	37.8
subroutine	58	71	81.6
pod	11	21	52.3
total	832	1319	63.0

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21
22							=head1 DESCRIPTION
23
24							This is a Bayesian-style probabilistic classifier, using an algorithm based on
25							the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26
27							http://www.paulgraham.com/spam.html
28
29							It also incorporates some other aspects taken from Graham Robinson's webpage
30							on the subject at:
31
32							http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33
34							And the chi-square probability combiner as described here:
35
36							http://www.linuxjournal.com/print.php?sid=6467
37
38							The results are incorporated into SpamAssassin as the BAYES_* rules.
39
40							=head1 METHODS
41
42							=cut
43
44
45							use strict;
46	22			22		141	use warnings;
	22					44
	22					640
47	22			22		107	# use bytes;
	22					39
	22					702
48							use re 'taint';
49	22			22		124
	22					56
	22					1865
50							BEGIN {
51							eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
52	22					114	or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
	22					1353
	22					554
53	22	50		22		75	}
	0					0
	0					0
54
55							use Mail::SpamAssassin;
56	22			22		124	use Mail::SpamAssassin::Plugin;
	22					46
	22					543
57	22			22		108	use Mail::SpamAssassin::PerMsgStatus;
	22					60
	22					659
58	22			22		126	use Mail::SpamAssassin::Logger;
	22					42
	22					599
59	22			22		116	use Mail::SpamAssassin::Util qw(untaint_var);
	22					46
	22					1203
60	22			22		195
	22					38
	22					996
61							# pick ONLY ONE of these combining implementations.
62							use Mail::SpamAssassin::Bayes::CombineChi;
63	22			22		6640	# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
	22					57
	22					12009
64
65							our @ISA = qw(Mail::SpamAssassin::Plugin);
66
67							# Which headers should we scan for tokens? Don't use all of them, as it's easy
68							# to pick up spurious clues from some. What we now do is use all of them
69							# less these well-known headers; that way we can pick up spammers' tracking
70							# headers (which are obviously not well-known in advance!).
71
72							# Received is handled specially
73							our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
74							\|Delivered-To \|Delivery-Date
75							\|(?:X-)?Envelope-To
76							\|X-MIME-Auto[Cc]onverted \|X-Converted-To-Plain-Text
77
78							\|Subject # not worth a tiny gain vs. to db size increase
79
80							# Date: can provide invalid cues if your spam corpus is
81							# older/newer than ham
82							\|Date
83
84							# List headers: ignore. a spamfiltering mailing list will
85							# become a nonspam sign.
86							\|X-List\|(?:X-)?Mailing-List
87							\|(?:X-)?List-(?:Archive\|Help\|Id\|Owner\|Post\|Subscribe
88							\|Unsubscribe\|Host\|Id\|Manager\|Admin\|Comment
89							\|Name\|Url)
90							\|X-Unsub(?:scribe)?
91							\|X-Mailman-Version \|X-Been[Tt]here \|X-Loop
92							\|Mail-Followup-To
93							\|X-eGroups-(?:Return\|From)
94							\|X-MDMailing-List
95							\|X-XEmacs-List
96							\|X-Sympa-To
97
98							# gatewayed through mailing list (thanks to Allen Smith)
99							\|(?:X-)?Resent-(?:From\|To\|Date)
100							\|(?:X-)?Original-(?:From\|To\|Date)
101
102							# Spamfilter/virus-scanner headers: too easy to chain from
103							# these
104							\|X-MailScanner(?:-SpamCheck)?
105							\|X-Spam(?:-(?:Status\|Level\|Flag\|Report\|Hits\|Score\|Checker-Version))?
106							\|X-Antispam \|X-RBL-Warning \|X-Mailscanner
107							\|X-MDaemon-Deliver-To \|X-Virus-Scanned
108							\|X-Mass-Check-Id
109							\|X-Pyzor \|X-DCC-\S{2,25}-Metrics
110							\|X-Filtered-B[Yy] \|X-Scanned-By \|X-Scanner
111							\|X-AP-Spam-(?:Score\|Status) \|X-RIPE-Spam-Status
112							\|X-SpamCop-[^:]+
113							\|X-SMTPD \|(?:X-)?Spam-Apparently-To
114							\|SPAM \|X-Perlmx-Spam
115							\|X-Bogosity
116
117							# some noisy Outlook headers that add no good clues:
118							\|Content-Class \|Thread-(?:Index\|Topic)
119							\|X-Original[Aa]rrival[Tt]ime
120
121							# Annotations from IMAP, POP, and MH:
122							\|(?:X-)?Status \|X-Flags \|X-Keywords \|Replied \|Forwarded
123							\|Lines \|Content-Length
124							\|X-UIDL? \|X-IMAPbase
125
126							# Annotations from Bugzilla
127							\|X-Bugzilla-[^:]+
128
129							# Annotations from VM: (thanks to Allen Smith)
130							\|X-VM-(?:Bookmark\|(?:POP\|IMAP)-Retrieved\|Labels\|Last-Modified
131							\|Summary-Format\|VHeader\|v\d-Data\|Message-Order)
132
133							# Annotations from Gnus:
134							\| X-Gnus-Mail-Source
135							\| Xref
136
137							)}x;
138
139							# Note only the presence of these headers, in order to reduce the
140							# hapaxen they generate.
141							our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
142							\|X-(?:Gnu-?PG\|PGP\|GPG)(?:-Key)?-Fingerprint
143							\|D(?:KIM\|omainKey)-Signature
144							)}ix;
145
146							# tweaks tested as of Nov 18 2002 by jm posted to -devel at
147							# http://sourceforge.net/p/spamassassin/mailman/message/12977556/
148							# for results. The winners are now the default settings.
149							use constant IGNORE_TITLE_CASE => 1;
150	22			22		178	use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
	22					51
	22					1074
151	22			22		113	use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
	22					41
	22					827
152	22			22		135	use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					43
	22					791
153	22			22		111
	22					39
	22					811
154							# tweaks by jm on May 12 2003, see -devel email at
155							# http://sourceforge.net/p/spamassassin/mailman/message/14844556/
156							use constant PRE_CHEW_ADDR_HEADERS => 1;
157	22			22		150	use constant CHEW_BODY_URIS => 1;
	22					40
	22					758
158	22			22		102	use constant CHEW_BODY_MAILADDRS => 1;
	22					40
	22					763
159	22			22		105	use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					37
	22					755
160	22			22		105	use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					44
	22					725
161	22			22		105	use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
	22					41
	22					709
162	22			22		104	use constant IGNORE_MSGID_TOKENS => 0;
	22					37
	22					695
163	22			22		100
	22					38
	22					708
164							# tweaks of 12 March 2004, see bug 2129.
165							use constant DECOMPOSE_BODY_TOKENS => 1;
166	22			22		100	use constant MAP_HEADERS_MID => 1;
	22					41
	22					732
167	22			22		102	use constant MAP_HEADERS_FROMTOCC => 1;
	22					40
	22					758
168	22			22		103	use constant MAP_HEADERS_USERAGENT => 1;
	22					40
	22					705
169	22			22		99
	22					41
	22					794
170							# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
171							use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
172	22			22		111	use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
	22					40
	22					851
173	22			22		126
	22					40
	22					2432
174							# We store header-mined tokens in the db with a "HHeaderName:val" format.
175							# some headers may contain lots of gibberish tokens, so allow a little basic
176							# compression by mapping the header name at least here. these are the headers
177							# which appear with the most frequency in my db. note: this doesn't have to
178							# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
179							# tokens from multiple different headers may impact accuracy, so might as well
180							# avoid this if possible. These are the top ones from my corpus, BTW (jm).
181							our %HEADER_NAME_COMPRESSION = (
182							'Message-Id' => '*m',
183							'Message-ID' => '*M',
184							'Received' => '*r',
185							'User-Agent' => '*u',
186							'References' => '*f',
187							'In-Reply-To' => '*i',
188							'From' => '*F',
189							'Reply-To' => '*R',
190							'Return-Path' => '*p',
191							'Return-path' => '*rp',
192							'X-Mailer' => '*x',
193							'X-Authentication-Warning' => '*a',
194							'Organization' => '*o',
195							'Organisation' => '*o',
196							'Content-Type' => '*ct',
197							'Content-Disposition' => '*cd',
198							'Content-Transfer-Encoding' => '*ce',
199							'x-spam-relays-trusted' => '*RT',
200							'x-spam-relays-untrusted' => '*RU',
201							);
202
203							# How many seconds should the opportunistic_expire lock be valid?
204							our $OPPORTUNISTIC_LOCK_VALID = 300;
205
206							# Should we use the Robinson f(w) equation from
207							# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
208							# It gives better results, in that scores are more likely to distribute
209							# into the <0.5 range for nonspam and >0.5 for spam.
210							use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
211	22			22		143
	22					43
	22					1034
212							# How many of the most significant tokens should we use for the p(w)
213							# calculation?
214							use constant N_SIGNIFICANT_TOKENS => 150;
215	22			22		123
	22					38
	22					1044
216							# How many significant tokens are required for a classifier score to
217							# be considered usable?
218							use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
219	22			22		121
	22					47
	22					1021
220							# How long a token should we hold onto? (note: German speakers typically
221							# will require a longer token than English ones.)
222							use constant MAX_TOKEN_LENGTH => 15;
223	22			22		127
	22					39
	22					154837
224							###########################################################################
225
226							my $class = shift;
227							my ($main) = @_;
228	63			63	1	218
229	63					200	$class = ref($class) \|\| $class;
230							my $self = $class->SUPER::new($main);
231	63		33			420	bless ($self, $class);
232	63					348
233	63					176	$self->{main} = $main;
234							$self->{conf} = $main->{conf};
235	63					218	$self->{use_ignores} = 1;
236	63					158
237	63					180	$self->register_eval_rule("check_bayes");
238							$self;
239	63					263	}
240	63					519
241							my $self = shift;
242							if ($self->{store}) {
243							$self->{store}->untie_db();
244	40			40	1	113	}
245	40	100				171	%{$self} = ();
246	39					259	}
247
248	40					88	###########################################################################
	40					242
249
250							# Plugin hook.
251							# Return this implementation object, for callers that need to know
252							# it. TODO: callers shouldn't need to know it!
253							# used only in test suite to get access to {store}, internal APIs.
254							#
255
256							###########################################################################
257
258	1160			1160	0	1838	# Plugin hook.
259							# Called in the parent process shortly before forking off child processes.
260							my ($self) = @_;
261
262							if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
263							$self->{store}->prefork_init;
264							}
265	0			0	0	0	}
266
267	0	0	0			0	###########################################################################
268	0					0
269							# Plugin hook.
270							# Called in a child process shortly after being spawned.
271							my ($self) = @_;
272
273							if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
274							$self->{store}->spamd_child_init;
275							}
276							}
277	0			0	1	0
278							###########################################################################
279	0	0	0			0
280	0					0	# Plugin hook.
281							my ($self, $pms, $fulltext, $min, $max) = @_;
282
283							return 0 if (!$self->{conf}->{use_learner});
284							return 0 if (!$self->{conf}->{use_bayes} \|\| !$self->{conf}->{use_bayes_rules});
285
286							if (!exists ($pms->{bayes_score})) {
287							my $timer = $self->{main}->time_method("check_bayes");
288	0			0	0	0	$pms->{bayes_score} = $self->scan($pms, $pms->{msg});
289							}
290	0	0				0
291	0	0	0			0	if (defined $pms->{bayes_score} &&
292							($min == 0 \|\| $pms->{bayes_score} > $min) &&
293	0	0				0	($max eq "undef" \|\| $pms->{bayes_score} <= $max))
294	0					0	{
295	0					0	if ($self->{conf}->{detailed_bayes_score}) {
296							$pms->test_log(sprintf ("score: %3.4f, hits: %s",
297							$pms->{bayes_score},
298	0	0	0			0	$pms->{bayes_hits}));
			0
			0
			0
299							}
300							else {
301							$pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
302	0	0				0	}
303							return 1;
304							}
305	0					0
306							return 0;
307							}
308	0					0
309							###########################################################################
310	0					0
311							# Plugin hook.
312							my ($self, $params) = @_;
313	0					0	my $quiet = $params->{quiet};
314
315							# do a sanity check here. Weird things happen if we remain tied
316							# after compiling; for example, spamd will never see that the
317							# number of messages has reached the bayes-scanning threshold.
318							if ($self->{store}->db_readable()) {
319							warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
320	6			6	1	13	$self->{store}->untie_db();
321	6					9	}
322							}
323
324							###########################################################################
325
326	6	50				20	# read configuration items to control bayes behaviour. Called by
327	0	0				0	# BayesStore::read_db_configs().
328	0					0	my ($self) = @_;
329
330							# use of hapaxes. Set on bayes object, since it controls prob
331							# computation.
332							$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
333							}
334							###########################################################################
335
336							my ($self,$PMS) = @_;
337	42			42	0	100
338							return 0 unless $self->{use_ignores};
339
340							my $ig_from = $self->{main}->call_plugins ("check_wb_list",
341	42					160	{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
342							my $ig_to = $self->{main}->call_plugins ("check_wb_list",
343							{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
344
345							my $ignore = $ig_from \|\| $ig_to;
346	0			0	0	0
347							dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
348	0	0				0
349							return $ignore;
350	0					0	}
351
352	0					0	###########################################################################
353
354							# Plugin hook.
355	0		0			0	my ($self, $params) = @_;
356							my $isspam = $params->{isspam};
357	0	0				0	my $msg = $params->{msg};
358							my $id = $params->{id};
359	0					0
360							if (!$self->{conf}->{use_bayes}) { return; }
361
362							my $msgdata = $self->get_body_from_msg ($msg);
363							my $ret;
364
365							eval {
366	10			10	1	28	local $SIG{'__DIE__'}; # do not run user die() traps in here
367	10					27	my $timer = $self->{main}->time_method("b_learn");
368	10					18
369	10					21	my $ok;
370							if ($self->{main}->{learn_to_journal}) {
371	10	50				46	# If we're going to learn to journal, we'll try going r/o first...
	0					0
372							# If that fails for some reason, let's try going r/w. This happens
373	10					82	# if the DB doesn't exist yet.
374	10					26	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
375							} else {
376							$ok = $self->{store}->tie_db_writable();
377	10					76	}
378	10					57
379							if ($ok) {
380	10					24	$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
381	10	100				36
382							if (!$self->{main}->{learn_caller_will_untie}) {
383							$self->{store}->untie_db();
384							}
385	2		33			18	}
386							1;
387	8					63	} or do { # if we died, untie the dbs.
388							my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
389							$self->{store}->untie_db();
390	10	100				36	die "bayes: (in learn) $eval_stat\n";
391	8					50	};
392
393	8	50				50	return $ret;
394	8					59	}
395
396							# this function is trapped by the wrapper above
397	10					97	my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
398	10	50				19	my @msgid = ( $msgid );
399	0	0				0
	0					0
400	0					0	if (!defined $msgid) {
401	0					0	@msgid = $self->get_msgid($msg);
402							}
403
404	10					69	foreach my $msgid_t ( @msgid ) {
405							my $seen = $self->{store}->seen_get ($msgid_t);
406
407							if (defined ($seen)) {
408							if (($seen eq 's' && $isspam) \|\| ($seen eq 'h' && !$isspam)) {
409	8			8		30	dbg("bayes: $msgid_t already learnt correctly, not learning twice");
410	8					31	return 0;
411							} elsif ($seen !~ /^[hs]$/) {
412	8	50				25	warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
413	8					48	} else {
414							# bug 3704: If the message was already learned, don't try learning it again.
415							# this prevents, for instance, manually learning as spam, then autolearning
416	8					36	# as ham, or visa versa.
417	12					73	if ($self->{main}->{learn_no_relearn}) {
418							dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
419	12	100				66	return 0;
420	4	100	66			83	}
		50	33
			66
421	2					17
422	2					10	dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
423
424	0					0	# kluge so that forget() won't untie the db on us ...
425							my $orig = $self->{main}->{learn_caller_will_untie};
426							$self->{main}->{learn_caller_will_untie} = 1;
427
428							my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
429	2	50				13
430	0					0	# reset the value post-forget() ...
431	0					0	$self->{main}->{learn_caller_will_untie} = $orig;
432
433							# forget() gave us a fatal error, so propagate that up
434	2					18	if ($fatal) {
435							dbg("bayes: forget() returned a fatal error, so learn() will too");
436							return;
437	2					13	}
438	2					7	}
439
440	2					18	# we're only going to have seen this once, so stop if it's been
441							# seen already
442							last;
443	2					12	}
444							}
445
446	2	50				11	# Now that we're sure we haven't seen this message before ...
447	0					0	$msgid = $msgid[0];
448	0					0
449							my $msgatime = $msg->receive_date();
450
451							# If the message atime comes back as being more than 1 day in the
452							# future, something's messed up and we should revert to current time as
453							# a safety measure.
454	2					11	#
455							$msgatime = time if ( $msgatime - time > 86400 );
456
457							my $tokens = $self->tokenize($msg, $msgdata);
458
459	6					22	{ my $timer = $self->{main}->time_method('b_count_change');
460							if ($isspam) {
461	6					53	$self->{store}->nspam_nham_change(1, 0);
462							$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
463							} else {
464							$self->{store}->nspam_nham_change(0, 1);
465							$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
466							}
467	6	50				35	}
468
469	6					43	$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
470							$self->{store}->cleanup();
471	6					18
	6					67
472	6	100				31	$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
473	4					61	isspam => $isspam,
474	4					44	msgid => $msgid,
475							msgatime => $msgatime,
476	2					19	});
477	2					19
478							dbg("bayes: learned '$msgid', atime: $msgatime");
479
480							1;
481	6	100				240	}
482	6					61
483							###########################################################################
484	6					120
485							# Plugin hook.
486							my ($self, $params) = @_;
487							my $msg = $params->{msg};
488							my $id = $params->{id};
489
490	6					92	if (!$self->{conf}->{use_bayes}) { return; }
491
492	6					567	my $msgdata = $self->get_body_from_msg ($msg);
493							my $ret;
494
495							# we still tie for writing here, since we write to the seen db
496							# synchronously
497							eval {
498							local $SIG{'__DIE__'}; # do not run user die() traps in here
499	4			4	1	14	my $timer = $self->{main}->time_method("b_learn");
500	4					10
501	4					9	my $ok;
502							if ($self->{main}->{learn_to_journal}) {
503	4	50				21	# If we're going to learn to journal, we'll try going r/o first...
	0					0
504							# If that fails for some reason, let's try going r/w. This happens
505	4					23	# if the DB doesn't exist yet.
506	4					11	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
507							} else {
508							$ok = $self->{store}->tie_db_writable();
509							}
510
511	4					27	if ($ok) {
512	4					23	$ret = $self->_forget_trapped ($msg, $msgdata, $id);
513
514	4					8	if (!$self->{main}->{learn_caller_will_untie}) {
515	4	50				24	$self->{store}->untie_db();
516							}
517							}
518							1;
519	0		0			0	} or do { # if we died, untie the dbs.
520							my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
521	4					32	$self->{store}->untie_db();
522							die "bayes: (in forget) $eval_stat\n";
523							};
524	4	50				17
525	4					25	return $ret;
526							}
527	4	100				26
528	2					17	# this function is trapped by the wrapper above
529							my ($self, $msg, $msgdata, $msgid) = @_;
530							my @msgid = ( $msgid );
531	4					43	my $isspam;
532	4	50				10
533	0	0				0	if (!defined $msgid) {
	0					0
534	0					0	@msgid = $self->get_msgid($msg);
535	0					0	}
536
537							while( $msgid = shift @msgid ) {
538	4					33	my $seen = $self->{store}->seen_get ($msgid);
539
540							if (defined ($seen)) {
541							if ($seen eq 's') {
542							$isspam = 1;
543	4			4		16	} elsif ($seen eq 'h') {
544	4					16	$isspam = 0;
545	4					9	} else {
546							dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
547	4	50				16	return 0;
548	4					21	}
549
550							# messages should only be learned once, so stop if we find a msgid
551	4					24	# which was seen before
552	4					27	last;
553							}
554	4	50				26	else {
555	4	100				25	dbg("bayes: forget: msgid $msgid not learnt, ignored");
		50
556	2					7	}
557							}
558	2					7
559							# This message wasn't learnt before, so return
560	0					0	if (!defined $isspam) {
561	0					0	dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
562							return 0;
563							}
564							elsif ($isspam) {
565							$self->{store}->nspam_nham_change (-1, 0);
566	4					12	}
567							else {
568							$self->{store}->nspam_nham_change (0, -1);
569	0					0	}
570
571							my $tokens = $self->tokenize($msg, $msgdata);
572
573							if ($isspam) {
574	4	50				24	$self->{store}->multi_tok_count_change (-1, 0, $tokens);
		100
575	0					0	} else {
576	0					0	$self->{store}->multi_tok_count_change (0, -1, $tokens);
577							}
578
579	2					13	$self->{store}->seen_delete ($msgid);
580							$self->{store}->cleanup();
581
582	2					16	$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
583							isspam => $isspam,
584							msgid => $msgid,
585	4					39	});
586
587	4	100				25	1;
588	2					80	}
589
590	2					24	###########################################################################
591
592							# Plugin hook.
593	4					198	my ($self, $params) = @_;
594	4					36	if (!$self->{conf}->{use_bayes}) { return 0; }
595							dbg("bayes: bayes journal sync starting");
596	4					59	$self->{store}->sync($params);
597							dbg("bayes: bayes journal sync completed");
598							}
599
600							###########################################################################
601	4					413
602							# Plugin hook.
603							my ($self, $params) = @_;
604							if (!$self->{conf}->{use_bayes}) { return 0; }
605							dbg("bayes: expiry starting");
606							my $timer = $self->{main}->time_method("expire_bayes");
607							$self->{store}->expire_old_tokens($params);
608	2			2	1	8	dbg("bayes: expiry completed");
609	2	50				11	}
	0					0
610	2					15
611	2					37	###########################################################################
612	2					10
613							# Plugin hook.
614							# Check to make sure we can tie() the DB, and we have enough entries to do a scan
615							# if we're told the caller will untie(), go ahead and leave the db tied.
616							my ($self, $params) = @_;
617
618							return 0 unless $self->{conf}->{use_bayes};
619	0			0	1	0	return 0 unless $self->{store}->tie_db_readonly();
620	0	0				0
	0					0
621	0					0	# We need the DB to stay tied, so if the journal sync occurs, don't untie!
622	0					0	my $caller_untie = $self->{main}->{learn_caller_will_untie};
623	0					0	$self->{main}->{learn_caller_will_untie} = 1;
624	0					0
625							# Do a journal sync if necessary. Do this before the nspam_nham_get()
626							# call since the sync may cause an update in the number of messages
627							# learnt.
628							$self->_opportunistic_calls(1);
629
630							# Reset the variable appropriately
631							$self->{main}->{learn_caller_will_untie} = $caller_untie;
632
633	146			146	1	414	my ($ns, $nn) = $self->{store}->nspam_nham_get();
634
635	146	50				543	if ($ns < $self->{conf}->{bayes_min_spam_num}) {
636	146	100				691	dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
637							if (!$self->{main}->{learn_caller_will_untie}) {
638							$self->{store}->untie_db();
639	6					20	}
640	6					25	return 0;
641							}
642							if ($nn < $self->{conf}->{bayes_min_ham_num}) {
643							dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
644							if (!$self->{main}->{learn_caller_will_untie}) {
645	6					29	$self->{store}->untie_db();
646							}
647							return 0;
648	6					19	}
649
650	6					55	return 1;
651							}
652	6	50				34
653	0					0	###########################################################################
654	0	0				0
655	0					0	my ($self, $permsgstatus, $msg) = @_;
656							my $score;
657	0					0
658							return unless $self->{conf}->{use_learner};
659	6	50				22
660	0					0	# When we're doing a scan, we'll guarantee that we'll do the untie,
661	0	0				0	# so override the global setting until we're done.
662	0					0	my $caller_untie = $self->{main}->{learn_caller_will_untie};
663							$self->{main}->{learn_caller_will_untie} = 1;
664	0					0
665							goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
666
667	6					22	goto skip unless $self->learner_is_scan_available();
668
669							my ($ns, $nn) = $self->{store}->nspam_nham_get();
670
671							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
672							## $self->{raw_counts} = " ns=$ns nn=$nn ";
673	4			4	0	21	## }
674	4					11
675							dbg("bayes: corpus size: nspam = $ns, nham = $nn");
676	4	50				17
677							my $msgtokens;
678							{ my $timer = $self->{main}->time_method('b_tokenize');
679							my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
680	4					10	$msgtokens = $self->tokenize($msg, $msgdata);
681	4					12	}
682
683	4	50				31	my $tokensdata;
684							{ my $timer = $self->{main}->time_method('b_tok_get_all');
685	4	50				29	$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
686							}
687	4					18
688							my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
689
690							my $probabilities_ref =
691							$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
692
693	4					51	my %pw;
694							foreach my $tokendata (@{$tokensdata}) {
695	4					11	my $prob = shift(@$probabilities_ref);
696	4					10	next unless defined $prob;
	4					18
697	4					16	my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
698	4					24	$pw{$token} = {
699							prob => $prob,
700							spam_count => $tok_spam,
701	4					13	ham_count => $tok_ham,
702	4					9	atime => $atime
	4					31
703	4					11	};
	4					197
704							}
705
706	4					54	my @pw_keys = keys %pw;
707
708	4					20	# If none of the tokens were found in the DB, we're going to skip
709							# this message...
710							if (!@pw_keys) {
711	4					14	dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
712	4					13	goto skip;
	4					18
713	1104					1149	}
714	1104	100				1583
715	502					477	my $tcount_total = keys %{$msgtokens};
	502					665
716	502					1353	my $tcount_learned = scalar @pw_keys;
717
718							# Figure out the message receive time (used as atime below)
719							# If the message atime comes back as being in the future, something's
720							# messed up and we should revert to current time as a safety measure.
721							#
722							my $msgatime = $msg->receive_date();
723							my $now = time;
724	4					83	$msgatime = $now if ( $msgatime > $now );
725
726							my @touch_tokens;
727							my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
728	4	50				25	my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
729	0					0
730	0					0	my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
731							my $log_each_token = (would_log('dbg', 'bayes') > 1);
732
733	4					8	# now take the most significant tokens and calculate probs using
	4					11
734	4					9	# Robinson's formula.
735
736							@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
737
738							if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
739
740	4					29	my @sorted;
741	4					16	foreach my $tok (@pw_keys) {
742	4	50				16	next if $tok_strength{$tok} <
743							$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
744	4					9
745	4					23	my $pw_tok = $pw{$tok};
746	4					16	my $pw_prob = $pw_tok->{prob};
747
748	4					306	# What's more expensive, scanning headers for HAMMYTOKENS and
749	4					47	# SPAMMYTOKENS tags that aren't there or collecting data that
750							# won't be used? Just collecting the data is certainly simpler.
751							#
752							my $raw_token = $msgtokens->{$tok} \|\| "(unknown)";
753							my $s = $pw_tok->{spam_count};
754	4					31	my $n = $pw_tok->{ham_count};
	2935					3018
755							my $a = $pw_tok->{atime};
756	4	100				17
	2					12
757							push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
758	4					9	[$raw_token, $pw_prob, $s, $n, $a] );
759	4					8
760	414	100				653	push(@sorted, $pw_prob);
761
762							# update the atime on this token, it proved useful
763	282					290	push(@touch_tokens, $tok);
764	282					298
765							if ($log_each_token) {
766							dbg("bayes: token '$raw_token' => $pw_prob");
767							}
768							}
769
770	282		50			599	if (!@sorted \|\| (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
771	282					318	$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
772	282					288	{
773	282					290	dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
774							goto skip;
775	282	100				275	}
	282					806
776
777							$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
778	282					391	undef $timer_compute_prob; # end a timing section
779
780							# Couldn't come up with a probability?
781	282					313	goto skip unless defined $score;
782
783	282	50				602	dbg("bayes: score = $score");
784	0					0
785							# no need to call tok_touch_all unless there were significant
786							# tokens and a score was returned
787							# we don't really care about the return value here
788	4	50	50			34
789							{ my $timer = $self->{main}->time_method('b_tok_touch_all');
790							$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
791	0					0	}
792	0					0
793							my $timer_finish = $self->{main}->time_method('b_finish');
794
795	4					33	$permsgstatus->{bayes_nspam} = $ns;
796	4					13	$permsgstatus->{bayes_nham} = $nn;
797
798							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
799	4	50				10	## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
800							## }
801	4					46
802							$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
803							probsref => \%pw,
804							score => $score,
805							msgatime => $msgatime,
806							significant_tokens => \@touch_tokens,
807	4					7	});
	4					20
808	4					59
809							skip:
810							if (!defined $score) {
811	4					18	dbg("bayes: not scoring message, returning undef");
812							}
813	4					11
814	4					28	undef $timer_compute_prob; # end a timing section if still running
815							if (!defined $timer_finish) {
816							$timer_finish = $self->{main}->time_method('b_finish');
817							}
818
819							# Take any opportunistic actions we can take
820	4					50	if ($self->{main}->{opportunistic_expire_check_only}) {
821							# we're supposed to report on expiry only -- so do the
822							# _opportunistic_calls() run for the journal only.
823							$self->_opportunistic_calls(1);
824							$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
825							}
826							else {
827							$self->_opportunistic_calls();
828	4	50				19	}
829	0					0
830							# Do any cleanup we need to do
831							$self->{store}->cleanup();
832	4					15
833	4	50				12	# Reset the value accordingly
834	4					26	$self->{main}->{learn_caller_will_untie} = $caller_untie;
835
836							# If our caller won't untie the db, we need to do it.
837							if (!$caller_untie) {
838	4	50				16	$self->{store}->untie_db();
839							}
840
841	0					0	$permsgstatus->set_tag ('BAYESTCHAMMY',
842	0					0	($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
843							$permsgstatus->set_tag ('BAYESTCSPAMMY',
844							($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
845	4					20	$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
846							$permsgstatus->set_tag ('BAYESTC', $tcount_total);
847
848							$permsgstatus->set_tag ('HAMMYTOKENS', sub {
849	4					28	my $pms = shift;
850							$self->bayes_report_make_list
851							($pms, $pms->{bayes_token_info_hammy}, shift);
852	4					13	});
853
854							$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
855	4	50				15	my $pms = shift;
856	4					38	$self->bayes_report_make_list
857							($pms, $pms->{bayes_token_info_spammy}, shift);
858							});
859
860	4	50				17	$permsgstatus->set_tag ('TOKENSUMMARY', sub {
	4					26
861							my $pms = shift;
862	4	50				14	if ( defined $pms->{tag_data}{BAYESTC} )
	4					22
863	4					12	{
864	4					13	my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
865							- $pms->{tag_data}{BAYESTCSPAMMY}
866							- $pms->{tag_data}{BAYESTCHAMMY};
867	0			0		0	my $tcount_new = $pms->{tag_data}{BAYESTC}
868							- $pms->{tag_data}{BAYESTCLEARNED};
869	0					0	"Tokens: new, $tcount_new; "
870	4					37	."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
871							."neutral, $tcount_neutral; "
872							."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
873	0			0		0	} else {
874							"Bayes not run.";
875	0					0	}
876	4					21	});
877
878
879	0			0		0	return $score;
880	0	0				0	}
881
882							###########################################################################
883
884	0					0	# Plugin hook.
885							my ($self, $params) = @_;
886	0					0	my $magic = $params->{magic};
887	0					0	my $toks = $params->{toks};
888							my $regex = $params->{regex};
889
890							# allow dump to occur even if use_bayes disables everything else ...
891							#return 0 unless $self->{conf}->{use_bayes};
892	0					0	return 0 unless $self->{store}->tie_db_readonly();
893
894	4					42	my @vars = $self->{store}->get_storage_variables();
895
896							my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
897	4					548
898							my $template = '%3.3f %10u %10u %10u %s'."\n";
899
900							if ( $magic ) {
901							printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
902							or die "Error writing: $!";
903							printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
904	0			0	1	0	or die "Error writing: $!";
905	0					0	printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
906	0					0	or die "Error writing: $!";
907	0					0	printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
908							or die "Error writing: $!";
909							printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
910							or die "Error writing: $!";
911	0	0				0	if ( $bv >= 2 ) {
912							printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
913	0					0	or die "Error writing: $!";
914							}
915	0					0	if ( $bv < 2 ) {
916							printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
917	0					0	or die "Error writing: $!";
918							}
919	0	0				0	if ( $bv >= 2 ) {
920	0	0				0	printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
921							or die "Error writing: $!";
922	0	0				0	}
923							printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
924	0	0				0	or die "Error writing: $!";
925							if ( $bv >= 2 ) {
926	0	0				0	printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
927							or die "Error writing: $!";
928	0	0				0
929							printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
930	0	0				0	or die "Error writing: $!";
931	0	0				0	}
932							}
933
934	0	0				0	if ( $toks ) {
935	0	0				0	# let the store sort out the db_toks
936							$self->{store}->dump_db_toks($template, $regex, @vars);
937							}
938	0	0				0
939	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
940							$self->{store}->untie_db();
941							}
942	0	0				0	return 1;
943							}
944	0	0				0
945	0	0				0	###########################################################################
946							# TODO: these are NOT public, but the test suite needs to call them.
947
948	0	0				0	my ($self, $msg) = @_;
949
950							my @msgid;
951
952							my $msgid = $msg->get_header("Message-Id");
953	0	0				0	if (defined $msgid && $msgid ne '' && $msgid !~ /^\s<\s(?:\@sa_generated)?>.*$/) {
954							# remove \r and < and > prefix/suffixes
955	0					0	chomp $msgid;
956							$msgid =~ s/^<//; $msgid =~ s/>.*$//g;
957							push(@msgid, $msgid);
958	0	0				0	}
959	0					0
960							# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
961	0					0
962							# Use sha1_hex(Date: and top N bytes of body)
963							# where N is MIN(1024 bytes, 1/2 of body length)
964							#
965							my $date = $msg->get_header("Date");
966							$date = "None" if (!defined $date \|\| $date eq ''); # No Date?
967
968	14			14	0	42	#Removed per bug 5185
969							#my @rcvd = $msg->get_header("Received");
970	14					28	#my $rcvd = $rcvd[$#rcvd];
971							#$rcvd = "None" if (!defined $rcvd \|\| $rcvd eq ''); # No Received?
972	14					97
973	14	50	33			315	# Make a copy since pristine_body is a reference ...
			33
974							my $body = join('', $msg->get_pristine_body());
975	14					57
976	14					92	if (length($body) > 64) { # Small Body?
	14					81
977	14					52	my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
978							substr($body, $keep) = '';
979							}
980
981							#Stripping all CR and LF so that testing midstream from MTA and post delivery don't
982							#generate different id's simply because of LF<->CR<->CRLF changes.
983							$body =~ s/[\r\n]//g;
984
985	14					57	unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
986	14	50	33			140
987							return wantarray ? @msgid : $msgid[0];
988							}
989
990							my ($self, $msg) = @_;
991
992							if (!ref $msg) {
993							# I have no idea why this seems to happen. TODO
994	14					96	warn "bayes: msg not a ref: '$msg'";
995							return { };
996	14	50				76	}
997	14	50				61
998	14					44	my $permsgstatus =
999							Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1000							$msg->extract_message_metadata ($permsgstatus);
1001							my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1002							$permsgstatus->finish();
1003	14					249
1004							if (!defined $msgdata) {
1005	14					295	# why?!
1006							warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1007	14	50				111	return { };
1008							}
1009
1010							return $msgdata;
1011	20			20	0	65	}
1012
1013	20	50				86	my ($self, $pms) = @_;
1014
1015	0					0	my $t_src = $self->{conf}->{bayes_token_sources};
1016	0					0	my $msgdata = { };
1017							$msgdata->{bayes_token_body} =
1018							$pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1019							$msgdata->{bayes_token_inviz} =
1020	20					205	$pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1021	20					137	$msgdata->{bayes_mimepart_digests} =
1022	20					97	$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1023	20					113	@{$msgdata->{bayes_token_uris}} =
1024							$pms->get_uri_list() if $t_src->{uri};
1025	20	50				71	return $msgdata;
1026							}
1027	0					0
1028	0					0	###########################################################################
1029
1030							# The calling functions expect a uniq'ed array of tokens ...
1031	20					123	my ($self, $msg, $msgdata) = @_;
1032
1033							my $t_src = $self->{conf}->{bayes_token_sources};
1034
1035	24			24		63	# visible tokens from the body
1036							my @tokens_body;
1037	24					75	if ($msgdata->{bayes_token_body}) {
1038	24					66	foreach (@{$msgdata->{bayes_token_body}}) {
1039							push(@tokens_body, $self->_tokenize_line ($_, '', 1));
1040	24	50				189	last if scalar @tokens_body >= 50000;
1041							}
1042	24	50				158	dbg("bayes: tokenized body: %d tokens", scalar @tokens_body);
1043							}
1044	24	50				84	# the URI list
1045	24					77	my @tokens_uri;
1046	24	50				186	if ($msgdata->{bayes_token_uris}) {
1047	24					79	foreach (@{$msgdata->{bayes_token_uris}}) {
1048							push(@tokens_uri, $self->_tokenize_line ($_, '', 2));
1049							last if scalar @tokens_uri >= 10000;
1050							}
1051							dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri);
1052							}
1053							# add invisible tokens
1054	16			16	0	58	my @tokens_inviz;
1055							if ($msgdata->{bayes_token_inviz}) {
1056	16					60	my $tokprefix;
1057							if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
1058							if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1059	16					36	if (defined $tokprefix) {
1060	16	50				63	foreach (@{$msgdata->{bayes_token_inviz}}) {
1061	16					30	push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1));
	16					82
1062	278					668	last if scalar @tokens_inviz >= 50000;
1063	278	50				1119	}
1064							}
1065	16					136	dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz);
1066							}
1067
1068	16					41	# add digests and Content-Type of all MIME parts
1069	16	50				91	my @tokens_mimepart;
1070	16					39	if ($msgdata->{bayes_mimepart_digests}) {
	16					65
1071	54					135	my %shorthand = ( # some frequent MIME part contents for human readability
1072	54	50				172	'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1073							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1074	16					60	'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1075							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1076							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1077	16					29	'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1078	16	50				57	'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1079	16					27	);
1080	16					26	@tokens_mimepart = map('MIME:' . ($shorthand{$_} \|\| $_),
	16					49
1081	16					33	@{ $msgdata->{bayes_mimepart_digests} });
1082	16	50				60	dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart);
1083	16					31	dbg("bayes: mime-part token %s", $_) for @tokens_mimepart;
	16					56
1084	0					0	}
1085	0	0				0
1086							# Tokenize the headers
1087							my @tokens_header;
1088	16					44	if ($t_src->{header}) {
1089							my %hdrs = $self->_tokenize_headers ($msg);
1090							while( my($prefix, $value) = each %hdrs ) {
1091							push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0));
1092	16					33	last if scalar @tokens_header >= 10000;
1093	16	50				58	}
1094	0					0	dbg("bayes: tokenized header: %d tokens", scalar @tokens_header);
1095							}
1096
1097							# Go ahead and uniq the array, skip null tokens (can happen sometimes)
1098							# generate an SHA1 hash and take the lower 40 bits as our token
1099							my %tokens;
1100							foreach my $token
1101							(@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header)
1102							{
1103							# dbg("bayes: token: %s", $token);
1104	0		0			0	$tokens{substr(sha1($token), -5)} = $token if $token ne '';
	0					0
1105	0					0	}
1106	0					0
1107							# return the keys == tokens ...
1108							return \%tokens;
1109							}
1110	16					34
1111	16	50				70	my $self = $_[0];
1112	16					67	my $tokprefix = $_[2];
1113	16					136	my $region = $_[3];
1114	200					548	local ($_) = $_[1];
1115	200	50				1020
1116							my @rettokens;
1117	16					101
1118							# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1119							# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1120							# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1121
1122	16					56	### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1123	16					77
1124							### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1125							s{ ( [A-Za-z0-9,@*!_'"\$. -]+ \|
1126							[\xC0-\xDF][\x80-\xBF] \|
1127	6714	50				35244	[\xE0-\xEF][\x80-\xBF]{2} \|
1128							[\xF0-\xF4][\x80-\xBF]{3} \|
1129							[\xA1-\xFF] ) \| . }
1130							{ defined $1 ? $1 : ' ' }xsge;
1131	16					879	# should we also turn NBSP ( \xC2\xA0 ) into space?
1132
1133							# DO split on "..." or "--" or "---"; common formatting error resulting in
1134							# hapaxes. Keep the separator itself as a token, though, as long ones can
1135	532			532		701	# be good spamsigns.
1136	532					711	s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1137	532					584	s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1138	532					1044
1139							if (IGNORE_TITLE_CASE) {
1140	532					586	if ($region == 1 \|\| $region == 2) {
1141							# lower-case Title Case at start of a full-stop-delimited line (as would
1142							# be seen in a Western language).
1143							s/(?:^\|\.\s+)([A-Z])([^A-Z]+)(?:\s\|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1144							}
1145							}
1146
1147							my $magic_re = $self->{store}->get_magic_re();
1148
1149	532					1823	# Note that split() in scope of 'use bytes' results in words with utf8 flag
1150							# cleared, even if the source string has perl characters semantics !!!
1151							# Is this really still desirable?
1152
1153	4844	100				14792	foreach my $token (split) {
1154							$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1155							$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1156
1157							# Skip false magic tokens
1158							# TVD: we need to do a defined() check since SQL doesn't have magic
1159	532					1474	# tokens, so the SQL BayesStore returns undef. I really want a way
1160	532					756	# of optimizing that out, but I haven't come up with anything yet.
1161							#
1162	532					552	next if ( defined $magic_re && $token =~ /$magic_re/ );
1163	532	100	100			1388
1164							# do keep 3-byte tokens; there's some solid signs in there
1165							my $len = length($token);
1166	332					1868
	242					2033
1167							# but extend the stop-list. These are squarely in the gray
1168							# area, and it just slows us down to record them.
1169							# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1170	532					1705	#
1171							next if $len < 3 \|\|
1172							($token =~ /^(?:a(?:ble\|l(?:ready\|l)\|n[dy]\|re)\|b(?:ecause\|oth)\|c(?:an\|ome)\|e(?:ach\|mail\|ven)\|f(?:ew\|irst\|or\|rom)\|give\|h(?:a(?:ve\|s)\|ttp)\|i(?:n(?:formation\|to)\|t\'s)\|just\|know\|l(?:ike\|o(?:ng\|ok))\|m(?:a(?:de\|il(?:(?:ing\|to))?\|ke\|ny)\|o(?:re\|st)\|uch)\|n(?:eed\|o[tw]\|umber)\|o(?:ff\|n(?:ly\|e)\|ut\|wn)\|p(?:eople\|lace)\|right\|s(?:ame\|ee\|uch)\|t(?:h(?:at\|is\|rough\|e)\|ime)\|using\|w(?:eb\|h(?:ere\|y)\|ith(?:out)?\|or(?:ld\|k))\|y(?:ears?\|ou(?:(?:\'re\|r))?))$/i);
1173
1174							# are we in the body? If so, apply some body-specific breakouts
1175							if ($region == 1 \|\| $region == 2) {
1176	532					4139	if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1177	8592					15089	push (@rettokens, $self->_tokenize_mail_addrs ($token));
1178	8592					11820	}
1179							elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1180							push (@rettokens, "UD:".$token); # the full token
1181							my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1182							push (@rettokens, "UD:".$1); # UD = URL domain
1183							}
1184							}
1185	8592	50	33			29609	}
1186
1187							# note: do not trim down overlong tokens if they contain '*'. This is
1188	8592					14048	# used as part of split tokens such as "HTo:D*net" indicating that
1189							# the domain ".net" appeared in the To header.
1190							#
1191							if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1192
1193							if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1194	8592	100	100			34595	# Bug 7135
1195							# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1196							my(@t) = $token =~ /( (?: [\xE0-\xEF] \| [\xF0-\xF4][\x80-\xBF] )
1197							[\x80-\xBF]{2} )/xsg;
1198	5230	100	100			11367	if (@t) {
1199	3534	100				7816	push (@rettokens, map($tokprefix.'u8:'.$_, @t));
		100
1200	38					152	next;
1201							}
1202							}
1203	110					306
1204	110					164	if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
	110					472
1205	132					529	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1206							# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1207							# to me! (jm)
1208							while ($token =~ s/^(..?)//) {
1209							push (@rettokens, $tokprefix.'8:'.$1);
1210							}
1211							next;
1212							}
1213
1214	5230	100	100			10121	if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1215							\|\| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1216	250	50				580	\|\| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1217							{
1218							# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1219	0					0	# Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1220							# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1221	0	0				0	# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1222	0					0	# "sk:" stands for "skip".
1223	0					0	# Bug 7141: retain seven UTF-8 chars (or other bytes),
1224							# if followed by at least two bytes
1225							$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] \|
1226							[\xC0-\xDF][\x80-\xBF] \|
1227	250					272	[\xE0-\xEF][\x80-\xBF]{2} \|
1228							[\xF0-\xF4][\x80-\xBF]{3} \| . ){7} ))
1229							.{2,} \z }{sk:$1}xs;
1230							## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1231							}
1232							}
1233
1234							# decompose tokens? do this after shortening long tokens
1235							if ($region == 1 \|\| $region == 2) {
1236							if (DECOMPOSE_BODY_TOKENS) {
1237	250	100	100			949	if ($token =~ /[^\w:\*]/) {
			100
			100
			50
			66
1238							my $decompd = $token; # "Foo!"
1239							$decompd =~ s/[^\w:\*]//gs;
1240							push (@rettokens, $tokprefix.$decompd); # "Foo"
1241							}
1242
1243							if ($token =~ /[A-Z]/) {
1244							my $decompd = $token; $decompd = lc $decompd;
1245							push (@rettokens, $tokprefix.$decompd); # "foo!"
1246
1247							if ($token =~ /[^\w:\*]/) {
1248	232					1478	$decompd =~ s/[^\w:\*]//gs;
1249							push (@rettokens, $tokprefix.$decompd); # "foo"
1250							}
1251							}
1252							}
1253							}
1254
1255							push (@rettokens, $tokprefix.$token);
1256							}
1257
1258	5230	100	100			9781	return @rettokens;
1259	3534					3702	}
1260	3534	100				6427
1261	358					708	my ($self, $msg) = @_;
1262	358					1178
1263	358					990	my %parsed;
1264
1265							my %user_ignore;
1266	3534	100				6969	$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1267	712					1219
	712					1082
1268	712					1670	# get headers in array context
1269							my @hdrs;
1270	712	100				1962	my @rcvdlines;
1271	76					265	for ($msg->get_all_headers()) {
1272	76					240	# first, keep a copy of Received headers, so we can strip down to last 2
1273							if (/^Received:/i) {
1274							push(@rcvdlines, $_);
1275							next;
1276							}
1277							# and now skip lines for headers we don't want (including all Received)
1278	5230					17162	next if /^${IGNORED_HDRS}:/i;
1279							next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1280							push(@hdrs, $_);
1281	532					5352	}
1282							push(@hdrs, $msg->get_all_metadata());
1283
1284							# and re-add the last 2 received lines: usually a good source of
1285	16			16		50	# spamware tokens and HELO names.
1286							if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
1287	16					52	if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1288
1289							for (@hdrs) {
1290	16					29	next unless /\S/;
	16					101
1291							my ($hdr, $val) = split(/:/, $_, 2);
1292
1293	16					35	# remove user-specified headers here, after Received, in case they
1294							# want to ignore that too
1295	16					145	next if exists $user_ignore{lc $hdr};
1296
1297	202	100				484	# Prep the header value
1298	52					132	$val \|\|= '';
1299	52					86	chomp($val);
1300
1301							# special tokenization for some headers:
1302	150	100				1949	if ($hdr =~ /^(?:\|X-\|Resent-)Message-Id$/i) {
1303	102					144	$val = $self->_pre_chew_message_id ($val);
1304	102					242	}
1305							elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:\|X-\|Resent-)
1306	16					130	(?:Return-Path\|From\|To\|Cc\|Reply-To\|Errors-To\|Mail-Followup-To\|Sender)$/ix)
1307							{
1308							$val = $self->_pre_chew_addr_header ($val);
1309							}
1310	16	100				80	elsif ($hdr eq 'Received') {
	14					42
1311	16	100				67	$val = $self->_pre_chew_received ($val);
	14					50
1312							}
1313	16					55	elsif ($hdr eq 'Content-Type') {
1314	194	50				593	$val = $self->_pre_chew_content_type ($val);
1315	194					707	}
1316							elsif ($hdr eq 'MIME-Version') {
1317							$val =~ s/1\.0//; # totally innocuous
1318							}
1319	194	50				528	elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1320							$val = "1"; # just mark the presence, they create lots of hapaxen
1321							}
1322	194		50			368
1323	194					300	if (MAP_HEADERS_MID) {
1324							if ($hdr =~ /^(?:In-Reply-To\|References\|Message-ID)$/i) {
1325							$parsed{"*MI"} = $val;
1326	194	100				1377	}
		100
		100
		100
		100
		50
1327	16					73	}
1328							if (MAP_HEADERS_FROMTOCC) {
1329							if ($hdr =~ /^(?:From\|To\|Cc)$/i) {
1330							$parsed{"*Ad"} = $val;
1331							}
1332	60					183	}
1333							if (MAP_HEADERS_USERAGENT) {
1334							if ($hdr =~ /^(?:X-Mailer\|User-Agent)$/i) {
1335	28					102	$parsed{"*UA"} = $val;
1336							}
1337							}
1338	4					32
1339							# replace hdr name with "compressed" version if possible
1340							if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1341	2					9	$hdr = $HEADER_NAME_COMPRESSION{$hdr};
1342							}
1343
1344	0					0	if (exists $parsed{$hdr}) {
1345							$parsed{$hdr} .= " ".$val;
1346							} else {
1347	194					278	$parsed{$hdr} = $val;
1348	194	100				474	}
1349	16					63	if (would_log('dbg', 'bayes') > 1) {
1350							dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1351							}
1352	194					255	}
1353	194	100				479
1354	32					94	return %parsed;
1355							}
1356
1357	194					247	my ($self, $val) = @_;
1358	194	50				388
1359	0					0	# hopefully this will retain good bits without too many hapaxen
1360							if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1361							my $boundary = $1;
1362							$boundary = '' if !defined $boundary; # avoid a warning
1363							$boundary =~ s/[a-fA-F0-9]/H/gs;
1364	194	100				471	# break up blocks of separator chars so they become their own tokens
1365	126					247	$boundary =~ s/([-_\.=]+)/ $1 /gs;
1366							$val .= $boundary;
1367							}
1368	194	100				395
1369	26					102	# stop-list words for Content-Type header: these wind up totally gray
1370							$val =~ s/\b(?:text\|charset)\b//;
1371	168					477
1372							$val;
1373	194	50				450	}
1374	0					0
1375							my ($self, $val) = @_;
1376							# we can (a) get rid of a lot of hapaxen and (b) increase the token
1377							# specificity by pre-parsing some common formats.
1378	16					283
1379							# Outlook Express format:
1380							$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1381							([0-9a-f]{4})[0-9a-f]{4}\$
1382	4			4		17	([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1383
1384							# Exim:
1385	4	50				21	$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1386	0					0
1387	0	0				0	# Sendmail:
1388	0					0	$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1389							[A-F0-9]{10,12}\@//gx;
1390	0					0
1391	0					0	# try to split Message-ID segments on probable ID boundaries. Note that
1392							# Outlook message-ids seem to contain a server identifier ID in the last
1393							# 8 bytes before the @. Make sure this becomes its own token, it's a
1394							# great spam-sign for a learning system! Be sure to split on ".".
1395	4					24	$val =~ s/[^_A-Za-z0-9]/ /g;
1396							$val;
1397	4					15	}
1398
1399							my ($self, $val) = @_;
1400
1401	16			16		62	# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1402							# and valid-format RFC-822/2822 dates
1403
1404							$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1405							$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1406	16					53	$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1407							$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1408
1409							$val =~ s/(?:(?:Mon\|Tue\|Wed\|Thu\|Fri\|Sat\|Sun),\s)?
1410							[0-3\s]?[0-9]\s
1411	16					38	(?:Jan\|Feb\|Ma[ry]\|Apr\|Ju[nl]\|Aug\|Sep\|Oct\|Nov\|Dec)\s
1412							(?:19\|20)?[0-9]{2}\s
1413							[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1414	16					32	(?:\s$\|$\|\s(?:[+-][0-9]{4})\|\s(?:UT\|[A-Z]{2,3}T))
1415							//gx;
1416
1417							# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1418							# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1419							# (on both sides)
1420							# also make a dup with the full IP, as fodder for
1421	16					116	# bayes_dump_to_trusted_networks: "Hr:ipaaa.bbb.ccc.ddd"
1422	16					54	$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
1423							if ($2 eq '10' \|\| ($2 eq '192' && $3 eq '168')) {
1424							$1.$2.$3.$4.
1425							" ip*".$1.$2.$3.$4." ";
1426	28			28		168	} else {
1427							$1.$2.$3.
1428							" ip*".$1.$2.$3.$4." ";
1429							}
1430							}gex;
1431	28					112
1432	28					83	# trim these: they turn out as the most common tokens, but with a
1433	28					174	# prob of about .5. waste of space!
1434	28					65	$val =~ s/\b(?:with\|from\|for\|SMTP\|ESMTP)\b/ /g;
1435
1436	28					252	$val;
1437							}
1438
1439							my ($self, $val) = @_;
1440							local ($_);
1441
1442							my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1443							my @toks;
1444							foreach (@addrs) {
1445							push (@toks, $self->_tokenize_mail_addrs ($_));
1446							}
1447							return join (' ', @toks);
1448							}
1449	28					210
1450	30	50	33			244	my ($self, $addr) = @_;
			33
1451	0					0
1452							($addr =~ /(.+)\@(.+)$/) or return ();
1453							my @toks;
1454	30					371	push(@toks, "U".$1, "D".$2);
1455							$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1456							return @toks;
1457							}
1458
1459
1460							###########################################################################
1461	28					293
1462							# compute the probability that a token is spammish for each token
1463	28					88	my ($self, $tokensdata, $ns, $nn) = @_;
1464							my @probabilities;
1465
1466							return if !$ns \|\| !$nn;
1467	60			60		157
1468	60					110	my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1469							if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1470	60					207	$threshold = 10;
1471	60					78	}
1472	60					116	if (!$self->{use_hapaxes}) {
1473	48					111	$threshold = 2;
1474							}
1475	60					309
1476							foreach my $tokendata (@{$tokensdata}) {
1477							my $s = $tokendata->[1]; # spam count
1478							my $n = $tokendata->[2]; # ham count
1479	86			86		262	my $prob;
1480
1481	86	50				443	no warnings 'uninitialized'; # treat undef as zero in addition
1482	86					146	if ($s + $n >= $threshold) {
1483	86					368	# ignoring low-freq tokens, also covers the (!$s && !$n) case
1484	86					176
	86					429
	76					343
1485	86					430	# my $ratios = $s / $ns;
1486							# my $ration = $n / $nn;
1487							# $prob = $ratios / ($ration + $ratios);
1488							#
1489							$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1490
1491							if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1492							# use Robinson's f(x) equation for low-n tokens, instead of just
1493	4			4		21	# ignoring them
1494	4					9	my $robn = $s + $n;
1495							$prob =
1496	4	50	33			34	($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1497							/
1498	4					8	($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1499	4					11	}
1500							}
1501
1502	4	50				21	# 'log_raw_counts' is used to log the raw data for the Bayes equations
1503	0					0	# during a mass-check, allowing the S and X constants to be optimized
1504							# quickly without requiring re-tokenization of the messages for each
1505							# attempt. There's really no need for this code to be uncommented in
1506	4					10	# normal use, however. It has never been publicly documented, so
	4					20
1507	1104					1089	# commenting it out is fine. ;)
1508	1104					982	#
1509	1104					1025	## if ($self->{log_raw_counts}) {
1510							## $self->{raw_counts} .= " s=$s,n=$n ";
1511	22			22		253	## }
	22					83
	22					25493
1512	1104	100				1494
1513							push(@probabilities, $prob);
1514							}
1515							return \@probabilities;
1516							}
1517
1518							# compute the probability that a token is spammish
1519	502					561	my ($self, $token, $ns, $nn, $s, $n) = @_;
1520
1521	502					465	# we allow the caller to give us the token information, just
1522							# to save a potentially expensive lookup
1523							if (!defined($s) \|\| !defined($n)) {
1524	502					498	($s, $n, undef) = $self->{store}->tok_get($token);
1525	502					649	}
1526							return if !$s && !$n;
1527
1528							my $probabilities_ref =
1529							$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1530
1531							return $probabilities_ref->[0];
1532							}
1533
1534							###########################################################################
1535							# If a token is neither hammy nor spammy, return 0.
1536							# For a spammy token, return the minimum number of additional ham messages
1537							# it would have had to appear in to no longer be spammy. Hammy tokens
1538							# are handled similarly. That's what the function does (at the time
1539							# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1540							# more useful if it returned the number of /additional/ ham messages
1541							# a spammy token would have to appear in to no longer be spammy but I
1542							# fear that might require the solution to a cubic equation, and I
1543	1104					1637	# just don't have the time for that now.
1544
1545	4					17	my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1546
1547							return 0 if $ns == 0 && $nn == 0;
1548
1549							if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
1550	0			0		0	if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1551
1552							return 0 if $Ns == 0 \|\| $Nn == 0;
1553							return 0 if abs( $prob - 0.5 ) <
1554	0	0	0			0	$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1555	0					0
1556							my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1557	0	0	0			0	my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1558
1559	0					0	return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1560							unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1561
1562	0					0	my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1563							my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1564							my $a = $Nb * ( 1 - $p );
1565							my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1566							my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1567							my $discrim = $b * $b - 4 * $a * $c;
1568							my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1569							my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1570
1571							# This shouldn't be necessary. Should not be < 1
1572							return $dd_exact < 1 ? 1 : int($dd_exact);
1573							}
1574
1575							###########################################################################
1576
1577	0			0		0	my($self, $journal_only) = @_;
1578
1579	0	0	0			0	# If we're not already tied, abort.
1580							if (!$self->{store}->db_readable()) {
1581	0					0	dbg("bayes: opportunistic call attempt failed, DB not readable");
1582	0	0				0	return;
	0	0				0
1583							}
1584	0	0	0			0
1585	0	0				0	# Is an expire or sync running?
1586							my $running_expire = $self->{store}->get_running_expire_tok();
1587							if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1588	0	0				0	dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1589	0					0	return;
1590							}
1591	0					0
1592							# handle expiry and syncing
1593							if (!$journal_only && $self->{store}->expiry_due()) {
1594	0					0	dbg("bayes: opportunistic call found expiry due");
1595	0					0
1596	0					0	# sync will bring the DB R/W as necessary, and the expire will remove
1597	0					0	# the running_expire token, may untie as well.
1598	0					0	$self->{main}->{bayes_scanner}->sync(1,1);
1599	0					0	}
1600	0	0				0	elsif ( $self->{store}->sync_due() ) {
1601	0					0	dbg("bayes: opportunistic call found journal sync due");
1602
1603							# sync will bring the DB R/W as necessary, may untie as well
1604	0	0				0	$self->{main}->{bayes_scanner}->sync(1,0);
1605
1606							# We can only remove the running_expire token if we're doing R/W
1607							if ($self->{store}->db_writable()) {
1608							$self->{store}->remove_running_expire_tok();
1609							}
1610	10			10		40	}
1611
1612							return;
1613	10	50				120	}
1614	0					0
1615	0					0	###########################################################################
1616
1617							my ($self) = @_;
1618
1619	10					63	my $store;
1620	10	50	33			49	my $module = $self->{conf}->{bayes_store_module};
1621	0					0	if (!$module) {
1622	0					0	$module = 'Mail::SpamAssassin::BayesStore::DBM';
1623							} elsif ($module =~ /^([_A-Za-z0-9:]+)$/) {
1624							$module = untaint_var($module);
1625							} else {
1626	10	50	66			137	die "bayes: invalid module: $module\n";
		50
1627	0					0	}
1628
1629							dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1630							undef $self->{store}; # DESTROYs previous object, if any
1631	0					0	eval '
1632							require '.$module.';
1633							$store = '.$module.'->new($self);
1634	0					0	1;
1635							' or do {
1636							my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1637	0					0	die "bayes: learner_new $module new() failed: $eval_stat\n";
1638							};
1639
1640	0	0				0	dbg("bayes: learner_new: got store=%s", $store);
1641	0					0	$self->{store} = $store;
1642
1643							$self;
1644							}
1645	10					35
1646							###########################################################################
1647
1648							my ($self, $pms, $info, $param) = @_;
1649							return "Tokens not available." unless defined $info;
1650
1651	63			63	1	187	my ($limit,$fmt_arg,$more) = split /,/, ($param \|\| '5');
1652
1653	63					160	my %formats = (
1654	63					235	short => '$t',
1655	63	100				376	Short => 'Token: \"$t\"',
		50
1656	57					147	compact => '$p-$D--$t',
1657							Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1658	6					44	medium => '$p-$D-$N--$t',
1659							long => '$p-$d--${h}h-${s}s--${a}d--$t',
1660	0					0	Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1661							);
1662
1663	63					267	my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1664	63					168
1665							return "Invalid format, must be one of: ".join(",",keys %formats)
1666							unless defined $raw_fmt;
1667
1668							my $fmt = '"'.$raw_fmt.'"';
1669	63	50				7525	my $amt = $limit < @$info ? $limit : @$info;
1670	0	0				0	return "" unless $amt;
	0					0
1671	0					0
1672							my $ns = $pms->{bayes_nspam};
1673							my $nh = $pms->{bayes_nham};
1674	63					446	my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1675	63					148	my $now = time;
1676
1677	63					215	join ', ', map {
1678							my($t,$prob,$s,$h,$u) = @$_;
1679							my $a = int(($now - $u)/(3600 * 24));
1680							my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1681							my $p = sprintf "%.3f", $prob;
1682							my $n = $s + $h;
1683	0			0	0		my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1684	0	0					my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1685							eval $fmt; ## no critic
1686	0		0				} @{$info}[0..$amt-1];
1687							}
1688	0
1689							1;