File Coverage

blib/lib/Mail/SpamAssassin/BayesStore/DBM.pm
Criterion Covered Total %
statement 418 1003 41.6
branch 124 390 31.7
condition 51 121 42.1
subroutine 57 74 77.0
pod 33 45 73.3
total 683 1633 41.8


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             package Mail::SpamAssassin::BayesStore::DBM;
19              
20 20     20   209 use strict;
  20         48  
  20         572  
21 20     20   116 use warnings;
  20         50  
  20         525  
22             # use bytes;
23 20     20   118 use re 'taint';
  20         54  
  20         544  
24              
25 20     20   125 use Fcntl;
  20         40  
  20         5226  
26 20     20   153 use Errno qw(EBADF);
  20         45  
  20         830  
27 20     20   133 use File::Basename;
  20         43  
  20         1057  
28 20     20   120 use File::Spec;
  20         38  
  20         549  
29 20     20   127 use File::Path;
  20         50  
  20         1690  
30              
31             BEGIN {
32 20         112 eval { require Digest::SHA; import Digest::SHA qw(sha1); 1 }
  20         501  
  20         431  
33 20 50   20   73 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1) }
  0         0  
  0         0  
34             }
35              
36 20     20   123 use Mail::SpamAssassin;
  20         48  
  20         422  
37 20     20   112 use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows);
  20         56  
  20         883  
38 20     20   6469 use Mail::SpamAssassin::BayesStore;
  20         950  
  20         939  
39 20     20   159 use Mail::SpamAssassin::Logger;
  20         50  
  20         1583  
40              
41 20     20   155 use constant MAGIC_RE => qr/^\015\001\007\011\003/;
  20         57  
  20         153151  
42              
43             our ( @DBNAMES,
44             $NSPAM_MAGIC_TOKEN, $NHAM_MAGIC_TOKEN, $LAST_EXPIRE_MAGIC_TOKEN, $LAST_JOURNAL_SYNC_MAGIC_TOKEN,
45             $NTOKENS_MAGIC_TOKEN, $OLDEST_TOKEN_AGE_MAGIC_TOKEN, $LAST_EXPIRE_REDUCE_MAGIC_TOKEN,
46             $RUNNING_EXPIRE_MAGIC_TOKEN, $DB_VERSION_MAGIC_TOKEN, $LAST_ATIME_DELTA_MAGIC_TOKEN,
47             $NEWEST_TOKEN_AGE_MAGIC_TOKEN
48             );
49              
50             our @ISA = qw( Mail::SpamAssassin::BayesStore );
51              
52             # db layout (quoting Matt):
53             #
54             # > need five db files though to make it real fast:
55             # [probs] 1. ngood and nbad (two entries, so could be a flat file rather
56             # than a db file). (now 2 entries in db_toks)
57             # [toks] 2. good token -> number seen
58             # [toks] 3. bad token -> number seen (both are packed into 1 entry in 1 db)
59             # [probs] 4. Consolidated good token -> probability
60             # [probs] 5. Consolidated bad token -> probability
61             # > As you add new mails, you update the entry in 2 or 3, then regenerate
62             # > the entry for that token in 4 or 5.
63             # > Then as you test a new mail, you just need to pull the probability
64             # > direct from 4 and 5, and generate the overall probability. A simple and
65             # > very fast operation.
66             #
67             # jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam
68             #
69             # update: probs is no longer maintained as a db, to keep on-disk and in-core
70             # usage down.
71             #
72             # also, added a new one to support forgetting, auto-learning, and
73             # auto-forgetting for refiled mails:
74             # [seen] 6. a list of Message-IDs of messages already learnt from. values
75             # are 's' for learnt-as-spam, 'h' for learnt-as-ham.
76             #
77             # and another, called [scancount] to model the scan-count for expiry.
78             # This is not a database. Instead it increases by one byte for each
79             # message scanned (note: scanned, not learned).
80              
81             @DBNAMES = qw(toks seen);
82              
83             # These are the magic tokens we use to track stuff in the DB.
84             # The format is '^M^A^G^I^C' followed by any string you want.
85             # None of the control chars will be in a real token.
86             $DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION";
87             $LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
88             $LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
89             $LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
90             $LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
91             $NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
92             $NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
93             $NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
94             $NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
95             $OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
96             $RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
97              
98             sub HAS_DBM_MODULE {
99 135     135 0 320 my ($self) = @_;
100 135 100       454 if (exists($self->{has_dbm_module})) {
101 79         326 return $self->{has_dbm_module};
102             }
103 56         107 $self->{has_dbm_module} = eval { require DB_File; };
  56         11686  
104             }
105              
106             sub DBM_MODULE {
107 135     135 0 618 return "DB_File";
108             }
109              
110             # Possible file extensions used by the kinds of database files DB_File
111             # might create. We need these so we can create a new file and rename
112             # it into place.
113             sub DB_EXTENSIONS {
114 0     0 0 0 return ('', '.db');
115             }
116              
117             ###########################################################################
118              
119             sub new {
120 62     62 1 232 my $class = shift;
121 62   33     1075 $class = ref($class) || $class;
122              
123 62         568 my $self = $class->SUPER::new(@_);
124              
125 62         574 $self->{supported_db_version} = 3;
126              
127 62         2257 $self->{already_tied} = 0;
128 62         193 $self->{is_locked} = 0;
129 62         503 $self->{string_to_journal} = '';
130              
131 62         1333 $self;
132             }
133              
134             ###########################################################################
135              
136             sub tie_db_readonly {
137 147     147 1 377 my ($self) = @_;
138              
139 147 100       624 if (!$self->HAS_DBM_MODULE) {
140 133         530 dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE);
141 133         813 return 0;
142             }
143              
144             # return if we've already tied to the db's, using the same mode
145             # (locked/unlocked) as before.
146 14 100 66     101 return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
147              
148 12         28 my $main = $self->{bayes}->{main};
149 12 50       70 if (!defined($main->{conf}->{bayes_path})) {
150 0         0 dbg("bayes: bayes_path not defined");
151 0         0 return 0;
152             }
153              
154 12         82 $self->read_db_configs();
155              
156 12         61 my $path = $main->sed_path($main->{conf}->{bayes_path});
157              
158 12         36 my $found = 0;
159 12         51 for my $ext ($self->DB_EXTENSIONS) {
160 20 100       428 if (-f $path.'_toks'.$ext) {
161 4         16 $found = 1;
162 4         18 last;
163             }
164             }
165              
166 12 100       52 if (!$found) {
167 8         54 dbg("bayes: no dbs present, cannot tie DB R/O: %s", $path.'_toks');
168 8         89 return 0;
169             }
170              
171 4         29 foreach my $dbname (@DBNAMES) {
172 8         37 my $name = $path.'_'.$dbname;
173 8         24 my $db_var = 'db_'.$dbname;
174 8         42 dbg("bayes: tie-ing to DB file R/O $name");
175              
176             # Bug 6901, [rt.cpan.org #83060]
177             # DB_File: Repeated tie to the same hash with no untie causes corruption
178 8         26 untie %{$self->{$db_var}}; # has no effect if the variable is not tied
  8         52  
179              
180 8 100       20 if (!tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDONLY,
  8         42  
181             (oct($main->{conf}->{bayes_file_mode}) & 0666))
182             {
183             # bug 2975: it's acceptable for the db_seen to not be present,
184             # to allow it to be recycled. if that's the case, just create
185             # a new, empty one. we don't need to lock it, since we won't
186             # be writing to it; let the R/W api deal with that case.
187              
188 1 50       19 if ($dbname eq 'seen') {
189             # Bug 6901, [rt.cpan.org #83060]
190 1         2 untie %{$self->{$db_var}}; # has no effect if the variable is not tied
  1         4  
191 1         13 tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
192 1 50       7 (oct($main->{conf}->{bayes_file_mode}) & 0666)
193             or goto failed_to_tie;
194             }
195             else {
196 0         0 goto failed_to_tie;
197             }
198             }
199             }
200              
201 4         48 $self->{db_version} = ($self->get_storage_variables())[6];
202 4         22 dbg("bayes: found bayes db version %s", $self->{db_version});
203              
204             # If the DB version is one we don't understand, abort!
205 4 50       43 if ($self->_check_db_version() != 0) {
206 0         0 warn("bayes: bayes db version ".$self->{db_version}." is not able to be used, aborting!");
207 0         0 $self->untie_db();
208 0         0 return 0;
209             }
210              
211 4         17 $self->{already_tied} = 1;
212 4         25 return 1;
213              
214 0         0 failed_to_tie:
215             warn "bayes: cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
216 0         0 foreach my $dbname (@DBNAMES) {
217 0         0 my $db_var = 'db_'.$dbname;
218 0 0       0 next unless exists $self->{$db_var};
219 0         0 dbg("bayes: untie-ing DB file $dbname");
220 0         0 untie %{$self->{$db_var}};
  0         0  
221             }
222              
223 0         0 return 0;
224             }
225              
226             # tie() to the databases, read-write and locked. Any callers of
227             # this should ensure they call untie_db() afterwards!
228             #
229             sub tie_db_writable {
230 30     30 1 90 my ($self) = @_;
231              
232 30 100       117 if (!$self->HAS_DBM_MODULE) {
233 2         9 dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE);
234 2         7 return 0;
235             }
236              
237             # Useful shortcut ...
238 28         94 my $main = $self->{bayes}->{main};
239              
240             # if we've already tied the db's using the same mode
241             # (locked/unlocked) as we want now, freshen the lock and return.
242 28 100 66     123 if ($self->{already_tied} && $self->{is_locked} == 1) {
243 2         14 $main->{locker}->refresh_lock($self->{locked_file});
244 2         9 return 1;
245             }
246              
247 26 50       124 if (!defined($main->{conf}->{bayes_path})) {
248 0         0 dbg("bayes: bayes_path not defined");
249 0         0 return 0;
250             }
251              
252 26         155 $self->read_db_configs();
253              
254 26         97 my $path = $main->sed_path($main->{conf}->{bayes_path});
255              
256 26         71 my $found = 0;
257 26         91 for my $ext ($self->DB_EXTENSIONS) {
258 30 100       592 if (-f $path.'_toks'.$ext) {
259 22         74 $found = 1;
260 22         53 last;
261             }
262             }
263              
264 26         1338 my $parentdir = dirname($path);
265 26 50       475 if (!-d $parentdir) {
266             # run in an eval(); if mkpath has no perms, it calls die()
267 0         0 eval {
268 0         0 mkpath($parentdir, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777));
269             };
270             }
271              
272 26         71 my $tout;
273 26 50       112 if ($main->{learn_wait_for_lock}) {
274 0         0 $tout = 300; # TODO: Dan to write better lock code
275             } else {
276 26         64 $tout = 10;
277             }
278 26 50       191 if ($main->{locker}->safe_lock($path, $tout, $main->{conf}->{bayes_file_mode}))
279             {
280 26         78 $self->{locked_file} = $path;
281 26         60 $self->{is_locked} = 1;
282             } else {
283 0         0 warn "bayes: cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
284 0         0 return 0;
285             }
286              
287 26         156 my $umask = umask 0;
288 26         106 foreach my $dbname (@DBNAMES) {
289 52         175 my $name = $path.'_'.$dbname;
290 52         96 my $db_var = 'db_'.$dbname;
291 52         208 dbg("bayes: tie-ing to DB file R/W $name");
292              
293 52 50       163 ($self->DBM_MODULE eq 'DB_File') and
294             Mail::SpamAssassin::Util::avoid_db_file_locking_bug ($name);
295              
296             # Bug 6901, [rt.cpan.org #83060]
297 52         91 untie %{$self->{$db_var}}; # has no effect if the variable is not tied
  52         241  
298 52         149 tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
299 52 50       81 (oct($main->{conf}->{bayes_file_mode}) & 0666)
300             or goto failed_to_tie;
301             }
302 26         172 umask $umask;
303              
304             # set our cache to what version DB we're using
305 26         135 $self->{db_version} = ($self->get_storage_variables())[6];
306             # don't bother printing this unless found since it would be bogus anyway
307 26 100       171 dbg("bayes: found bayes db version %s", $self->{db_version}) if $found;
308              
309             # figure out if we can read the current DB and if we need to do a
310             # DB version update and do it if necessary if either has a problem,
311             # fail immediately
312             #
313 26 50 66     150 if ($found && !$self->_upgrade_db()) {
    100          
314 0         0 $self->untie_db();
315 0         0 return 0;
316             }
317             elsif (!$found) { # new DB, make sure we know that ...
318 4         91 $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
319 4         80 $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
320 4         25 dbg("bayes: new db, set db version %s and 0 tokens", $self->{db_version});
321             }
322              
323 26         61 $self->{already_tied} = 1;
324 26         177 return 1;
325              
326 0         0 failed_to_tie:
327             my $err = $!;
328 0         0 umask $umask;
329              
330 0         0 foreach my $dbname (@DBNAMES) {
331 0         0 my $db_var = 'db_'.$dbname;
332 0 0       0 next unless exists $self->{$db_var};
333 0         0 dbg("bayes: untie-ing DB file $dbname");
334 0         0 untie %{$self->{$db_var}};
  0         0  
335             }
336              
337 0 0       0 if ($self->{is_locked}) {
338 0         0 $self->{bayes}->{main}->{locker}->safe_unlock($self->{locked_file});
339 0         0 $self->{is_locked} = 0;
340             }
341 0         0 warn "bayes: cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
342 0         0 return 0;
343             }
344              
345             # Do we understand how to deal with this DB version?
346             sub _check_db_version {
347 26     26   61 my ($self) = @_;
348              
349             # return -1 if older, 0 if current, 1 if newer
350 26         112 return $self->{db_version} <=> $self->DB_VERSION;
351             }
352              
353             # Check to see if we need to upgrade the DB, and do so if necessary
354             sub _upgrade_db {
355 22     22   61 my ($self) = @_;
356              
357 22         135 my $verschk = $self->_check_db_version();
358 22         50 my $res = 0; # used later on for tie() checks
359 22         39 my $umask; # used later for umask modifications
360              
361             # If the DB is the latest version, no problem.
362 22 50       164 return 1 if ($verschk == 0);
363              
364             # If the DB is a newer version that we know what to do with ... abort!
365 0 0       0 if ($verschk == 1) {
366 0         0 warn("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
367 0         0 return 0;
368             }
369              
370             # If the current DB version is lower than the new version, upgrade!
371             # Do conversions in order so we can go 1 -> 3, make sure to update
372             # $self->{db_version} along the way
373              
374 0         0 dbg("bayes: detected bayes db format %s, upgrading", $self->{db_version});
375              
376             # since DB_File will not shrink a database (!!), we need to *create*
377             # a new one instead.
378 0         0 my $main = $self->{bayes}->{main};
379 0         0 my $path = $main->sed_path($main->{conf}->{bayes_path});
380 0         0 my $name = $path.'_toks';
381              
382             # older version's journal files are likely not in the same format as the new ones, so remove it.
383 0         0 my $jpath = $self->_get_journal_filename();
384 0 0       0 if (-f $jpath) {
385 0         0 dbg("bayes: old journal file found, removing");
386 0 0       0 warn "bayes: couldn't remove $jpath: $!" if (!unlink $jpath);
387             }
388              
389 0 0       0 if ($self->{db_version} < 2) {
390 0         0 dbg("bayes: upgrading database format from v%s to v2", $self->{db_version});
391 0         0 $self->set_running_expire_tok();
392              
393 0         0 my ($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
394 0         0 my ($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
395              
396             # Magic tokens for version 0, defined as '**[A-Z]+'
397 0 0       0 if ($self->{db_version} == 0) {
398 0         0 $DB_NSPAM_MAGIC_TOKEN = '**NSPAM';
399 0         0 $DB_NHAM_MAGIC_TOKEN = '**NHAM';
400 0         0 $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
401             #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
402             #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
403             #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
404             #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE';
405             }
406             else {
407 0         0 $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
408 0         0 $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
409 0         0 $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
410             #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
411             #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
412             #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
413             #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
414             }
415              
416             # remember when we started ...
417 0         0 my $started = time;
418 0         0 my $newatime = $started;
419              
420             # use O_EXCL to avoid races (bonus paranoia, since we should be locked
421             # anyway)
422 0         0 my %new_toks;
423 0         0 $umask = umask 0;
424              
425             $res = tie %new_toks, $self->DBM_MODULE, "${name}.new",
426             O_RDWR|O_CREAT|O_EXCL,
427 0         0 (oct($main->{conf}->{bayes_file_mode}) & 0666);
428 0         0 umask $umask;
429 0 0       0 return 0 unless $res;
430 0         0 undef $res;
431              
432             # add the magic tokens to the new db.
433 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
434 0         0 $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
435 0         0 $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
436 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
437 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
438 0         0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
439 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
440 0         0 $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
441 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
442 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
443              
444             # deal with the data tokens
445 0         0 my ($tok, $packed);
446 0         0 my $count = 0;
447 0         0 while (($tok, $packed) = each %{$self->{db_toks}}) {
  0         0  
448 0 0       0 next if ($tok =~ /^(?:\*\*[A-Z]+$|\015\001\007\011\003)/); # skip magic tokens
449              
450 0         0 my ($ts, $th, $atime) = $self->tok_unpack($packed);
451 0         0 $new_toks{$tok} = $self->tok_pack($ts, $th, $newatime);
452              
453             # Refresh the lock every so often...
454 0 0       0 if (($count++ % 1000) == 0) {
455 0         0 $self->set_running_expire_tok();
456             }
457             }
458              
459              
460             # now untie so we can do renames
461 0         0 untie %{$self->{db_toks}};
  0         0  
462 0         0 untie %new_toks;
463              
464             # This is the critical phase (moving files around), so don't allow
465             # it to be interrupted.
466 0         0 local $SIG{'INT'} = 'IGNORE';
467 0         0 local $SIG{'TERM'} = 'IGNORE';
468 0 0       0 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
469              
470             # older versions used scancount, so kill the stupid little file ...
471 0         0 my $msgc = $path.'_msgcount';
472 0 0       0 if (-f $msgc) {
473 0         0 dbg("bayes: old msgcount file found, removing");
474 0 0       0 if (!unlink $msgc) {
475 0         0 warn "bayes: couldn't remove $msgc: $!";
476             }
477             }
478              
479             # now rename in the new one. Try several extensions
480 0         0 for my $ext ($self->DB_EXTENSIONS) {
481 0         0 my $newf = $name.'.new'.$ext;
482 0         0 my $oldf = $name.$ext;
483 0 0       0 next unless (-f $newf);
484 0 0       0 if (!rename ($newf, $oldf)) {
485 0         0 warn "bayes: rename $newf to $oldf failed: $!\n";
486 0         0 return 0;
487             }
488             }
489              
490             # re-tie to the new db in read-write mode ...
491 0         0 $umask = umask 0;
492             # Bug 6901, [rt.cpan.org #83060]
493 0         0 untie %{$self->{db_toks}}; # has no effect if the variable is not tied
  0         0  
494 0         0 $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
495 0         0 (oct($main->{conf}->{bayes_file_mode}) & 0666);
496 0         0 umask $umask;
497 0 0       0 return 0 unless $res;
498 0         0 undef $res;
499              
500             dbg("bayes: upgraded database format from v%s to v2 in %d seconds",
501 0         0 $self->{db_version}, time - $started);
502 0         0 $self->{db_version} = 2; # need this for other functions which check
503             }
504              
505             # Version 3 of the database converts all existing tokens to SHA1 hashes
506 0 0       0 if ($self->{db_version} == 2) {
507 0         0 dbg("bayes: upgrading database format from v%s to v3", $self->{db_version});
508 0         0 $self->set_running_expire_tok();
509              
510 0         0 my $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
511 0         0 my $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
512 0         0 my $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
513 0         0 my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
514 0         0 my $DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
515 0         0 my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
516 0         0 my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
517 0         0 my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
518 0         0 my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
519              
520             # remember when we started ...
521 0         0 my $started = time;
522              
523             # use O_EXCL to avoid races (bonus paranoia, since we should be locked
524             # anyway)
525 0         0 my %new_toks;
526 0         0 $umask = umask 0;
527             $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL,
528 0         0 (oct($main->{conf}->{bayes_file_mode}) & 0666);
529 0         0 umask $umask;
530 0 0       0 return 0 unless $res;
531 0         0 undef $res;
532              
533             # add the magic tokens to the new db.
534 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
535 0         0 $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
536 0         0 $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
537 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file
538 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN};
539 0         0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN};
540 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN};
541 0         0 $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN};
542 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN};
543 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
544              
545             # deal with the data tokens
546 0         0 my $count = 0;
547 0         0 while (my ($tok, $packed) = each %{$self->{db_toks}}) {
  0         0  
548 0 0       0 next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens
549 0         0 my $tok_hash = substr(sha1($tok), -5);
550 0         0 $new_toks{$tok_hash} = $packed;
551              
552             # Refresh the lock every so often...
553 0 0       0 if (($count++ % 1000) == 0) {
554 0         0 $self->set_running_expire_tok();
555             }
556             }
557              
558             # now untie so we can do renames
559 0         0 untie %{$self->{db_toks}};
  0         0  
560 0         0 untie %new_toks;
561              
562             # This is the critical phase (moving files around), so don't allow
563             # it to be interrupted.
564 0         0 local $SIG{'INT'} = 'IGNORE';
565 0         0 local $SIG{'TERM'} = 'IGNORE';
566 0 0       0 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
567              
568             # now rename in the new one. Try several extensions
569 0         0 for my $ext ($self->DB_EXTENSIONS) {
570 0         0 my $newf = $name.'.new'.$ext;
571 0         0 my $oldf = $name.$ext;
572 0 0       0 next unless (-f $newf);
573 0 0       0 if (!rename($newf, $oldf)) {
574 0         0 warn "bayes: rename $newf to $oldf failed: $!\n";
575 0         0 return 0;
576             }
577             }
578              
579             # re-tie to the new db in read-write mode ...
580 0         0 $umask = umask 0;
581             # Bug 6901, [rt.cpan.org #83060]
582 0         0 untie %{$self->{db_toks}}; # has no effect if the variable is not tied
  0         0  
583 0         0 $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT,
584 0         0 (oct ($main->{conf}->{bayes_file_mode}) & 0666);
585 0         0 umask $umask;
586 0 0       0 return 0 unless $res;
587 0         0 undef $res;
588              
589             dbg("bayes: upgraded database format from v%s to v3 in %d seconds",
590 0         0 $self->{db_version}, time - $started);
591              
592 0         0 $self->{db_version} = 3; # need this for other functions which check
593             }
594              
595             # if ($self->{db_version} == 3) {
596             # ...
597             # $self->{db_version} = 4; # need this for other functions which check
598             # }
599             # ... and so on.
600              
601 0         0 return 1;
602             }
603              
604             ###########################################################################
605              
606             sub untie_db {
607 69     69 1 185 my $self = shift;
608              
609 69 100       347 return if (!$self->{already_tied});
610              
611 30         130 dbg("bayes: untie-ing");
612              
613 30         108 foreach my $dbname (@DBNAMES) {
614 60         160 my $db_var = 'db_'.$dbname;
615              
616 60 100       176 if (exists $self->{$db_var}) {
617             # dbg("bayes: untie-ing $db_var");
618 56         80 untie %{$self->{$db_var}};
  56         939  
619 56         237 delete $self->{$db_var};
620             }
621             }
622              
623 30 100       95 if ($self->{is_locked}) {
624 26         98 dbg("bayes: files locked, now unlocking lock");
625 26         167 $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file});
626 26         80 $self->{is_locked} = 0;
627             }
628              
629 30         109 $self->{already_tied} = 0;
630 30         99 $self->{db_version} = undef;
631             }
632              
633             ###########################################################################
634              
635             sub calculate_expire_delta {
636 0     0 1 0 my ($self, $newest_atime, $start, $max_expire_mult) = @_;
637              
638 0         0 my %delta; # use a hash since an array is going to be very sparse
639              
640             # do the first pass, figure out atime delta
641 0         0 my ($tok, $packed);
642 0         0 while (($tok, $packed) = each %{$self->{db_toks}}) {
  0         0  
643 0 0       0 next if ($tok =~ MAGIC_RE); # skip magic tokens
644            
645 0         0 my ($ts, $th, $atime) = $self->tok_unpack ($packed);
646              
647             # Go through from $start * 1 to $start * 512, mark how many tokens
648             # we would expire
649 0         0 my $token_age = $newest_atime - $atime;
650 0         0 for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
651 0 0       0 if ($token_age >= $start * $i) {
652 0         0 $delta{$i}++;
653             }
654             else {
655             # If the token age is less than the expire delta, it'll be
656             # less for all upcoming checks too, so abort early.
657 0         0 last;
658             }
659             }
660             }
661 0         0 return %delta;
662             }
663              
664             ###########################################################################
665              
666             sub token_expiration {
667 0     0 1 0 my ($self, $opts, $newdelta, @vars) = @_;
668              
669 0         0 my $deleted = 0;
670 0         0 my $kept = 0;
671 0         0 my $num_hapaxes = 0;
672 0         0 my $num_lowfreq = 0;
673              
674             # since DB_File will not shrink a database (!!), we need to *create*
675             # a new one instead.
676 0         0 my $main = $self->{bayes}->{main};
677 0         0 my $path = $main->sed_path($main->{conf}->{bayes_path});
678              
679             # use a temporary PID-based suffix just in case another one was
680             # created previously by an interrupted expire
681 0         0 my $tmpsuffix = "expire$$";
682 0         0 my $tmpdbname = $path.'_toks.'.$tmpsuffix;
683              
684             # clean out any leftover db copies from previous runs
685 0         0 for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
  0         0  
686              
687             # use O_EXCL to avoid races (bonus paranoia, since we should be locked
688             # anyway)
689 0         0 my %new_toks;
690 0         0 my $umask = umask 0;
691             tie %new_toks, $self->DBM_MODULE, $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
692 0         0 (oct ($main->{conf}->{bayes_file_mode}) & 0666);
693 0         0 umask $umask;
694 0         0 my $oldest;
695              
696 0         0 my $showdots = $opts->{showdots};
697 0 0       0 if ($showdots) { print STDERR "\n"; }
  0         0  
698              
699             # We've chosen a new atime delta if we've gotten here, so record it
700             # for posterity.
701 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
702              
703             # Figure out how old is too old...
704 0         0 my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
705              
706             # Go ahead and do the move to new db/expire run now ...
707 0         0 my ($tok, $packed);
708 0         0 while (($tok, $packed) = each %{$self->{db_toks}}) {
  0         0  
709 0 0       0 next if ($tok =~ MAGIC_RE); # skip magic tokens
710              
711 0         0 my ($ts, $th, $atime) = $self->tok_unpack ($packed);
712              
713 0 0       0 if ($atime < $too_old) {
714 0         0 $deleted++;
715             }
716             else {
717             # if token atime > newest, reset to newest ...
718 0 0       0 if ($atime > $vars[10]) {
719 0         0 $atime = $vars[10];
720             }
721              
722 0         0 $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
  0         0  
723 0 0 0     0 if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
  0         0  
724 0 0 0     0 if ($ts + $th == 1) {
    0          
725 0         0 $num_hapaxes++;
726             } elsif ($ts < 8 && $th < 8) {
727 0         0 $num_lowfreq++;
728             }
729             }
730              
731 0 0       0 if ((($kept + $deleted) % 1000) == 0) {
732 0 0       0 if ($showdots) { print STDERR "."; }
  0         0  
733 0         0 $self->set_running_expire_tok();
734             }
735             }
736              
737             # and add the magic tokens. don't add the expire_running token.
738 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
739              
740             # We haven't changed messages of each type seen, so just copy over.
741 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1];
742 0         0 $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2];
743              
744             # We magically haven't removed the newest token, so just copy that value over.
745 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10];
746              
747             # The rest of these have been modified, so replace as necessary.
748 0         0 $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
749 0         0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
750 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
751 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
752              
753             # Sanity check: if we expired too many tokens, abort!
754 0 0       0 if ($kept < 100000) {
755 0         0 dbg("bayes: token expiration would expire too many tokens, aborting");
756             # set the magic tokens appropriately
757             # make sure the next expire run does a first pass
758 0         0 $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
759 0         0 $self->{db_toks}->{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
760 0         0 $self->{db_toks}->{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
761              
762             # remove the new DB
763 0         0 untie %new_toks;
764 0         0 for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
  0         0  
765              
766             # reset the results for the return
767 0         0 $kept = $vars[3];
768 0         0 $deleted = 0;
769 0         0 $num_hapaxes = 0;
770 0         0 $num_lowfreq = 0;
771             }
772             else {
773             # now untie so we can do renames
774 0         0 untie %{$self->{db_toks}};
  0         0  
775 0         0 untie %new_toks;
776              
777             # This is the critical phase (moving files around), so don't allow
778             # it to be interrupted. Scope the signal changes.
779             {
780 0         0 local $SIG{'INT'} = 'IGNORE';
  0         0  
781 0         0 local $SIG{'TERM'} = 'IGNORE';
782 0 0       0 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
783              
784             # now rename in the new one. Try several extensions
785 0         0 for my $ext ($self->DB_EXTENSIONS) {
786 0         0 my $newf = $tmpdbname.$ext;
787 0         0 my $oldf = $path.'_toks'.$ext;
788 0 0       0 next unless (-f $newf);
789 0 0       0 if (!rename ($newf, $oldf)) {
790 0         0 warn "bayes: rename $newf to $oldf failed: $!\n";
791             }
792             }
793             }
794             }
795              
796             # Call untie_db() so we unlock correctly.
797 0         0 $self->untie_db();
798              
799 0         0 return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
800             }
801              
802             ###########################################################################
803              
804             # Is a sync due?
805             sub sync_due {
806 10     10 1 31 my ($self) = @_;
807              
808             # don't bother doing old db versions
809 10 50       53 return 0 if ($self->{db_version} < $self->DB_VERSION);
810              
811 10         30 my $conf = $self->{bayes}->{main}->{conf};
812 10 50       37 return 0 if ($conf->{bayes_journal_max_size} == 0);
813              
814 10         34 my @vars = $self->get_storage_variables();
815 10         53 dbg("bayes: DB journal sync: last sync: %s", $vars[7]);
816              
817             ## Ok, should we do a sync?
818              
819             # Not if the journal file doesn't exist, it's not a file, or it's 0
820             # bytes long.
821 10 100 66     50 return 0 unless (stat($self->_get_journal_filename()) && -f _);
822              
823             # Yes if the file size is larger than the specified maximum size.
824 4 50       291 return 1 if (-s _ > $conf->{bayes_journal_max_size});
825              
826             # Yes there has been a sync before, and if it's been at least a day
827             # since that sync.
828 4 50 33     208 return 1 if (($vars[7] > 0) && (time - $vars[7] > 86400));
829              
830             # No, I guess not.
831 4         26 return 0;
832             }
833              
834             ###########################################################################
835             # db_seen reading APIs
836              
837             sub seen_get {
838 24     24 1 86 my ($self, $msgid) = @_;
839 24         487 $self->{db_seen}->{$msgid};
840             }
841              
842             sub seen_put {
843 6     6 1 39 my ($self, $msgid, $seen) = @_;
844              
845 6 100       33 if ($self->{bayes}->{main}->{learn_to_journal}) {
846 2         16 $self->defer_update ("m $seen $msgid");
847             }
848             else {
849 4         25 $self->_seen_put_direct($msgid, $seen);
850             }
851             }
852             sub _seen_put_direct {
853 6     6   32 my ($self, $msgid, $seen) = @_;
854 6         213 $self->{db_seen}->{$msgid} = $seen;
855             }
856              
857             sub seen_delete {
858 4     4 1 24 my ($self, $msgid) = @_;
859              
860 4 50       24 if ($self->{bayes}->{main}->{learn_to_journal}) {
861 0         0 $self->defer_update ("m f $msgid");
862             }
863             else {
864 4         24 $self->_seen_delete_direct($msgid);
865             }
866             }
867             sub _seen_delete_direct {
868 4     4   19 my ($self, $msgid) = @_;
869 4         114 delete $self->{db_seen}->{$msgid};
870             }
871              
872             ###########################################################################
873             # db reading APIs
874              
875             sub tok_get {
876 3878     3878 1 7084 my ($self, $tok) = @_;
877 3878         11517 $self->tok_unpack ($self->{db_toks}->{$tok});
878             }
879            
880             sub tok_get_all {
881 6     6 1 269 my ($self, @tokens) = @_;
882              
883 6         35 my @tokensdata;
884 6         30 foreach my $token (@tokens) {
885 1658         5056 my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token});
886 1658         6471 push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]);
887             }
888 6         88 return \@tokensdata;
889             }
890              
891             # return the magic tokens in a specific order:
892             # 0: scan count base
893             # 1: number of spam
894             # 2: number of ham
895             # 3: number of tokens in db
896             # 4: last expire atime
897             # 5: oldest token in db atime
898             # 6: db version value
899             # 7: last journal sync
900             # 8: last atime delta
901             # 9: last expire reduction count
902             # 10: newest token in db atime
903             #
904             sub get_storage_variables {
905 64     64 1 151 my ($self) = @_;
906 64         110 my @values;
907              
908 64         1219 my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
909              
910 64 100 66     602 if (!$db_ver || $db_ver =~ /\D/) { $db_ver = 0; }
  4         12  
911              
912 64 100       221 if ($db_ver >= 2) {
    50          
    0          
913 60         139 my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA";
914 60         131 my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
915 60         131 my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
916 60         106 my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC";
917 60         99 my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE";
918 60         108 my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
919 60         93 my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
920 60         97 my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
921 60         110 my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
922 60         102 my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE";
923              
924             @values = (
925             0,
926             $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
927             $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
928             $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
929             $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
930             $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
931             $db_ver,
932             $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
933             $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
934             $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
935 60         4474 $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
936             );
937             }
938             elsif ($db_ver == 0) {
939 4         20 my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
940 4         18 my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
941 4         17 my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
942 4         7 my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
943 4         9 my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
944 4         17 my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
945              
946             @values = (
947             $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
948             $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
949             $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
950             $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
951             $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
952 4         75 $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
953             0,
954             0,
955             0,
956             0,
957             0,
958             );
959             }
960             elsif ($db_ver == 1) {
961 0         0 my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM";
962 0         0 my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM";
963 0         0 my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE";
964 0         0 my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE";
965 0         0 my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS";
966 0         0 my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE";
967              
968             @values = (
969             $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
970             $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
971             $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
972             $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
973             $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
974 0         0 $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
975             1,
976             0,
977             0,
978             0,
979             0,
980             );
981             }
982              
983 64         401 foreach (@values) {
984 704 100 66     1966 if (!$_ || $_ =~ /\D/) {
985 414         739 $_ = 0;
986             }
987             }
988              
989 64         420 return @values;
990             }
991              
992             sub dump_db_toks {
993 0     0 1 0 my ($self, $template, $regex, @vars) = @_;
994              
995 0         0 while (my ($tok, $tokvalue) = each %{$self->{db_toks}}) {
  0         0  
996 0 0       0 next if ($tok =~ MAGIC_RE); # skip magic tokens
997 0 0 0     0 next if (defined $regex && ($tok !~ /$regex/o));
998              
999             # We have the value already, so just unpack it.
1000 0         0 my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue);
1001            
1002 0         0 my $prob = $self->{bayes}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
1003 0   0     0 $prob ||= 0.5;
1004            
1005 0         0 my $encoded_tok = unpack("H*",$tok);
1006 0         0 printf $template,$prob,$ts,$th,$atime,$encoded_tok;
1007             }
1008             }
1009              
1010             sub set_last_expire {
1011 0     0 1 0 my ($self, $time) = @_;
1012 0         0 $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
1013             }
1014              
1015             ## Don't bother using get_magic_tokens here. This token should only
1016             ## ever exist when we're running expire, so we don't want to convert it if
1017             ## it's there and we're not expiring ...
1018             sub get_running_expire_tok {
1019 10     10 1 31 my ($self) = @_;
1020 10         201 my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
1021 10 50 33     77 if (!$running || $running =~ /\D/) { return; }
  10         41  
1022 0         0 return $running;
1023             }
1024              
1025             sub set_running_expire_tok {
1026 2     2 1 8 my ($self) = @_;
1027              
1028             # update the lock and running expire magic token
1029 2         17 $self->{bayes}->{main}->{locker}->refresh_lock ($self->{locked_file});
1030 2         55 $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
1031             }
1032              
1033             sub remove_running_expire_tok {
1034 0     0 1 0 my ($self) = @_;
1035 0         0 delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
1036             }
1037              
1038             ###########################################################################
1039              
1040             # db abstraction: allow deferred writes, since we will be frequently
1041             # writing while checking.
1042              
1043             sub tok_count_change {
1044 0     0 1 0 my ($self, $ds, $dh, $tok, $atime) = @_;
1045              
1046 0 0       0 $atime = 0 unless defined $atime;
1047              
1048 0 0       0 if ($self->{bayes}->{main}->{learn_to_journal}) {
1049             # we can't store the SHA1 binary value in the journal, so convert it
1050             # to a printable value that can be converted back later
1051 0         0 my $encoded_tok = unpack("H*",$tok);
1052 0         0 $self->defer_update ("c $ds $dh $atime $encoded_tok");
1053             } else {
1054 0         0 $self->tok_sync_counters ($ds, $dh, $atime, $tok);
1055             }
1056             }
1057              
1058             sub multi_tok_count_change {
1059 10     10 1 49 my ($self, $ds, $dh, $tokens, $atime) = @_;
1060              
1061 10 100       49 $atime = 0 unless defined $atime;
1062              
1063 10         23 foreach my $tok (keys %{$tokens}) {
  10         343  
1064 2770 100       9010 if ($self->{bayes}->{main}->{learn_to_journal}) {
1065             # we can't store the SHA1 binary value in the journal, so convert it
1066             # to a printable value that can be converted back later
1067 554         1018 my $encoded_tok = unpack("H*",$tok);
1068 554         1856 $self->defer_update ("c $ds $dh $atime $encoded_tok");
1069             } else {
1070 2216         5421 $self->tok_sync_counters ($ds, $dh, $atime, $tok);
1071             }
1072             }
1073             }
1074              
1075             sub nspam_nham_get {
1076 10     10 1 36 my ($self) = @_;
1077 10         33 my @vars = $self->get_storage_variables();
1078 10         60 ($vars[1], $vars[2]);
1079             }
1080              
1081             sub nspam_nham_change {
1082 10     10 1 35 my ($self, $ds, $dh) = @_;
1083              
1084 10 100       43 if ($self->{bayes}->{main}->{learn_to_journal}) {
1085 2         73 $self->defer_update ("n $ds $dh");
1086             } else {
1087 8         39 $self->tok_sync_nspam_nham ($ds, $dh);
1088             }
1089             }
1090              
1091             sub tok_touch {
1092 0     0 1 0 my ($self, $tok, $atime) = @_;
1093             # we can't store the SHA1 binary value in the journal, so convert it
1094             # to a printable value that can be converted back later
1095 0         0 my $encoded_tok = unpack("H*", $tok);
1096 0         0 $self->defer_update ("t $atime $encoded_tok");
1097             }
1098              
1099             sub tok_touch_all {
1100 4     4 1 20 my ($self, $tokens, $atime) = @_;
1101              
1102 4         18 foreach my $token (@{$tokens}) {
  4         23  
1103             # we can't store the SHA1 binary value in the journal, so convert it
1104             # to a printable value that can be converted back later
1105 282         590 my $encoded_tok = unpack("H*", $token);
1106 282         776 $self->defer_update ("t $atime $encoded_tok");
1107             }
1108             }
1109              
1110             sub defer_update {
1111 840     840 0 1922 my ($self, $str) = @_;
1112 840         2628 $self->{string_to_journal} .= "$str\n";
1113             }
1114              
1115             ###########################################################################
1116              
1117             sub cleanup {
1118 14     14 1 51 my ($self) = @_;
1119              
1120 14         68 my $nbytes = length ($self->{string_to_journal});
1121 14 100       75 return if ($nbytes == 0);
1122              
1123 6         50 my $path = $self->_get_journal_filename();
1124              
1125             # use append mode, write atomically, then close, so simultaneous updates are
1126             # not lost
1127 6         32 my $conf = $self->{bayes}->{main}->{conf};
1128              
1129             # set the umask to the inverse of what we want ...
1130 6         58 my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666));
1131              
1132 6 50       581 if (!open (OUT, ">>".$path)) {
1133 0         0 warn "bayes: cannot write to $path, bayes db update ignored: $!\n";
1134 0         0 umask $umask; # reset umask
1135 0         0 return;
1136             }
1137 6         51 umask $umask; # reset umask
1138              
1139             # do not use print() here, it will break up the buffer if it's >8192 bytes,
1140             # which could result in two sets of tokens getting mixed up and their
1141             # touches missed.
1142 6         28 my $write_failure = 0;
1143 6         26 my $original_point = tell OUT;
1144 6 50       32 $original_point >= 0 or die "Can't obtain file position: $!";
1145 6         16 my $len;
1146 6         12 do {
1147 6         244 $len = syswrite (OUT, $self->{string_to_journal}, $nbytes);
1148              
1149             # argh, write failure, give up
1150 6 50 33     107 if (!defined $len || $len < 0) {
1151 0         0 my $err = '';
1152 0 0       0 if (!defined $len) {
1153 0         0 $len = 0;
1154 0         0 $err = " ($!)";
1155             }
1156 0         0 warn "bayes: write failed to Bayes journal $path ($len of $nbytes)!$err\n";
1157 0         0 last;
1158             }
1159              
1160             # This shouldn't happen, but could if the fs is full...
1161 6 50       94 if ($len != $nbytes) {
1162 0         0 warn "bayes: partial write to bayes journal $path ($len of $nbytes), recovering\n";
1163              
1164             # we want to be atomic, so revert the journal file back to where
1165             # we know it's "good". if we can't truncate the journal, or we've
1166             # tried 5 times to do the write, abort!
1167 0 0 0     0 if (!truncate(OUT, $original_point) || ($write_failure++ > 4)) {
1168 0         0 warn "bayes: cannot write to bayes journal $path, aborting!\n";
1169 0         0 last;
1170             }
1171              
1172             # if the fs is full, let's give the system a break
1173 0         0 sleep 1;
1174             }
1175             } while ($len != $nbytes);
1176              
1177 6 50       103 if (!close OUT) {
1178 0         0 warn "bayes: cannot write to $path, bayes db update ignored\n";
1179             }
1180              
1181 6         46 $self->{string_to_journal} = '';
1182             }
1183              
1184             # Return a qr'd RE to match a token with the correct format's magic token
1185             sub get_magic_re {
1186 532     532 1 1011 my ($self) = @_;
1187              
1188 532 50 66     2029 if (!defined $self->{db_version} || $self->{db_version} >= 1) {
1189 532         1410 return MAGIC_RE;
1190             }
1191              
1192             # When in doubt, assume v0
1193 0         0 return qr/^\*\*[A-Z]+$/;
1194             }
1195              
1196             # provide a more generalized public interface into the journal sync
1197              
1198             sub sync {
1199 2     2 1 7 my ($self, $opts) = @_;
1200              
1201 2         36 return $self->_sync_journal($opts);
1202             }
1203              
1204             ###########################################################################
1205             # And this method reads the journal and applies the changes in one
1206             # (locked) transaction.
1207              
1208             sub _sync_journal {
1209 2     2   11 my ($self, $opts) = @_;
1210 2         10 my $ret = 0;
1211              
1212 2         11 my $path = $self->_get_journal_filename();
1213              
1214             # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
1215 2 50 33     85 if (!stat($path) || !-f _ || -z _) {
      33        
1216 0         0 return 0;
1217             }
1218              
1219 2         12 my $eval_stat;
1220             eval {
1221 2         13 local $SIG{'__DIE__'}; # do not run user die() traps in here
1222 2 50       18 if ($self->tie_db_writable()) {
1223 2         32 $ret = $self->_sync_journal_trapped($opts, $path);
1224             }
1225 2         17 1;
1226 2 50       6 } or do {
1227 0 0       0 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
1228             };
1229              
1230             # ok, untie from write-mode if we can
1231 2 50       18 if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
1232 2         15 $self->untie_db();
1233             }
1234              
1235             # handle any errors that may have occurred
1236 2 50       19 if (defined $eval_stat) {
1237 0         0 warn "bayes: $eval_stat\n";
1238 0         0 return 0;
1239             }
1240              
1241 2         12 $ret;
1242             }
1243              
1244             sub _sync_journal_trapped {
1245 2     2   17 my ($self, $opts, $path) = @_;
1246              
1247             # Flag that we're doing work
1248 2         21 $self->set_running_expire_tok();
1249              
1250 2         12 my $started = time();
1251 2         7 my $count = 0;
1252 2         6 my $total_count = 0;
1253 2         9 my %tokens;
1254 2         9 my $showdots = $opts->{showdots};
1255 2         9 my $retirepath = $path.".old";
1256              
1257             # if $path doesn't exist, or it's not a file, or is 0 bytes in length,
1258             # return we have to check again since the file may have been removed
1259             # by a recent bayes db upgrade ...
1260 2 50 33     94 if (!stat($path) || !-f _ || -z _) {
      33        
1261 0         0 return 0;
1262             }
1263              
1264 2 50       39 if (!-r $path) { # will we be able to read the file?
1265 0         0 warn "bayes: bad permissions on journal, can't read: $path\n";
1266 0         0 return 0;
1267             }
1268              
1269             # This is the critical phase (moving files around), so don't allow
1270             # it to be interrupted.
1271             {
1272 2         5 local $SIG{'INT'} = 'IGNORE';
  2         62  
1273 2         57 local $SIG{'TERM'} = 'IGNORE';
1274 2 50       17 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
1275              
1276             # retire the journal, so we can update the db files from it in peace.
1277             # TODO: use locking here
1278 2 50       125 if (!rename ($path, $retirepath)) {
1279 0         0 warn "bayes: failed rename $path to $retirepath\n";
1280 0         0 return 0;
1281             }
1282              
1283             # now read the retired journal
1284 2         10 local *JOURNAL;
1285 2 50       80 if (!open (JOURNAL, "<$retirepath")) {
1286 0         0 warn "bayes: cannot open read $retirepath\n";
1287 0         0 return 0;
1288             }
1289              
1290              
1291             # Read the journal
1292 2         48 for ($!=0; defined($_=<JOURNAL>); $!=0) {
1293 558         1035 $total_count++;
1294              
1295 558 50       3969 if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries
    100          
    100          
    50          
1296 0         0 my $tok = pack("H*",$2);
1297 0 0 0     0 $tokens{$tok} = $1+0 if (!exists $tokens{$tok} || $1+0 > $tokens{$tok});
1298             } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update
1299 554         2189 my $tok = pack("H*",$4);
1300 554         3367 $self->tok_sync_counters ($1+0, $2+0, $3+0, $tok);
1301 554         4438 $count++;
1302             } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
1303 2         25 $self->tok_sync_nspam_nham ($1+0, $2+0);
1304 2         23 $count++;
1305             } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
1306 2 50       26 if ($1 eq "f") {
1307 0         0 $self->_seen_delete_direct($2);
1308             }
1309             else {
1310 2         25 $self->_seen_put_direct($2,$1);
1311             }
1312 2         35 $count++;
1313             } else {
1314 0         0 warn "bayes: gibberish entry found in journal: $_";
1315             }
1316             }
1317 2 0 33     49 defined $_ || $!==0 or
    50          
1318             $!==EBADF ? dbg("bayes: error reading journal file: $!")
1319             : die "error reading journal file: $!";
1320 2 50       45 close(JOURNAL) or die "Can't close journal file: $!";
1321              
1322             # Now that we've determined what tokens we need to update and their
1323             # final values, update the DB. Should be much smaller than the full
1324             # journal entries.
1325 2         21 while (my ($k,$v) = each %tokens) {
1326 0         0 $self->tok_touch_token ($v, $k);
1327              
1328 0 0       0 if ((++$count % 1000) == 0) {
1329 0 0       0 if ($showdots) { print STDERR "."; }
  0         0  
1330 0         0 $self->set_running_expire_tok();
1331             }
1332             }
1333              
1334 2 50       16 if ($showdots) { print STDERR "\n"; }
  0         0  
1335              
1336             # we're all done, so unlink the old journal file
1337 2 50       181 unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n";
1338              
1339 2         56 $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
1340              
1341 2         13 my $done = time();
1342 2         23 my $msg = ("bayes: synced databases from journal in " .
1343             ($done - $started) .
1344             " seconds: $count unique entries ($total_count total entries)");
1345              
1346 2 50       27 if ($opts->{verbose}) {
1347 0         0 print $msg,"\n";
1348             } else {
1349 2         22 dbg($msg);
1350             }
1351             }
1352              
1353             # else, that's the lot, we're synced. return
1354 2         25 return 1;
1355             }
1356              
1357             sub tok_touch_token {
1358 0     0 0 0 my ($self, $atime, $tok) = @_;
1359 0         0 my ($ts, $th, $oldatime) = $self->tok_get ($tok);
1360              
1361             # If the new atime is < the old atime, ignore the update
1362             # We figure that we'll never want to lower a token atime, so abort if
1363             # we try. (journal out of sync, etc.)
1364 0 0       0 return if ($oldatime >= $atime);
1365              
1366 0         0 $self->tok_put ($tok, $ts, $th, $atime);
1367             }
1368              
1369             sub tok_sync_counters {
1370 2770     2770 0 7841 my ($self, $ds, $dh, $atime, $tok) = @_;
1371 2770         6759 my ($ts, $th, $oldatime) = $self->tok_get ($tok);
1372 2770 50       6869 $ts += $ds; if ($ts < 0) { $ts = 0; }
  2770         5785  
  0         0  
1373 2770 50       4067 $th += $dh; if ($th < 0) { $th = 0; }
  2770         5151  
  0         0  
1374              
1375             # Don't roll the atime of tokens backwards ...
1376 2770 100       6669 $atime = $oldatime if ($oldatime > $atime);
1377              
1378 2770         6621 $self->tok_put ($tok, $ts, $th, $atime);
1379             }
1380              
1381             sub tok_put {
1382 2770     2770 0 7390 my ($self, $tok, $ts, $th, $atime) = @_;
1383 2770   100     8951 $ts ||= 0;
1384 2770   100     8761 $th ||= 0;
1385              
1386             # Ignore magic tokens, the don't go in this way ...
1387 2770 50       7141 return if ($tok =~ MAGIC_RE);
1388              
1389             # use defined() rather than exists(); the latter is not supported
1390             # by NDBM_File, believe it or not. Using defined() did not
1391             # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
1392 2770         12717 my $exists_already = defined $self->{db_toks}->{$tok};
1393              
1394 2770 100 100     10274 if ($ts == 0 && $th == 0) {
1395 1108 50       2255 return if (!$exists_already); # If the token doesn't exist, just return
1396 1108         23898 $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
1397 1108         23679 delete $self->{db_toks}->{$tok};
1398             } else {
1399 1662 50       3342 if (!$exists_already) { # If the token doesn't exist, raise the token count
1400 1662         34609 $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
1401             }
1402              
1403 1662         7131 $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
1404              
1405 1662         17874 my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
1406 1662 100 66     10496 if (!defined ($newmagic) || $atime > $newmagic) {
1407 4         64 $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
1408             }
1409              
1410             # Make sure to check for either !defined or "" ... Apparently
1411             # sometimes the DB module doesn't return the value correctly. :(
1412 1662         16982 my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
1413 1662 100 66     17362 if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
      66        
1414 4         82 $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
1415             }
1416             }
1417             }
1418              
1419             sub tok_sync_nspam_nham {
1420 10     10 0 36 my ($self, $ds, $dh) = @_;
1421 10         41 my ($ns, $nh) = ($self->get_storage_variables())[1,2];
1422 10 100       49 if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
  6 50       17  
  10         34  
  0         0  
1423 10 100       37 if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
  4 50       11  
  10         40  
  0         0  
1424 10         307 $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
1425 10         233 $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
1426             }
1427              
1428             ###########################################################################
1429              
1430             sub _get_journal_filename {
1431 18     18   56 my ($self) = @_;
1432              
1433 18         52 my $main = $self->{bayes}->{main};
1434 18         134 return $main->sed_path($main->{conf}->{bayes_path}."_journal");
1435             }
1436              
1437             ###########################################################################
1438              
1439             # this is called directly from sa-learn(1).
1440             sub perform_upgrade {
1441 0     0 1 0 my ($self, $opts) = @_;
1442 0         0 my $ret = 0;
1443              
1444 0         0 my $eval_stat;
1445             eval {
1446 0         0 local $SIG{'__DIE__'}; # do not run user die() traps in here
1447              
1448 20     20   245 use File::Basename;
  20         583  
  20         51084  
1449              
1450             # bayes directory
1451 0         0 my $main = $self->{bayes}->{main};
1452 0         0 my $path = $main->sed_path($main->{conf}->{bayes_path});
1453              
1454             # prevent dirname() from tainting the result, it assumes $1 is not tainted
1455 0         0 local($1,$2,$3); # Bug 6310; perl #67962 (fixed in perl 5.12/5.13)
1456 0         0 my $dir = dirname($path);
1457              
1458             # make temporary copy since old dbm and new dbm may have same name
1459 0 0       0 opendir(DIR, $dir) or die "bayes: can't opendir $dir: $!";
1460 0         0 my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
  0         0  
1461 0 0       0 closedir(DIR) or die "bayes: can't close directory $dir: $!";
1462 0 0 0     0 if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files))
      0        
1463             {
1464 0         0 die "bayes: unable to find bayes_toks and bayes_seen, stopping\n";
1465             }
1466             # untaint @files (already safe after grep)
1467 0         0 untaint_var(\@files);
1468            
1469 0         0 for (@files) {
1470 0         0 my $src = "$dir/$_";
1471 0         0 my $dst = "$dir/old_$_";
1472 0 0       0 eval q{
1473             use File::Copy;
1474             copy($src, $dst);
1475             } || die "bayes: can't copy $src to $dst: $!\n";
1476             }
1477              
1478             # delete previous to make way for import
1479 0         0 for (@files) { unlink("$dir/$_"); }
  0         0  
1480              
1481             # import
1482 0 0       0 if ($self->tie_db_writable()) {
1483             $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen",
1484 0         0 $self->{db_seen});
1485             $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks",
1486 0         0 $self->{db_toks});
1487             }
1488              
1489 0 0       0 if ($ret == 2) {
1490 0         0 print "import successful, original files saved with \"old\" prefix\n";
1491             }
1492             else {
1493 0         0 print "import failed, original files saved with \"old\" prefix\n";
1494             }
1495 0         0 1;
1496 0 0       0 } or do {
1497 0 0       0 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
1498             };
1499              
1500 0         0 $self->untie_db();
1501              
1502             # if we died, untie the dbm files
1503 0 0       0 if (defined $eval_stat) {
1504 0         0 warn "bayes: perform_upgrade: $eval_stat\n";
1505 0         0 return 0;
1506             }
1507 0         0 $ret;
1508             }
1509              
1510             sub upgrade_old_dbm_files_trapped {
1511 0     0 0 0 my ($self, $filename, $output) = @_;
1512              
1513 0         0 my $count;
1514             my %in;
1515              
1516 0         0 print "upgrading to DB_File, please be patient: $filename\n";
1517              
1518             # try each type of file until we find one with > 0 entries
1519 0         0 for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
1520 0         0 $count = 0;
1521             # wrap in eval so it doesn't run in general use. This accesses db
1522             # modules directly.
1523             # Note: (bug 2390), the 'use' needs to be on the same line as the eval
1524             # for RPM dependency checks to work properly. It's lame, but...
1525 0         0 my $eval_stat;
1526             eval 'use ' . $dbm . ';
1527             tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
1528             %{ $output } = %in;
1529             $count = scalar keys %{ $output };
1530             untie %in;
1531             1;
1532 0 0       0 ' or do {
1533 0 0       0 $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
1534             };
1535 0 0       0 if (defined $eval_stat) {
    0          
1536 0         0 print "$dbm: $dbm module not installed(?), nothing copied: $eval_stat\n";
1537 0         0 dbg("bayes: error was: $eval_stat");
1538             }
1539             elsif ($count == 0) {
1540 0         0 print "$dbm: no database of that kind found, nothing copied\n";
1541             }
1542             else {
1543 0         0 print "$dbm: copied $count entries\n";
1544 0         0 return 1;
1545             }
1546             }
1547              
1548 0         0 return 0;
1549             }
1550              
1551             sub clear_database {
1552 2     2 1 12 my ($self) = @_;
1553              
1554 2 50       21 return 0 unless ($self->tie_db_writable());
1555              
1556 2         22 dbg("bayes: untie-ing in preparation for removal.");
1557              
1558 2         17 foreach my $dbname (@DBNAMES) {
1559 4         15 my $db_var = 'db_'.$dbname;
1560              
1561 4 50       15 if (exists $self->{$db_var}) {
1562             # dbg("bayes: untie-ing $db_var");
1563 4         9 untie %{$self->{$db_var}};
  4         60  
1564 4         19 delete $self->{$db_var};
1565             }
1566             }
1567              
1568 2         17 my $path = $self->{bayes}->{main}->sed_path($self->{bayes}->{main}->{conf}->{bayes_path});
1569              
1570 2         7 foreach my $dbname (@DBNAMES, 'journal') {
1571 6         22 foreach my $ext ($self->DB_EXTENSIONS) {
1572 12         32 my $name = $path.'_'.$dbname.$ext;
1573 12         616 my $ret = unlink $name;
1574 12 100       71 dbg("bayes: clear_database: %s %s",
1575             $ret ? 'removed' : 'tried to remove', $name);
1576             }
1577             }
1578              
1579             # the journal file needs to be done separately since it has no extension
1580 2         8 foreach my $dbname ('journal') {
1581 2         8 my $name = $path.'_'.$dbname;
1582 2         78 my $ret = unlink $name;
1583 2 50       26 dbg("bayes: clear_database: %s %s",
1584             $ret ? 'removed' : 'tried to remove', $name);
1585             }
1586              
1587 2         17 $self->untie_db();
1588              
1589 2         15 return 1;
1590             }
1591              
1592             sub backup_database {
1593 0     0 1 0 my ($self) = @_;
1594              
1595             # we tie writable because we want the upgrade code to kick in if needed
1596 0 0       0 return 0 unless ($self->tie_db_writable());
1597              
1598 0         0 my @vars = $self->get_storage_variables();
1599              
1600 0         0 print "v\t$vars[6]\tdb_version # this must be the first line!!!\n";
1601 0         0 print "v\t$vars[1]\tnum_spam\n";
1602 0         0 print "v\t$vars[2]\tnum_nonspam\n";
1603              
1604 0         0 while (my ($tok, $packed) = each %{$self->{db_toks}}) {
  0         0  
1605 0 0       0 next if ($tok =~ MAGIC_RE); # skip magic tokens
1606              
1607 0         0 my ($ts, $th, $atime) = $self->tok_unpack($packed);
1608 0         0 my $encoded_token = unpack("H*",$tok);
1609 0         0 print "t\t$ts\t$th\t$atime\t$encoded_token\n";
1610             }
1611              
1612 0         0 while (my ($msgid, $flag) = each %{$self->{db_seen}}) {
  0         0  
1613 0         0 print "s\t$flag\t$msgid\n";
1614             }
1615              
1616 0         0 $self->untie_db();
1617              
1618 0         0 return 1;
1619             }
1620              
1621             sub restore_database {
1622 0     0 1 0 my ($self, $filename, $showdots) = @_;
1623              
1624 0         0 local *DUMPFILE;
1625 0 0       0 if (!open(DUMPFILE, '<', $filename)) {
1626 0         0 dbg("bayes: unable to open backup file $filename: $!");
1627 0         0 return 0;
1628             }
1629            
1630 0 0       0 if (!$self->tie_db_writable()) {
1631 0         0 dbg("bayes: failed to tie db writable");
1632 0         0 return 0;
1633             }
1634              
1635 0         0 my $main = $self->{bayes}->{main};
1636 0         0 my $path = $main->sed_path($main->{conf}->{bayes_path});
1637              
1638             # use a temporary PID-based suffix just in case another one was
1639             # created previously by an interrupted expire
1640 0         0 my $tmpsuffix = "convert$$";
1641 0         0 my $tmptoksdbname = $path.'_toks.'.$tmpsuffix;
1642 0         0 my $tmpseendbname = $path.'_seen.'.$tmpsuffix;
1643 0         0 my $toksdbname = $path.'_toks';
1644 0         0 my $seendbname = $path.'_seen';
1645              
1646 0         0 my %new_toks;
1647             my %new_seen;
1648 0         0 my $umask = umask 0;
1649 0 0       0 unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL,
1650             (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
1651 0         0 dbg("bayes: failed to tie temp toks db: $!");
1652 0         0 $self->untie_db();
1653 0         0 umask $umask;
1654 0         0 return 0;
1655             }
1656 0 0       0 unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL,
1657             (oct ($main->{conf}->{bayes_file_mode}) & 0666)) {
1658 0         0 dbg("bayes: failed to tie temp seen db: $!");
1659 0         0 untie %new_toks;
1660 0         0 $self->_unlink_file($tmptoksdbname);
1661 0         0 $self->untie_db();
1662 0         0 umask $umask;
1663 0         0 return 0;
1664             }
1665 0         0 umask $umask;
1666              
1667 0         0 my $line_count = 0;
1668 0         0 my $db_version;
1669 0         0 my $token_count = 0;
1670 0         0 my $num_spam;
1671             my $num_ham;
1672 0         0 my $error_p = 0;
1673 0         0 my $newest_token_age = 0;
1674             # Kinda weird I know, but we need a nice big value and we know there will be
1675             # no tokens > time() since we reset atime if > time(), so use that with a
1676             # little buffer just in case.
1677 0         0 my $oldest_token_age = time() + 100000;
1678              
1679 0         0 my $line = <DUMPFILE>;
1680 0 0       0 defined $line or die "Error reading dump file: $!";
1681 0         0 $line_count++;
1682              
1683             # We require the database version line to be the first in the file so we can
1684             # figure out how to properly deal with the file. If it is not the first
1685             # line then fail
1686 0 0       0 if ($line =~ m/^v\s+(\d+)\s+db_version/) {
1687 0         0 $db_version = $1;
1688             }
1689             else {
1690 0         0 dbg("bayes: database version must be the first line in the backup file, correct and re-run");
1691 0         0 untie %new_toks;
1692 0         0 untie %new_seen;
1693 0         0 $self->_unlink_file($tmptoksdbname);
1694 0         0 $self->_unlink_file($tmpseendbname);
1695 0         0 $self->untie_db();
1696 0         0 return 0;
1697             }
1698              
1699 0 0 0     0 unless ($db_version == 2 || $db_version == 3) {
1700 0         0 warn("bayes: database version $db_version is unsupported, must be version 2 or 3");
1701 0         0 untie %new_toks;
1702 0         0 untie %new_seen;
1703 0         0 $self->_unlink_file($tmptoksdbname);
1704 0         0 $self->_unlink_file($tmpseendbname);
1705 0         0 $self->untie_db();
1706 0         0 return 0;
1707             }
1708              
1709 0         0 for ($!=0; defined($line=<DUMPFILE>); $!=0) {
1710 0         0 chomp($line);
1711 0         0 $line_count++;
1712              
1713 0 0       0 if ($line_count % 1000 == 0) {
1714 0 0       0 print STDERR "." if ($showdots);
1715             }
1716              
1717 0 0       0 if ($line =~ /^v\s+/) { # variable line
    0          
    0          
1718 0         0 my @parsed_line = split(/\s+/, $line, 3);
1719 0         0 my $value = $parsed_line[1] + 0;
1720 0 0       0 if ($parsed_line[2] eq 'num_spam') {
    0          
1721 0         0 $num_spam = $value;
1722             }
1723             elsif ($parsed_line[2] eq 'num_nonspam') {
1724 0         0 $num_ham = $value;
1725             }
1726             else {
1727 0         0 dbg("bayes: restore_database: skipping unknown line: $line");
1728             }
1729             }
1730             elsif ($line =~ /^t\s+/) { # token line
1731 0         0 my @parsed_line = split(/\s+/, $line, 5);
1732 0         0 my $spam_count = $parsed_line[1] + 0;
1733 0         0 my $ham_count = $parsed_line[2] + 0;
1734 0         0 my $atime = $parsed_line[3] + 0;
1735 0         0 my $token = $parsed_line[4];
1736              
1737 0         0 my $token_warn_p = 0;
1738 0         0 my @warnings;
1739              
1740 0 0       0 if ($spam_count < 0) {
1741 0         0 $spam_count = 0;
1742 0         0 push(@warnings, 'spam count < 0, resetting');
1743 0         0 $token_warn_p = 1;
1744             }
1745 0 0       0 if ($ham_count < 0) {
1746 0         0 $ham_count = 0;
1747 0         0 push(@warnings, 'ham count < 0, resetting');
1748 0         0 $token_warn_p = 1;
1749             }
1750              
1751 0 0 0     0 if ($spam_count == 0 && $ham_count == 0) {
1752 0         0 dbg("bayes: token has zero spam and ham count, skipping");
1753 0         0 next;
1754             }
1755              
1756 0 0       0 if ($atime > time()) {
1757 0         0 $atime = time();
1758 0         0 push(@warnings, 'atime > current time, resetting');
1759 0         0 $token_warn_p = 1;
1760             }
1761              
1762 0 0       0 if ($token_warn_p) {
1763 0         0 dbg("bayes: token (%s) has the following warnings:\n%s",
1764             $token, join("\n",@warnings));
1765             }
1766              
1767             # database versions < 3 did not encode their token values
1768 0 0       0 if ($db_version < 3) {
1769 0         0 $token = substr(sha1($token), -5);
1770             }
1771             else {
1772             # turn unpacked binary token back into binary value
1773 0         0 $token = pack("H*",$token);
1774             }
1775              
1776 0         0 $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime);
1777 0 0       0 if ($atime < $oldest_token_age) {
1778 0         0 $oldest_token_age = $atime;
1779             }
1780 0 0       0 if ($atime > $newest_token_age) {
1781 0         0 $newest_token_age = $atime;
1782             }
1783 0         0 $token_count++;
1784             }
1785             elsif ($line =~ /^s\s+/) { # seen line
1786 0         0 my @parsed_line = split(/\s+/, $line, 3);
1787 0         0 my $flag = $parsed_line[1];
1788 0         0 my $msgid = $parsed_line[2];
1789              
1790 0 0 0     0 unless ($flag eq 'h' || $flag eq 's') {
1791 0         0 dbg("bayes: unknown seen flag ($flag) for line: $line, skipping");
1792 0         0 next;
1793             }
1794              
1795 0 0       0 unless ($msgid) {
1796 0         0 dbg("bayes: blank msgid for line: $line, skipping");
1797 0         0 next;
1798             }
1799              
1800 0         0 $new_seen{$msgid} = $flag;
1801             }
1802             else {
1803 0         0 dbg("bayes: skipping unknown line: $line");
1804 0         0 next;
1805             }
1806             }
1807 0 0 0     0 defined $line || $!==0 or die "Error reading dump file: $!";
1808 0 0       0 close(DUMPFILE) or die "Can't close dump file: $!";
1809              
1810 0 0       0 print STDERR "\n" if ($showdots);
1811              
1812 0 0       0 unless (defined($num_spam)) {
1813 0         0 dbg("bayes: unable to find num spam, please check file");
1814 0         0 $error_p = 1;
1815             }
1816              
1817 0 0       0 unless (defined($num_ham)) {
1818 0         0 dbg("bayes: unable to find num ham, please check file");
1819 0         0 $error_p = 1;
1820             }
1821              
1822 0 0       0 if ($error_p) {
1823 0         0 dbg("bayes: error(s) while attempting to load $filename, correct and re-run");
1824              
1825 0         0 untie %new_toks;
1826 0         0 untie %new_seen;
1827 0         0 $self->_unlink_file($tmptoksdbname);
1828 0         0 $self->_unlink_file($tmpseendbname);
1829 0         0 $self->untie_db();
1830 0         0 return 0;
1831             }
1832              
1833             # set the calculated magic tokens
1834 0         0 $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION();
1835 0         0 $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count;
1836 0         0 $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam;
1837 0         0 $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham;
1838 0         0 $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age;
1839 0         0 $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age;
1840              
1841             # go ahead and zero out these, chances are good that they are bogus anyway.
1842 0         0 $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0;
1843 0         0 $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0;
1844 0         0 $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
1845 0         0 $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
1846              
1847 0         0 local $SIG{'INT'} = 'IGNORE';
1848 0         0 local $SIG{'TERM'} = 'IGNORE';
1849 0 0       0 local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows();
1850              
1851 0         0 untie %new_toks;
1852 0         0 untie %new_seen;
1853 0         0 $self->untie_db();
1854              
1855             # Here is where something can go horribly wrong and screw up the bayes
1856             # database files. If we are able to copy one and not the other then it
1857             # will leave the database in an inconsistent state. Since this is an
1858             # edge case, and they're trying to replace the DB anyway we should be ok.
1859 0 0       0 unless ($self->_rename_file($tmptoksdbname, $toksdbname)) {
1860 0         0 dbg("bayes: error while renaming $tmptoksdbname to $toksdbname: $!");
1861 0         0 return 0;
1862             }
1863 0 0       0 unless ($self->_rename_file($tmpseendbname, $seendbname)) {
1864 0         0 dbg("bayes: error while renaming $tmpseendbname to $seendbname: $!");
1865 0         0 dbg("bayes: database now in inconsistent state");
1866 0         0 return 0;
1867             }
1868              
1869 0         0 dbg("bayes: parsed $line_count lines");
1870 0         0 dbg("bayes: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");
1871              
1872 0         0 return 1;
1873             }
1874              
1875             ###########################################################################
1876              
1877             # token marshalling format for db_toks.
1878              
1879             # Since we may have many entries with few hits, especially thousands of hapaxes
1880             # (1-occurrence entries), use a flexible entry format, instead of simply "2
1881             # packed ints", to keep the memory and disk space usage down. In my
1882             # 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
1883             # can use a 1-byte representation for the other 91% of low-hitting entries
1884             # and save masses of space.
1885              
1886             # This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
1887             # ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte
1888             # representation; otherwise, if XX in the first byte is 00, it's packed as
1889             # "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
1890              
1891             # Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
1892              
1893 20     20   242 use constant FORMAT_FLAG => 0xc0; # 11000000
  20         63  
  20         1533  
1894 20     20   153 use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
  20         43  
  20         1733  
1895 20     20   148 use constant TWO_LONGS_FORMAT => 0x00; # 00000000
  20         572  
  20         1369  
1896              
1897 20     20   157 use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
  20         712  
  20         1365  
1898 20     20   683 use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
  20         52  
  20         9859  
1899              
1900             sub tok_unpack {
1901 5536     5536 0 58414 my ($self, $value) = @_;
1902 5536   100     20149 $value ||= 0;
1903              
1904 5536         8356 my ($packed, $atime);
1905 5536 50       12809 if ($self->{db_version} >= 1) {
    0          
1906 5536         14567 ($packed, $atime) = unpack("CV", $value);
1907             }
1908             elsif ($self->{db_version} == 0) {
1909 0         0 ($packed, $atime) = unpack("CS", $value);
1910             }
1911              
1912 5536 100       13734 if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
    50          
1913 3134   50     13319 return (($packed & ONE_BYTE_SSS_BITS) >> 3,
1914             $packed & ONE_BYTE_HHH_BITS,
1915             $atime || 0);
1916             }
1917             elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
1918 2402         3670 my ($packed, $ts, $th, $atime);
1919 2402 50       4508 if ($self->{db_version} >= 1) {
    0          
1920 2402         5113 ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
1921             }
1922             elsif ($self->{db_version} == 0) {
1923 0         0 ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
1924             }
1925 2402   100     16388 return ($ts || 0, $th || 0, $atime || 0);
      100        
      100        
1926             }
1927             # other formats would go here...
1928             else {
1929 0         0 warn "bayes: unknown packing format for bayes db, please re-learn: $packed";
1930 0         0 return (0, 0, 0);
1931             }
1932             }
1933              
1934             sub tok_pack {
1935 1662     1662 0 5444 my ($self, $ts, $th, $atime) = @_;
1936 1662   100     5465 $ts ||= 0; $th ||= 0; $atime ||= 0;
  1662   100     5489  
  1662   50     3924  
1937 1662 50 33     6105 if ($ts < 8 && $th < 8) {
1938 1662         36926 return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
1939             } else {
1940 0         0 return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
1941             }
1942             }
1943              
1944             ###########################################################################
1945              
1946             sub db_readable {
1947 16     16 1 46 my ($self) = @_;
1948 16         70 return $self->{already_tied};
1949             }
1950              
1951             sub db_writable {
1952 0     0 1   my ($self) = @_;
1953 0   0       return $self->{already_tied} && $self->{is_locked};
1954             }
1955              
1956             ###########################################################################
1957              
1958             sub _unlink_file {
1959 0     0     my ($self, $filename) = @_;
1960              
1961 0           unlink $filename;
1962             }
1963              
1964             sub _rename_file {
1965 0     0     my ($self, $sourcefilename, $targetfilename) = @_;
1966              
1967 0 0         return 0 unless (rename($sourcefilename, $targetfilename));
1968              
1969 0           return 1;
1970             }
1971              
1972 0     0 0   sub sa_die { Mail::SpamAssassin::sa_die(@_); }
1973              
1974             1;